diff --git a/newsfragments/234.bugfix b/newsfragments/234.bugfix new file mode 100644 index 0000000..00a0ce9 --- /dev/null +++ b/newsfragments/234.bugfix @@ -0,0 +1,6 @@ +Handle broken channel/stream faults where the root's tty lock is left acquired by some +child actor who went MIA and the root ends up hanging indefinitely. + +There's two parts here: +- Don't shield wait on the lock +- Always do our best to release the lock on the expected worst case connection faults diff --git a/tractor/__init__.py b/tractor/__init__.py index a7cadb9..394693f 100644 --- a/tractor/__init__.py +++ b/tractor/__init__.py @@ -23,6 +23,7 @@ from ._exceptions import ( from ._debug import breakpoint, post_mortem from . import msg from ._root import run, run_daemon, open_root_actor +from ._portal import Portal __all__ = [ @@ -40,6 +41,7 @@ __all__ = [ 'msg', 'open_nursery', 'open_root_actor', + 'Portal', 'post_mortem', 'run', 'run_daemon', diff --git a/tractor/_debug.py b/tractor/_debug.py index 4fc7495..6e1d7f0 100644 --- a/tractor/_debug.py +++ b/tractor/_debug.py @@ -196,6 +196,22 @@ async def _acquire_debug_lock( log.debug(f"TTY lock released, remote task: {task_name}:{uid}") +def handler(signum, frame, *args): + """Specialized debugger compatible SIGINT handler. + + In childred we always ignore to avoid deadlocks since cancellation + should always be managed by the parent supervising actor. The root + is always cancelled on ctrl-c. + """ + if is_root_process(): + tractor.current_actor().cancel_soon() + else: + print( + "tractor ignores SIGINT while in debug mode\n" + "If you have a special need for it please open an issue.\n" + ) + + @tractor.context async def _hijack_stdin_for_child( @@ -222,24 +238,36 @@ async def _hijack_stdin_for_child( with trio.CancelScope(shield=True): - async with _acquire_debug_lock(subactor_uid): + try: + lock = None + async with _acquire_debug_lock(subactor_uid) as lock: - # indicate to child that we've locked stdio - await ctx.started('Locked') - log.pdb(f"Actor {subactor_uid} ACQUIRED stdin hijack lock") + # indicate to child that we've locked stdio + await ctx.started('Locked') + log.pdb(f"Actor {subactor_uid} ACQUIRED stdin hijack lock") - # wait for unlock pdb by child - async with ctx.open_stream() as stream: - try: + # wait for unlock pdb by child + async with ctx.open_stream() as stream: assert await stream.receive() == 'pdb_unlock' - except trio.BrokenResourceError: - # XXX: there may be a race with the portal teardown - # with the calling actor which we can safely ignore. - # The alternative would be sending an ack message - # and allowing the client to wait for us to teardown - # first? - pass + # try: + # assert await stream.receive() == 'pdb_unlock' + + except ( + trio.BrokenResourceError, + trio.Cancelled, # by local cancellation + trio.ClosedResourceError, # by self._rx_chan + ) as err: + # XXX: there may be a race with the portal teardown + # with the calling actor which we can safely ignore. + # The alternative would be sending an ack message + # and allowing the client to wait for us to teardown + # first? + if lock and lock.locked(): + lock.release() + + if isinstance(err, trio.Cancelled): + raise log.debug(f"TTY lock released, remote task: {task_name}:{subactor_uid}") @@ -283,7 +311,7 @@ async def _breakpoint( try: async with get_root() as portal: - log.error('got portal') + log.pdb('got portal') # this syncs to child's ``Context.started()`` call. async with portal.open_context( @@ -293,7 +321,7 @@ async def _breakpoint( ) as (ctx, val): - log.error('locked context') + log.pdb('locked context') assert val == 'Locked' async with ctx.open_stream() as stream: @@ -376,10 +404,17 @@ async def _breakpoint( # callbacks. Can't think of a nicer way then this atm. if _debug_lock.locked(): log.warning( - 'Root actor attempting to acquire active tty lock' + 'Root actor attempting to shield-acquire active tty lock' f' owned by {_global_actor_in_debug}') - await _debug_lock.acquire() + with trio.CancelScope(shield=True): + # must shield here to avoid hitting a ``Cancelled`` and + # a child getting stuck bc we clobbered the tty + await _debug_lock.acquire() + + else: + # may be cancelled + await _debug_lock.acquire() _global_actor_in_debug = actor.uid _local_task_in_debug = task_name @@ -402,12 +437,13 @@ async def _breakpoint( debug_func(actor) -def _mk_pdb(): +def _mk_pdb() -> PdbwTeardown: + # XXX: setting these flags on the pdb instance are absolutely - # critical to having ctrl-c work in the ``trio`` standard way! - # The stdlib's pdb supports entering the current sync frame - # on a SIGINT, with ``trio`` we pretty much never want this - # and we did we can handle it in the ``tractor`` task runtime. + # critical to having ctrl-c work in the ``trio`` standard way! The + # stdlib's pdb supports entering the current sync frame on a SIGINT, + # with ``trio`` we pretty much never want this and if we did we can + # handle it in the ``tractor`` task runtime. pdb = PdbwTeardown() pdb.allow_kbdint = True diff --git a/tractor/_entry.py b/tractor/_entry.py index 2f57048..c59e513 100644 --- a/tractor/_entry.py +++ b/tractor/_entry.py @@ -62,7 +62,7 @@ def _trio_main( """ # Disable sigint handling in children; # we don't need it thanks to our cancellation machinery. - signal.signal(signal.SIGINT, signal.SIG_IGN) + # signal.signal(signal.SIGINT, signal.SIG_IGN) log.info(f"Started new trio process for {actor.uid}") diff --git a/tractor/_trionics.py b/tractor/_trionics.py index 5b58511..17e7838 100644 --- a/tractor/_trionics.py +++ b/tractor/_trionics.py @@ -298,8 +298,8 @@ async def _open_and_supervise_one_cancels_all_nursery( f'child {_debug._global_actor_in_debug}\n' 'Waiting on tty lock to release..') - with trio.CancelScope(shield=True): - await debug_complete.wait() + # with trio.CancelScope(shield=True): + await debug_complete.wait() # if the caller's scope errored then we activate our # one-cancels-all supervisor strategy (don't