From 497fa72c967399e89744c06e2e628bda3f8fcea3 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 16 Nov 2020 00:01:21 -0500 Subject: [PATCH 1/8] Add a SIGINT handler that kills the process tree We're not actually using this but it's for reference if we do end up needing it. The std lib's `pdb` internals override SIGINT handling whenever one enters the debugger repl. Force a handler that kills the tree if SIGINT is triggered from the root actor, otherwise ignore it since supervised children should be managed already. This resolves an issue with guest mode where `pdb` causes SIGINTs to be swallowed resulting in the host loop never terminating the process tree. --- tractor/_debug.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tractor/_debug.py b/tractor/_debug.py index 4fc7495..7897e00 100644 --- a/tractor/_debug.py +++ b/tractor/_debug.py @@ -196,6 +196,22 @@ async def _acquire_debug_lock( log.debug(f"TTY lock released, remote task: {task_name}:{uid}") +def handler(signum, frame, *args): + """Specialized debugger compatible SIGINT handler. + + In childred we always ignore to avoid deadlocks since cancellation + should always be managed by the parent supervising actor. The root + is always cancelled on ctrl-c. + """ + if is_root_process(): + tractor.current_actor().cancel_soon() + else: + print( + "tractor ignores SIGINT while in debug mode\n" + "If you have a special need for it please open an issue.\n" + ) + + @tractor.context async def _hijack_stdin_for_child( From 68d56d5df0305ec2832a19dd1163745cdc2420ba Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 16 Nov 2020 00:07:43 -0500 Subject: [PATCH 2/8] Try not masking SIGINT in child processes --- tractor/_entry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tractor/_entry.py b/tractor/_entry.py index 2f57048..c59e513 100644 --- a/tractor/_entry.py +++ b/tractor/_entry.py @@ -62,7 +62,7 @@ def _trio_main( """ # Disable sigint handling in children; # we don't need it thanks to our cancellation machinery. - signal.signal(signal.SIGINT, signal.SIG_IGN) + # signal.signal(signal.SIGINT, signal.SIG_IGN) log.info(f"Started new trio process for {actor.uid}") From d906c81f144ec85c3be7f62ad00f751b1ff75c7a Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 25 May 2021 09:17:53 -0400 Subject: [PATCH 3/8] Export portal type at top level --- tractor/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tractor/__init__.py b/tractor/__init__.py index a7cadb9..394693f 100644 --- a/tractor/__init__.py +++ b/tractor/__init__.py @@ -23,6 +23,7 @@ from ._exceptions import ( from ._debug import breakpoint, post_mortem from . import msg from ._root import run, run_daemon, open_root_actor +from ._portal import Portal __all__ = [ @@ -40,6 +41,7 @@ __all__ = [ 'msg', 'open_nursery', 'open_root_actor', + 'Portal', 'post_mortem', 'run', 'run_daemon', From 4f166500d0a047e3a9a128b80d1245603f8a1ed9 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 2 Aug 2021 20:38:03 -0400 Subject: [PATCH 4/8] Add return type to debugger factory --- tractor/_debug.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tractor/_debug.py b/tractor/_debug.py index 7897e00..9ac79a1 100644 --- a/tractor/_debug.py +++ b/tractor/_debug.py @@ -418,12 +418,13 @@ async def _breakpoint( debug_func(actor) -def _mk_pdb(): +def _mk_pdb() -> PdbwTeardown: + # XXX: setting these flags on the pdb instance are absolutely - # critical to having ctrl-c work in the ``trio`` standard way! - # The stdlib's pdb supports entering the current sync frame - # on a SIGINT, with ``trio`` we pretty much never want this - # and we did we can handle it in the ``tractor`` task runtime. + # critical to having ctrl-c work in the ``trio`` standard way! The + # stdlib's pdb supports entering the current sync frame on a SIGINT, + # with ``trio`` we pretty much never want this and if we did we can + # handle it in the ``tractor`` task runtime. pdb = PdbwTeardown() pdb.allow_kbdint = True From 79f0d6fda0986424353b7175e5a319eeab86d406 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 19 Aug 2021 12:36:55 -0400 Subject: [PATCH 5/8] Attempt to avoid pdb lockups on channel breakage Always try to release the root tty lock on broken connection errors. --- tractor/_debug.py | 48 +++++++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/tractor/_debug.py b/tractor/_debug.py index 9ac79a1..e9bfaf1 100644 --- a/tractor/_debug.py +++ b/tractor/_debug.py @@ -238,24 +238,29 @@ async def _hijack_stdin_for_child( with trio.CancelScope(shield=True): - async with _acquire_debug_lock(subactor_uid): + try: + lock = None + async with _acquire_debug_lock(subactor_uid) as lock: - # indicate to child that we've locked stdio - await ctx.started('Locked') - log.pdb(f"Actor {subactor_uid} ACQUIRED stdin hijack lock") + # indicate to child that we've locked stdio + await ctx.started('Locked') + log.pdb(f"Actor {subactor_uid} ACQUIRED stdin hijack lock") - # wait for unlock pdb by child - async with ctx.open_stream() as stream: - try: + # wait for unlock pdb by child + async with ctx.open_stream() as stream: assert await stream.receive() == 'pdb_unlock' - except trio.BrokenResourceError: - # XXX: there may be a race with the portal teardown - # with the calling actor which we can safely ignore. - # The alternative would be sending an ack message - # and allowing the client to wait for us to teardown - # first? - pass + # try: + # assert await stream.receive() == 'pdb_unlock' + + except trio.BrokenResourceError: + # XXX: there may be a race with the portal teardown + # with the calling actor which we can safely ignore. + # The alternative would be sending an ack message + # and allowing the client to wait for us to teardown + # first? + if lock and lock.locked(): + lock.release() log.debug(f"TTY lock released, remote task: {task_name}:{subactor_uid}") @@ -299,7 +304,7 @@ async def _breakpoint( try: async with get_root() as portal: - log.error('got portal') + log.pdb('got portal') # this syncs to child's ``Context.started()`` call. async with portal.open_context( @@ -309,7 +314,7 @@ async def _breakpoint( ) as (ctx, val): - log.error('locked context') + log.pdb('locked context') assert val == 'Locked' async with ctx.open_stream() as stream: @@ -392,10 +397,17 @@ async def _breakpoint( # callbacks. Can't think of a nicer way then this atm. if _debug_lock.locked(): log.warning( - 'Root actor attempting to acquire active tty lock' + 'Root actor attempting to shield-acquire active tty lock' f' owned by {_global_actor_in_debug}') - await _debug_lock.acquire() + with trio.CancelScope(shield=True): + # must shield here to avoid hitting a ``Cancelled`` and + # a child getting stuck bc we clobbered the tty + await _debug_lock.acquire() + + else: + # may be cancelled + await _debug_lock.acquire() _global_actor_in_debug = actor.uid _local_task_in_debug = task_name From 61d2307e5294b3aea5ff59d039c2bfe70ac64224 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 31 Aug 2021 17:46:00 -0400 Subject: [PATCH 6/8] Unlock pdb tty on all possible net faults --- tractor/_debug.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tractor/_debug.py b/tractor/_debug.py index e9bfaf1..6e1d7f0 100644 --- a/tractor/_debug.py +++ b/tractor/_debug.py @@ -253,7 +253,11 @@ async def _hijack_stdin_for_child( # try: # assert await stream.receive() == 'pdb_unlock' - except trio.BrokenResourceError: + except ( + trio.BrokenResourceError, + trio.Cancelled, # by local cancellation + trio.ClosedResourceError, # by self._rx_chan + ) as err: # XXX: there may be a race with the portal teardown # with the calling actor which we can safely ignore. # The alternative would be sending an ack message @@ -262,6 +266,9 @@ async def _hijack_stdin_for_child( if lock and lock.locked(): lock.release() + if isinstance(err, trio.Cancelled): + raise + log.debug(f"TTY lock released, remote task: {task_name}:{subactor_uid}") return "pdb_unlock_complete" From 3208b67f571d71e672b0dc139af20842c988bc8a Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 2 Sep 2021 13:02:01 -0400 Subject: [PATCH 7/8] Drop shielding on root lock acquire; seems to prevent hangs --- tractor/_trionics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tractor/_trionics.py b/tractor/_trionics.py index 5b58511..17e7838 100644 --- a/tractor/_trionics.py +++ b/tractor/_trionics.py @@ -298,8 +298,8 @@ async def _open_and_supervise_one_cancels_all_nursery( f'child {_debug._global_actor_in_debug}\n' 'Waiting on tty lock to release..') - with trio.CancelScope(shield=True): - await debug_complete.wait() + # with trio.CancelScope(shield=True): + await debug_complete.wait() # if the caller's scope errored then we activate our # one-cancels-all supervisor strategy (don't From 76342ed0c592a4a6a69dd62b29cf7c750dddc695 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 2 Sep 2021 16:37:10 -0400 Subject: [PATCH 8/8] Add news bit --- newsfragments/234.bugfix | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 newsfragments/234.bugfix diff --git a/newsfragments/234.bugfix b/newsfragments/234.bugfix new file mode 100644 index 0000000..00a0ce9 --- /dev/null +++ b/newsfragments/234.bugfix @@ -0,0 +1,6 @@ +Handle broken channel/stream faults where the root's tty lock is left acquired by some +child actor who went MIA and the root ends up hanging indefinitely. + +There's two parts here: +- Don't shield wait on the lock +- Always do our best to release the lock on the expected worst case connection faults