From f1b242f913b9bf3d84fbf6b8e9d7d154bb3ef9d5 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet
Date: Mon, 28 Sep 2020 08:54:21 -0400
Subject: [PATCH] Block SIGINT handling while in the debugger

This seems to prevent a certain class of bugs where the root actor
cancels local tasks and deadlocks while children are trying to acquire
the tty lock. I'm not sure it's the best idea yet since you're pretty
much guaranteed to get "stuck" if a child activates the debugger after
the root has been cancelled (at least "stuck" in the sense that SIGINT
is ignored). That kind of race condition still seems to exist somehow:
a child can "beat" the root to acquiring the tty lock and the parent
gets stuck waiting on the child to terminate via its nursery.
---
 tractor/_debug.py | 133 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 98 insertions(+), 35 deletions(-)

diff --git a/tractor/_debug.py b/tractor/_debug.py
index c31f16f..4d43025 100644
--- a/tractor/_debug.py
+++ b/tractor/_debug.py
@@ -4,8 +4,9 @@ Multi-core debugging for da peeps!
 import bdb
 import sys
 from functools import partial
-from contextlib import asynccontextmanager, AsyncExitStack
+from contextlib import asynccontextmanager, contextmanager
 from typing import Awaitable, Tuple, Optional, Callable
+import signal
 
 from async_generator import aclosing
 import tractor
@@ -23,25 +24,27 @@ except ImportError:
     assert pdb.xpm, "pdbpp is not installed?"
     pdbpp = pdb
 
-
 log = get_logger(__name__)
 
 
 __all__ = ['breakpoint', 'post_mortem']
 
 
-# placeholder for function to set a ``trio.Event``
+# placeholder for function to set a ``trio.Event`` on debugger exit
 _pdb_release_hook: Optional[Callable] = None
 
+# actor-wide flag
+_in_debug = False
+
+# lock in root actor preventing multi-access to local tty
+_debug_lock = trio.StrictFIFOLock()
+
 
 class TractorConfig(pdbpp.DefaultConfig):
     """Custom ``pdbpp`` goodness.
     """
     # sticky_by_default = True
 
-    def teardown(self):
-        _pdb_release_hook()
-
 
 class PdbwTeardown(pdbpp.Pdb):
     """Add teardown hooks to the regular ``pdbpp.Pdb``.
@@ -52,12 +55,20 @@ class PdbwTeardown(pdbpp.Pdb):
     # TODO: figure out how to disallow recursive .set_trace() entry
     # since that'll cause deadlock for us.
     def set_continue(self):
-        super().set_continue()
-        self.config.teardown()
+        global _in_debug
+        try:
+            super().set_continue()
+        finally:
+            _in_debug = False
+            _pdb_release_hook()
 
     def set_quit(self):
-        super().set_quit()
-        self.config.teardown()
+        global _in_debug
+        try:
+            super().set_quit()
+        finally:
+            _in_debug = False
+            _pdb_release_hook()
 
 
 # TODO: will be needed whenever we get to true remote debugging.
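For anyone skimming the hunks above: the `set_continue`/`set_quit` overrides drop the
old `config.teardown()` indirection in favor of `try/finally`, so the release hook and
the `_in_debug` reset run even if the superclass call raises. A minimal standalone
sketch of that same pattern against the stdlib `pdb` rather than `pdbpp`; the names
`_release_hook`, `_in_debug` and `PdbWithTeardown` are illustrative stand-ins, not
tractor's actual module state:

import pdb
from typing import Callable, Optional

# stand-in for tractor's ``_pdb_release_hook``: set by whoever owns the lock
_release_hook: Optional[Callable[[], None]] = None
_in_debug: bool = False


class PdbWithTeardown(pdb.Pdb):
    """Run a release hook whenever the user leaves the debugger
    via ``continue`` or ``quit``.
    """
    def set_continue(self) -> None:
        global _in_debug
        try:
            super().set_continue()
        finally:
            # always reset the flag and fire the hook, even on error
            _in_debug = False
            if _release_hook:
                _release_hook()

    def set_quit(self) -> None:
        global _in_debug
        try:
            super().set_quit()
        finally:
            _in_debug = False
            if _release_hook:
                _release_hook()

Note the patch calls `_pdb_release_hook()` unconditionally, which raises a `TypeError`
if the hook was never assigned; the `if` guard above is a defensive tweak, not what
the diff does.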
@@ -95,40 +106,75 @@ class PdbwTeardown(pdbpp.Pdb):
 #     if bmsg in _pdb_exit_patterns:
 #         log.info("Closing stdin hijack")
 #         break
+
+
 @asynccontextmanager
 async def _acquire_debug_lock():
     """Acquire an actor-local FIFO lock meant to mutex entry to the
     local debugger and avoid tty clobbering by multiple processes.
     """
     try:
-        actor = tractor.current_actor()
-        debug_lock = actor.statespace.setdefault(
-            '_debug_lock', trio.StrictFIFOLock()
-        )
-        await debug_lock.acquire()
+        log.error("TTY BEING ACQUIRED")
+        await _debug_lock.acquire()
         log.error("TTY lock acquired")
         yield
     finally:
-        if debug_lock.locked():
-            debug_lock.release()
+        if _debug_lock.locked():
+            _debug_lock.release()
         log.error("TTY lock released")
 
 
+def handler(signum, frame):
+    """Block SIGINT while in debug to avoid deadlocks with cancellation.
+    """
+    print(
+        "tractor ignores SIGINT while in debug mode\n"
+        "If you have a special need for it please open an issue.\n"
+    )
+
+
+# don't allow those stdlib mofos to mess with sigint handler
+pdbpp.pdb.Pdb.sigint_handler = handler
+
+
+@contextmanager
+def _disable_sigint():
+    try:
+        # disable sigint handling while in debug
+        prior_handler = signal.signal(signal.SIGINT, handler)
+        yield
+    finally:
+        # restore SIGINT handling
+        signal.signal(signal.SIGINT, prior_handler)
+
+
 async def _hijack_stdin_relay_to_child(
     subactor_uid: Tuple[str, str]
 ) -> None:
+    actor = tractor.current_actor()
+    nursery = actor._actoruid2nursery[subactor_uid]
+    print(f'NURSERY: {nursery}')
+    print(f'nursery is cancelled {nursery.cancelled}')
+    if actor._is_cancelled or nursery.cancelled:
+        raise RuntimeError(
+            "Cannot engage debugger: actor is already cancelled")
+
+    await trio.sleep(0)
+
     # TODO: when we get to true remote debugging
     # this will deliver stdin data
-    log.debug(f"Actor {subactor_uid} is waiting on stdin hijack lock")
+    log.warning(f"Actor {subactor_uid} is WAITING on stdin hijack lock")
     async with _acquire_debug_lock():
-        log.warning(f"Actor {subactor_uid} acquired stdin hijack lock")
-        # indicate to child that we've locked stdio
-        yield 'Locked'
+        log.warning(f"Actor {subactor_uid} ACQUIRED stdin hijack lock")
 
-        # wait for cancellation of stream by child
-        await trio.sleep_forever()
+        with _disable_sigint():
+            # indicate to child that we've locked stdio
+            yield 'Locked'
 
-    log.debug(f"Actor {subactor_uid} released stdin hijack lock")
+            # wait for cancellation of stream by child
+            await trio.sleep_forever()
+
+    log.debug(f"Actor {subactor_uid} RELEASED stdin hijack lock")
 
 
 # XXX: We only make this sync in case someone wants to
@@ -137,6 +183,7 @@ def _breakpoint(debug_func) -> Awaitable[None]:
     """``tractor`` breakpoint entry for engaging pdb machinery
     in subactors.
     """
+    global _in_debug
     actor = tractor.current_actor()
     do_unlock = trio.Event()
@@ -147,7 +194,7 @@
         async with tractor._portal.open_portal(
             actor._parent_chan,
             start_msg_loop=False,
-            shield=True,
+            # shield=True,
         ) as portal:
             with trio.fail_after(1):
                 agen = await portal.run(
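The `_disable_sigint` manager above is the classic save/swap/restore dance around
`signal.signal`. A runnable sketch of that same pattern in isolation (no tractor
involved; `blocked_sigint` and `_ignore_sigint` are illustrative names):

import signal
from contextlib import contextmanager


def _ignore_sigint(signum, frame):
    # mirror the patch's handler: swallow the interrupt with a notice
    print("SIGINT ignored while the debugger holds the tty")


@contextmanager
def blocked_sigint():
    # install the no-op handler, remembering whatever was there before;
    # binding ``prior`` *before* entering the try block guarantees it
    # exists by the time the finally clause runs
    prior = signal.signal(signal.SIGINT, _ignore_sigint)
    try:
        yield
    finally:
        # restore the previous handler even if the body raised
        signal.signal(signal.SIGINT, prior)

One caveat worth remembering with this approach: `signal.signal` may only be called
from the main thread, so the same restriction applies to any code entering such a
context manager.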
""" - in_debug = actor.statespace.setdefault('_in_debug', False) + global _in_debug + # in_debug = actor.statespace.setdefault('_in_debug', False) - if in_debug: - log.warning(f"Actor {actor} already has a debug lock, skipping...") - return + if _in_debug: + # if **this** actor is already in debug mode block here + # waiting for the control to be released - this allows + # support for recursive entries to `tractor.breakpoint()` + log.warning( + f"Actor {actor.uid} already has a debug lock, waiting...") + await do_unlock.wait() + await trio.sleep(0.1) + # return # assign unlock callback for debugger teardown hooks global _pdb_release_hook _pdb_release_hook = do_unlock.set - actor.statespace['_in_debug'] = True + # actor.statespace['_in_debug'] = True + _in_debug = True # TODO: need a more robust check for the "root" actor if actor._parent_chan: @@ -231,6 +292,7 @@ breakpoint = partial( def _post_mortem(actor): log.error(f"\nAttaching to pdb in crashed actor: {actor.uid}\n") + # custom Pdb post-mortem entry pdbpp.xpm(Pdb=PdbwTeardown) @@ -250,6 +312,7 @@ async def _maybe_enter_pm(err): # there's races between when the parent actor has killed all # comms and when the child tries to contact said parent to # acquire the tty lock. + # Really we just want to mostly avoid catching KBIs here so there # might be a simpler check we can do? and trio.MultiError.filter(