Block SIGINT handling while in the debugger

This seems to prevent a certain class of bugs to do with the root actor
cancelling local tasks and getting into deadlock while children are
trying to acquire the tty lock. I'm not sure it's the best idea yet
since you're pretty much guaranteed to get "stuck" if a child activates
the debugger after the root has been cancelled (at least "stuck" in
terms of SIGINT being ignored). That kinda race condition seems to still
exist somehow: a child can "beat" the root to activating the tty lock
and the parent is stuck waiting on the child to terminate via its
nursery.
debug_tests
Tyler Goodlet 2020-09-28 08:54:21 -04:00
parent 9e1d9a8ce1
commit f1b242f913
1 changed files with 98 additions and 35 deletions

View File

@ -4,8 +4,9 @@ Multi-core debugging for da peeps!
import bdb import bdb
import sys import sys
from functools import partial from functools import partial
from contextlib import asynccontextmanager, AsyncExitStack from contextlib import asynccontextmanager, contextmanager
from typing import Awaitable, Tuple, Optional, Callable from typing import Awaitable, Tuple, Optional, Callable
import signal
from async_generator import aclosing from async_generator import aclosing
import tractor import tractor
@ -23,25 +24,27 @@ except ImportError:
assert pdb.xpm, "pdbpp is not installed?" assert pdb.xpm, "pdbpp is not installed?"
pdbpp = pdb pdbpp = pdb
log = get_logger(__name__) log = get_logger(__name__)
__all__ = ['breakpoint', 'post_mortem'] __all__ = ['breakpoint', 'post_mortem']
# placeholder for function to set a ``trio.Event`` # placeholder for function to set a ``trio.Event`` on debugger exit
_pdb_release_hook: Optional[Callable] = None _pdb_release_hook: Optional[Callable] = None
# actor-wide flag
_in_debug = False
# lock in root actor preventing multi-access to local tty
_debug_lock = trio.StrictFIFOLock()
class TractorConfig(pdbpp.DefaultConfig): class TractorConfig(pdbpp.DefaultConfig):
"""Custom ``pdbpp`` goodness. """Custom ``pdbpp`` goodness.
""" """
# sticky_by_default = True # sticky_by_default = True
def teardown(self):
_pdb_release_hook()
class PdbwTeardown(pdbpp.Pdb): class PdbwTeardown(pdbpp.Pdb):
"""Add teardown hooks to the regular ``pdbpp.Pdb``. """Add teardown hooks to the regular ``pdbpp.Pdb``.
@ -52,12 +55,20 @@ class PdbwTeardown(pdbpp.Pdb):
# TODO: figure out how to dissallow recursive .set_trace() entry # TODO: figure out how to dissallow recursive .set_trace() entry
# since that'll cause deadlock for us. # since that'll cause deadlock for us.
def set_continue(self): def set_continue(self):
super().set_continue() global _in_debug
self.config.teardown() try:
super().set_continue()
finally:
_in_debug = False
_pdb_release_hook()
def set_quit(self): def set_quit(self):
super().set_quit() global _in_debug
self.config.teardown() try:
super().set_quit()
finally:
_in_debug = False
_pdb_release_hook()
# TODO: will be needed whenever we get to true remote debugging. # TODO: will be needed whenever we get to true remote debugging.
@ -95,40 +106,75 @@ class PdbwTeardown(pdbpp.Pdb):
# if bmsg in _pdb_exit_patterns: # if bmsg in _pdb_exit_patterns:
# log.info("Closing stdin hijack") # log.info("Closing stdin hijack")
# break # break
@asynccontextmanager @asynccontextmanager
async def _acquire_debug_lock(): async def _acquire_debug_lock():
"""Acquire a actor local FIFO lock meant to mutex entry to a local """Acquire a actor local FIFO lock meant to mutex entry to a local
debugger entry point to avoid tty clobbering by multiple processes. debugger entry point to avoid tty clobbering by multiple processes.
""" """
try: try:
actor = tractor.current_actor() log.error("TTY BEING ACQUIRED")
debug_lock = actor.statespace.setdefault( await _debug_lock.acquire()
'_debug_lock', trio.StrictFIFOLock()
)
await debug_lock.acquire()
log.error("TTY lock acquired") log.error("TTY lock acquired")
yield yield
finally: finally:
if debug_lock.locked(): if _debug_lock.locked():
debug_lock.release() _debug_lock.release()
log.error("TTY lock released") log.error("TTY lock released")
def handler(signum, frame):
"""Block SIGINT while in debug to avoid deadlocks with cancellation.
"""
print(
"tractor ignores SIGINT while in debug mode\n"
"If you have a special need for it please open an issue.\n"
)
# don't allow those stdlib mofos to mess with sigint handler
pdbpp.pdb.Pdb.sigint_handler = handler
@contextmanager
def _disable_sigint():
try:
# disable sigint handling while in debug
prior_handler = signal.signal(signal.SIGINT, handler)
yield
finally:
# restore SIGINT handling
signal.signal(signal.SIGINT, prior_handler)
async def _hijack_stdin_relay_to_child( async def _hijack_stdin_relay_to_child(
subactor_uid: Tuple[str, str] subactor_uid: Tuple[str, str]
) -> None: ) -> None:
actor = tractor.current_actor()
nursery = actor._actoruid2nursery[subactor_uid]
print(f'NURSERY: {nursery}')
print(f'nursery is cancelled {nursery.cancelled}')
if actor._is_cancelled or nursery.cancelled:
raise RuntimeError(
"Can not engage debugger actor is already cancelled")
await trio.sleep(0)
# TODO: when we get to true remote debugging # TODO: when we get to true remote debugging
# this will deliver stdin data # this will deliver stdin data
log.debug(f"Actor {subactor_uid} is waiting on stdin hijack lock") log.warning(f"Actor {subactor_uid} is WAITING on stdin hijack lock")
async with _acquire_debug_lock(): async with _acquire_debug_lock():
log.warning(f"Actor {subactor_uid} acquired stdin hijack lock") log.warning(f"Actor {subactor_uid} ACQUIRED stdin hijack lock")
# indicate to child that we've locked stdio
yield 'Locked'
# wait for cancellation of stream by child with _disable_sigint():
await trio.sleep_forever() # indicate to child that we've locked stdio
yield 'Locked'
log.debug(f"Actor {subactor_uid} released stdin hijack lock") # wait for cancellation of stream by child
await trio.sleep_forever()
log.debug(f"Actor {subactor_uid} RELEASED stdin hijack lock")
# XXX: We only make this sync in case someone wants to # XXX: We only make this sync in case someone wants to
@ -137,6 +183,7 @@ def _breakpoint(debug_func) -> Awaitable[None]:
"""``tractor`` breakpoint entry for engaging pdb machinery """``tractor`` breakpoint entry for engaging pdb machinery
in subactors. in subactors.
""" """
global _in_debug
actor = tractor.current_actor() actor = tractor.current_actor()
do_unlock = trio.Event() do_unlock = trio.Event()
@ -147,7 +194,7 @@ def _breakpoint(debug_func) -> Awaitable[None]:
async with tractor._portal.open_portal( async with tractor._portal.open_portal(
actor._parent_chan, actor._parent_chan,
start_msg_loop=False, start_msg_loop=False,
shield=True, # shield=True,
) as portal: ) as portal:
with trio.fail_after(1): with trio.fail_after(1):
agen = await portal.run( agen = await portal.run(
@ -156,34 +203,48 @@ def _breakpoint(debug_func) -> Awaitable[None]:
subactor_uid=actor.uid, subactor_uid=actor.uid,
) )
async with aclosing(agen): async with aclosing(agen):
# block until first yield above
async for val in agen: async for val in agen:
assert val == 'Locked' assert val == 'Locked'
task_status.started() task_status.started()
with trio.CancelScope(shield=True):
await do_unlock.wait()
# trigger cancellation of remote stream # with trio.CancelScope(shield=True):
break await do_unlock.wait()
# trigger cancellation of remote stream
break
finally: finally:
log.debug(f"Exiting debugger for actor {actor}") log.debug(f"Exiting debugger for actor {actor}")
actor.statespace['_in_debug'] = False global _in_debug
_in_debug = False
# actor.statespace['_in_debug'] = False
log.debug(f"Child {actor} released parent stdio lock") log.debug(f"Child {actor} released parent stdio lock")
async def _bp(): async def _bp():
"""Async breakpoint which schedules a parent stdio lock, and once complete """Async breakpoint which schedules a parent stdio lock, and once complete
enters the ``pdbpp`` debugging console. enters the ``pdbpp`` debugging console.
""" """
in_debug = actor.statespace.setdefault('_in_debug', False) global _in_debug
# in_debug = actor.statespace.setdefault('_in_debug', False)
if in_debug: if _in_debug:
log.warning(f"Actor {actor} already has a debug lock, skipping...") # if **this** actor is already in debug mode block here
return # waiting for the control to be released - this allows
# support for recursive entries to `tractor.breakpoint()`
log.warning(
f"Actor {actor.uid} already has a debug lock, waiting...")
await do_unlock.wait()
await trio.sleep(0.1)
# return
# assign unlock callback for debugger teardown hooks # assign unlock callback for debugger teardown hooks
global _pdb_release_hook global _pdb_release_hook
_pdb_release_hook = do_unlock.set _pdb_release_hook = do_unlock.set
actor.statespace['_in_debug'] = True # actor.statespace['_in_debug'] = True
_in_debug = True
# TODO: need a more robust check for the "root" actor # TODO: need a more robust check for the "root" actor
if actor._parent_chan: if actor._parent_chan:
@ -231,6 +292,7 @@ breakpoint = partial(
def _post_mortem(actor): def _post_mortem(actor):
log.error(f"\nAttaching to pdb in crashed actor: {actor.uid}\n") log.error(f"\nAttaching to pdb in crashed actor: {actor.uid}\n")
# custom Pdb post-mortem entry
pdbpp.xpm(Pdb=PdbwTeardown) pdbpp.xpm(Pdb=PdbwTeardown)
@ -250,6 +312,7 @@ async def _maybe_enter_pm(err):
# there's races between when the parent actor has killed all # there's races between when the parent actor has killed all
# comms and when the child tries to contact said parent to # comms and when the child tries to contact said parent to
# acquire the tty lock. # acquire the tty lock.
# Really we just want to mostly avoid catching KBIs here so there # Really we just want to mostly avoid catching KBIs here so there
# might be a simpler check we can do? # might be a simpler check we can do?
and trio.MultiError.filter( and trio.MultiError.filter(