forked from goodboy/tractor

Block SIGINT handling while in the debugger

This seems to prevent a certain class of bugs to do with the root actor
cancelling local tasks and getting into deadlock while children are
trying to acquire the tty lock. I'm not sure it's the best idea yet
since you're pretty much guaranteed to get "stuck" if a child activates
the debugger after the root has been cancelled (at least "stuck" in
terms of SIGINT being ignored). That kinda race condition seems to still
exist somehow: a child can "beat" the root to activating the tty lock
and the parent is stuck waiting on the child to terminate via its
nursery.
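The mechanics of the change are small enough to show in isolation. Below is a minimal, standalone sketch of the idea (the names here are illustrative; the diff's versions are `handler` and `_disable_sigint`, which the root actor applies while relaying the tty lock to a child): install a SIGINT handler that only prints a notice for as long as the debugger holds the tty, then restore whatever handler was in place before.

import signal
import time
from contextlib import contextmanager

def _ignore_sigint(signum, frame):
    # swallow ctrl-c so a pending cancellation can't yank the tty
    # out from under an active debugger REPL
    print("SIGINT is ignored while the debugger holds the tty lock")

@contextmanager
def _sigint_blocked():
    # swap in the no-op handler for the duration of the debug session
    prior = signal.signal(signal.SIGINT, _ignore_sigint)
    try:
        yield
    finally:
        # restore the previous handler (under trio that is the one
        # which turns SIGINT back into a KeyboardInterrupt)
        signal.signal(signal.SIGINT, prior)

if __name__ == '__main__':
    with _sigint_blocked():
        # ctrl-c during this sleep only prints the notice
        time.sleep(5)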
debug_tests
Tyler Goodlet 2020-09-28 08:54:21 -04:00
parent 9e1d9a8ce1
commit f1b242f913
1 changed file with 98 additions and 35 deletions


@@ -4,8 +4,9 @@ Multi-core debugging for da peeps!
import bdb
import sys
from functools import partial
from contextlib import asynccontextmanager, AsyncExitStack
from contextlib import asynccontextmanager, contextmanager
from typing import Awaitable, Tuple, Optional, Callable
import signal
from async_generator import aclosing
import tractor
@@ -23,25 +24,27 @@ except ImportError:
assert pdb.xpm, "pdbpp is not installed?"
pdbpp = pdb
log = get_logger(__name__)
__all__ = ['breakpoint', 'post_mortem']
# placeholder for function to set a ``trio.Event``
# placeholder for function to set a ``trio.Event`` on debugger exit
_pdb_release_hook: Optional[Callable] = None
# actor-wide flag
_in_debug = False
# lock in root actor preventing multi-access to local tty
_debug_lock = trio.StrictFIFOLock()
class TractorConfig(pdbpp.DefaultConfig):
"""Custom ``pdbpp`` goodness.
"""
# sticky_by_default = True
def teardown(self):
_pdb_release_hook()
class PdbwTeardown(pdbpp.Pdb):
"""Add teardown hooks to the regular ``pdbpp.Pdb``.
@@ -52,12 +55,20 @@ class PdbwTeardown(pdbpp.Pdb):
# TODO: figure out how to disallow recursive .set_trace() entry
# since that'll cause deadlock for us.
def set_continue(self):
super().set_continue()
self.config.teardown()
global _in_debug
try:
super().set_continue()
finally:
_in_debug = False
_pdb_release_hook()
def set_quit(self):
super().set_quit()
self.config.teardown()
global _in_debug
try:
super().set_quit()
finally:
_in_debug = False
_pdb_release_hook()
# TODO: will be needed whenever we get to true remote debugging.
@@ -95,40 +106,75 @@ class PdbwTeardown(pdbpp.Pdb):
# if bmsg in _pdb_exit_patterns:
# log.info("Closing stdin hijack")
# break
@asynccontextmanager
async def _acquire_debug_lock():
"""Acquire a actor local FIFO lock meant to mutex entry to a local
debugger entry point to avoid tty clobbering by multiple processes.
"""
try:
actor = tractor.current_actor()
debug_lock = actor.statespace.setdefault(
'_debug_lock', trio.StrictFIFOLock()
)
await debug_lock.acquire()
log.error("TTY BEING ACQUIRED")
await _debug_lock.acquire()
log.error("TTY lock acquired")
yield
finally:
if debug_lock.locked():
debug_lock.release()
if _debug_lock.locked():
_debug_lock.release()
log.error("TTY lock released")
def handler(signum, frame):
"""Block SIGINT while in debug to avoid deadlocks with cancellation.
"""
print(
"tractor ignores SIGINT while in debug mode\n"
"If you have a special need for it please open an issue.\n"
)
# don't allow those stdlib mofos to mess with sigint handler
pdbpp.pdb.Pdb.sigint_handler = handler
@contextmanager
def _disable_sigint():
try:
# disable sigint handling while in debug
prior_handler = signal.signal(signal.SIGINT, handler)
yield
finally:
# restore SIGINT handling
signal.signal(signal.SIGINT, prior_handler)
async def _hijack_stdin_relay_to_child(
subactor_uid: Tuple[str, str]
) -> None:
actor = tractor.current_actor()
nursery = actor._actoruid2nursery[subactor_uid]
print(f'NURSERY: {nursery}')
print(f'nursery is cancelled {nursery.cancelled}')
if actor._is_cancelled or nursery.cancelled:
raise RuntimeError(
"Can not engage debugger actor is already cancelled")
await trio.sleep(0)
# TODO: when we get to true remote debugging
# this will deliver stdin data
log.debug(f"Actor {subactor_uid} is waiting on stdin hijack lock")
log.warning(f"Actor {subactor_uid} is WAITING on stdin hijack lock")
async with _acquire_debug_lock():
log.warning(f"Actor {subactor_uid} acquired stdin hijack lock")
# indicate to child that we've locked stdio
yield 'Locked'
log.warning(f"Actor {subactor_uid} ACQUIRED stdin hijack lock")
# wait for cancellation of stream by child
await trio.sleep_forever()
with _disable_sigint():
# indicate to child that we've locked stdio
yield 'Locked'
log.debug(f"Actor {subactor_uid} released stdin hijack lock")
# wait for cancellation of stream by child
await trio.sleep_forever()
log.debug(f"Actor {subactor_uid} RELEASED stdin hijack lock")
# XXX: We only make this sync in case someone wants to
@@ -137,6 +183,7 @@ def _breakpoint(debug_func) -> Awaitable[None]:
"""``tractor`` breakpoint entry for engaging pdb machinery
in subactors.
"""
global _in_debug
actor = tractor.current_actor()
do_unlock = trio.Event()
@@ -147,7 +194,7 @@ def _breakpoint(debug_func) -> Awaitable[None]:
async with tractor._portal.open_portal(
actor._parent_chan,
start_msg_loop=False,
shield=True,
# shield=True,
) as portal:
with trio.fail_after(1):
agen = await portal.run(
@@ -156,34 +203,48 @@ def _breakpoint(debug_func) -> Awaitable[None]:
subactor_uid=actor.uid,
)
async with aclosing(agen):
# block until first yield above
async for val in agen:
assert val == 'Locked'
task_status.started()
with trio.CancelScope(shield=True):
await do_unlock.wait()
# trigger cancellation of remote stream
break
# with trio.CancelScope(shield=True):
await do_unlock.wait()
# trigger cancellation of remote stream
break
finally:
log.debug(f"Exiting debugger for actor {actor}")
actor.statespace['_in_debug'] = False
global _in_debug
_in_debug = False
# actor.statespace['_in_debug'] = False
log.debug(f"Child {actor} released parent stdio lock")
async def _bp():
"""Async breakpoint which schedules a parent stdio lock, and once complete
enters the ``pdbpp`` debugging console.
"""
in_debug = actor.statespace.setdefault('_in_debug', False)
global _in_debug
# in_debug = actor.statespace.setdefault('_in_debug', False)
if in_debug:
log.warning(f"Actor {actor} already has a debug lock, skipping...")
return
if _in_debug:
# if **this** actor is already in debug mode block here
# waiting for the control to be released - this allows
# support for recursive entries to `tractor.breakpoint()`
log.warning(
f"Actor {actor.uid} already has a debug lock, waiting...")
await do_unlock.wait()
await trio.sleep(0.1)
# return
# assign unlock callback for debugger teardown hooks
global _pdb_release_hook
_pdb_release_hook = do_unlock.set
actor.statespace['_in_debug'] = True
# actor.statespace['_in_debug'] = True
_in_debug = True
# TODO: need a more robust check for the "root" actor
if actor._parent_chan:
@@ -231,6 +292,7 @@ breakpoint = partial(
def _post_mortem(actor):
log.error(f"\nAttaching to pdb in crashed actor: {actor.uid}\n")
# custom Pdb post-mortem entry
pdbpp.xpm(Pdb=PdbwTeardown)
@@ -250,6 +312,7 @@ async def _maybe_enter_pm(err):
# there's races between when the parent actor has killed all
# comms and when the child tries to contact said parent to
# acquire the tty lock.
# Really we just want to mostly avoid catching KBIs here so there
# might be a simpler check we can do?
and trio.MultiError.filter(