forked from goodboy/tractor
Block SIGINT handling while in the debugger
This seems to prevent a certain class of bugs to do with the root actor cancelling local tasks and getting into deadlock while children are trying to acquire the tty lock. I'm not sure it's the best idea yet since you're pretty much guaranteed to get "stuck" if a child activates the debugger after the root has been cancelled (at least "stuck" in terms of SIGINT being ignored). That kinda race condition seems to still exist somehow: a child can "beat" the root to activating the tty lock and the parent is stuck waiting on the child to terminate via its nursery.debug_tests
parent
9e1d9a8ce1
commit
f1b242f913
|
@ -4,8 +4,9 @@ Multi-core debugging for da peeps!
|
|||
import bdb
|
||||
import sys
|
||||
from functools import partial
|
||||
from contextlib import asynccontextmanager, AsyncExitStack
|
||||
from contextlib import asynccontextmanager, contextmanager
|
||||
from typing import Awaitable, Tuple, Optional, Callable
|
||||
import signal
|
||||
|
||||
from async_generator import aclosing
|
||||
import tractor
|
||||
|
@ -23,25 +24,27 @@ except ImportError:
|
|||
assert pdb.xpm, "pdbpp is not installed?"
|
||||
pdbpp = pdb
|
||||
|
||||
|
||||
log = get_logger(__name__)
|
||||
|
||||
|
||||
__all__ = ['breakpoint', 'post_mortem']
|
||||
|
||||
|
||||
# placeholder for function to set a ``trio.Event``
|
||||
# placeholder for function to set a ``trio.Event`` on debugger exit
|
||||
_pdb_release_hook: Optional[Callable] = None
|
||||
|
||||
# actor-wide flag
|
||||
_in_debug = False
|
||||
|
||||
# lock in root actor preventing multi-access to local tty
|
||||
_debug_lock = trio.StrictFIFOLock()
|
||||
|
||||
|
||||
class TractorConfig(pdbpp.DefaultConfig):
|
||||
"""Custom ``pdbpp`` goodness.
|
||||
"""
|
||||
# sticky_by_default = True
|
||||
|
||||
def teardown(self):
|
||||
_pdb_release_hook()
|
||||
|
||||
|
||||
class PdbwTeardown(pdbpp.Pdb):
|
||||
"""Add teardown hooks to the regular ``pdbpp.Pdb``.
|
||||
|
@ -52,12 +55,20 @@ class PdbwTeardown(pdbpp.Pdb):
|
|||
# TODO: figure out how to dissallow recursive .set_trace() entry
|
||||
# since that'll cause deadlock for us.
|
||||
def set_continue(self):
|
||||
global _in_debug
|
||||
try:
|
||||
super().set_continue()
|
||||
self.config.teardown()
|
||||
finally:
|
||||
_in_debug = False
|
||||
_pdb_release_hook()
|
||||
|
||||
def set_quit(self):
|
||||
global _in_debug
|
||||
try:
|
||||
super().set_quit()
|
||||
self.config.teardown()
|
||||
finally:
|
||||
_in_debug = False
|
||||
_pdb_release_hook()
|
||||
|
||||
|
||||
# TODO: will be needed whenever we get to true remote debugging.
|
||||
|
@ -95,40 +106,75 @@ class PdbwTeardown(pdbpp.Pdb):
|
|||
# if bmsg in _pdb_exit_patterns:
|
||||
# log.info("Closing stdin hijack")
|
||||
# break
|
||||
|
||||
|
||||
@asynccontextmanager
|
||||
async def _acquire_debug_lock():
|
||||
"""Acquire a actor local FIFO lock meant to mutex entry to a local
|
||||
debugger entry point to avoid tty clobbering by multiple processes.
|
||||
"""
|
||||
try:
|
||||
actor = tractor.current_actor()
|
||||
debug_lock = actor.statespace.setdefault(
|
||||
'_debug_lock', trio.StrictFIFOLock()
|
||||
)
|
||||
await debug_lock.acquire()
|
||||
log.error("TTY BEING ACQUIRED")
|
||||
await _debug_lock.acquire()
|
||||
log.error("TTY lock acquired")
|
||||
yield
|
||||
finally:
|
||||
if debug_lock.locked():
|
||||
debug_lock.release()
|
||||
if _debug_lock.locked():
|
||||
_debug_lock.release()
|
||||
log.error("TTY lock released")
|
||||
|
||||
|
||||
def handler(signum, frame):
|
||||
"""Block SIGINT while in debug to avoid deadlocks with cancellation.
|
||||
"""
|
||||
print(
|
||||
"tractor ignores SIGINT while in debug mode\n"
|
||||
"If you have a special need for it please open an issue.\n"
|
||||
)
|
||||
|
||||
|
||||
# don't allow those stdlib mofos to mess with sigint handler
|
||||
pdbpp.pdb.Pdb.sigint_handler = handler
|
||||
|
||||
|
||||
@contextmanager
|
||||
def _disable_sigint():
|
||||
try:
|
||||
# disable sigint handling while in debug
|
||||
prior_handler = signal.signal(signal.SIGINT, handler)
|
||||
yield
|
||||
finally:
|
||||
# restore SIGINT handling
|
||||
signal.signal(signal.SIGINT, prior_handler)
|
||||
|
||||
|
||||
async def _hijack_stdin_relay_to_child(
|
||||
subactor_uid: Tuple[str, str]
|
||||
) -> None:
|
||||
actor = tractor.current_actor()
|
||||
nursery = actor._actoruid2nursery[subactor_uid]
|
||||
print(f'NURSERY: {nursery}')
|
||||
print(f'nursery is cancelled {nursery.cancelled}')
|
||||
if actor._is_cancelled or nursery.cancelled:
|
||||
raise RuntimeError(
|
||||
"Can not engage debugger actor is already cancelled")
|
||||
|
||||
await trio.sleep(0)
|
||||
|
||||
# TODO: when we get to true remote debugging
|
||||
# this will deliver stdin data
|
||||
log.debug(f"Actor {subactor_uid} is waiting on stdin hijack lock")
|
||||
log.warning(f"Actor {subactor_uid} is WAITING on stdin hijack lock")
|
||||
async with _acquire_debug_lock():
|
||||
log.warning(f"Actor {subactor_uid} acquired stdin hijack lock")
|
||||
log.warning(f"Actor {subactor_uid} ACQUIRED stdin hijack lock")
|
||||
|
||||
with _disable_sigint():
|
||||
# indicate to child that we've locked stdio
|
||||
yield 'Locked'
|
||||
|
||||
# wait for cancellation of stream by child
|
||||
await trio.sleep_forever()
|
||||
|
||||
log.debug(f"Actor {subactor_uid} released stdin hijack lock")
|
||||
log.debug(f"Actor {subactor_uid} RELEASED stdin hijack lock")
|
||||
|
||||
|
||||
# XXX: We only make this sync in case someone wants to
|
||||
|
@ -137,6 +183,7 @@ def _breakpoint(debug_func) -> Awaitable[None]:
|
|||
"""``tractor`` breakpoint entry for engaging pdb machinery
|
||||
in subactors.
|
||||
"""
|
||||
global _in_debug
|
||||
actor = tractor.current_actor()
|
||||
do_unlock = trio.Event()
|
||||
|
||||
|
@ -147,7 +194,7 @@ def _breakpoint(debug_func) -> Awaitable[None]:
|
|||
async with tractor._portal.open_portal(
|
||||
actor._parent_chan,
|
||||
start_msg_loop=False,
|
||||
shield=True,
|
||||
# shield=True,
|
||||
) as portal:
|
||||
with trio.fail_after(1):
|
||||
agen = await portal.run(
|
||||
|
@ -156,34 +203,48 @@ def _breakpoint(debug_func) -> Awaitable[None]:
|
|||
subactor_uid=actor.uid,
|
||||
)
|
||||
async with aclosing(agen):
|
||||
|
||||
# block until first yield above
|
||||
async for val in agen:
|
||||
|
||||
assert val == 'Locked'
|
||||
task_status.started()
|
||||
with trio.CancelScope(shield=True):
|
||||
|
||||
# with trio.CancelScope(shield=True):
|
||||
await do_unlock.wait()
|
||||
|
||||
# trigger cancellation of remote stream
|
||||
break
|
||||
finally:
|
||||
log.debug(f"Exiting debugger for actor {actor}")
|
||||
actor.statespace['_in_debug'] = False
|
||||
global _in_debug
|
||||
_in_debug = False
|
||||
# actor.statespace['_in_debug'] = False
|
||||
log.debug(f"Child {actor} released parent stdio lock")
|
||||
|
||||
async def _bp():
|
||||
"""Async breakpoint which schedules a parent stdio lock, and once complete
|
||||
enters the ``pdbpp`` debugging console.
|
||||
"""
|
||||
in_debug = actor.statespace.setdefault('_in_debug', False)
|
||||
global _in_debug
|
||||
# in_debug = actor.statespace.setdefault('_in_debug', False)
|
||||
|
||||
if in_debug:
|
||||
log.warning(f"Actor {actor} already has a debug lock, skipping...")
|
||||
return
|
||||
if _in_debug:
|
||||
# if **this** actor is already in debug mode block here
|
||||
# waiting for the control to be released - this allows
|
||||
# support for recursive entries to `tractor.breakpoint()`
|
||||
log.warning(
|
||||
f"Actor {actor.uid} already has a debug lock, waiting...")
|
||||
await do_unlock.wait()
|
||||
await trio.sleep(0.1)
|
||||
# return
|
||||
|
||||
# assign unlock callback for debugger teardown hooks
|
||||
global _pdb_release_hook
|
||||
_pdb_release_hook = do_unlock.set
|
||||
|
||||
actor.statespace['_in_debug'] = True
|
||||
# actor.statespace['_in_debug'] = True
|
||||
_in_debug = True
|
||||
|
||||
# TODO: need a more robust check for the "root" actor
|
||||
if actor._parent_chan:
|
||||
|
@ -231,6 +292,7 @@ breakpoint = partial(
|
|||
|
||||
def _post_mortem(actor):
|
||||
log.error(f"\nAttaching to pdb in crashed actor: {actor.uid}\n")
|
||||
# custom Pdb post-mortem entry
|
||||
pdbpp.xpm(Pdb=PdbwTeardown)
|
||||
|
||||
|
||||
|
@ -250,6 +312,7 @@ async def _maybe_enter_pm(err):
|
|||
# there's races between when the parent actor has killed all
|
||||
# comms and when the child tries to contact said parent to
|
||||
# acquire the tty lock.
|
||||
|
||||
# Really we just want to mostly avoid catching KBIs here so there
|
||||
# might be a simpler check we can do?
|
||||
and trio.MultiError.filter(
|
||||
|
|
Loading…
Reference in New Issue