Block SIGINT handling while in the debugger
This seems to prevent a certain class of bugs in which the root actor cancels local tasks and gets into deadlock while children are trying to acquire the tty lock. I'm not sure it's the best idea yet, since you're pretty much guaranteed to get "stuck" if a child activates the debugger after the root has been cancelled (at least "stuck" in the sense that SIGINT is ignored). That kind of race condition still seems to exist somehow: a child can "beat" the root to acquiring the tty lock, leaving the parent stuck waiting on the child to terminate via its nursery.
debug_tests
parent
9e1d9a8ce1
commit
f1b242f913
|
@ -4,8 +4,9 @@ Multi-core debugging for da peeps!
|
||||||
import bdb
|
import bdb
|
||||||
import sys
|
import sys
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from contextlib import asynccontextmanager, AsyncExitStack
|
from contextlib import asynccontextmanager, contextmanager
|
||||||
from typing import Awaitable, Tuple, Optional, Callable
|
from typing import Awaitable, Tuple, Optional, Callable
|
||||||
|
import signal
|
||||||
|
|
||||||
from async_generator import aclosing
|
from async_generator import aclosing
|
||||||
import tractor
|
import tractor
|
||||||
|
@ -23,25 +24,27 @@ except ImportError:
|
||||||
assert pdb.xpm, "pdbpp is not installed?"
|
assert pdb.xpm, "pdbpp is not installed?"
|
||||||
pdbpp = pdb
|
pdbpp = pdb
|
||||||
|
|
||||||
|
|
||||||
log = get_logger(__name__)
|
log = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['breakpoint', 'post_mortem']
|
__all__ = ['breakpoint', 'post_mortem']
|
||||||
|
|
||||||
|
|
||||||
# placeholder for function to set a ``trio.Event``
|
# placeholder for function to set a ``trio.Event`` on debugger exit
|
||||||
_pdb_release_hook: Optional[Callable] = None
|
_pdb_release_hook: Optional[Callable] = None
|
||||||
|
|
||||||
|
# actor-wide flag
|
||||||
|
_in_debug = False
|
||||||
|
|
||||||
|
# lock in root actor preventing multi-access to local tty
|
||||||
|
_debug_lock = trio.StrictFIFOLock()
|
||||||
|
|
||||||
|
|
||||||
class TractorConfig(pdbpp.DefaultConfig):
|
class TractorConfig(pdbpp.DefaultConfig):
|
||||||
"""Custom ``pdbpp`` goodness.
|
"""Custom ``pdbpp`` goodness.
|
||||||
"""
|
"""
|
||||||
# sticky_by_default = True
|
# sticky_by_default = True
|
||||||
|
|
||||||
def teardown(self):
|
|
||||||
_pdb_release_hook()
|
|
||||||
|
|
||||||
|
|
||||||
class PdbwTeardown(pdbpp.Pdb):
|
class PdbwTeardown(pdbpp.Pdb):
|
||||||
"""Add teardown hooks to the regular ``pdbpp.Pdb``.
|
"""Add teardown hooks to the regular ``pdbpp.Pdb``.
|
||||||
|
@ -52,12 +55,20 @@ class PdbwTeardown(pdbpp.Pdb):
|
||||||
# TODO: figure out how to dissallow recursive .set_trace() entry
|
# TODO: figure out how to dissallow recursive .set_trace() entry
|
||||||
# since that'll cause deadlock for us.
|
# since that'll cause deadlock for us.
|
||||||
def set_continue(self):
|
def set_continue(self):
|
||||||
super().set_continue()
|
global _in_debug
|
||||||
self.config.teardown()
|
try:
|
||||||
|
super().set_continue()
|
||||||
|
finally:
|
||||||
|
_in_debug = False
|
||||||
|
_pdb_release_hook()
|
||||||
|
|
||||||
def set_quit(self):
|
def set_quit(self):
|
||||||
super().set_quit()
|
global _in_debug
|
||||||
self.config.teardown()
|
try:
|
||||||
|
super().set_quit()
|
||||||
|
finally:
|
||||||
|
_in_debug = False
|
||||||
|
_pdb_release_hook()
|
||||||
|
|
||||||
|
|
||||||
# TODO: will be needed whenever we get to true remote debugging.
|
# TODO: will be needed whenever we get to true remote debugging.
|
||||||
|
@ -95,40 +106,75 @@ class PdbwTeardown(pdbpp.Pdb):
|
||||||
# if bmsg in _pdb_exit_patterns:
|
# if bmsg in _pdb_exit_patterns:
|
||||||
# log.info("Closing stdin hijack")
|
# log.info("Closing stdin hijack")
|
||||||
# break
|
# break
|
||||||
|
|
||||||
|
|
||||||
@asynccontextmanager
|
@asynccontextmanager
|
||||||
async def _acquire_debug_lock():
|
async def _acquire_debug_lock():
|
||||||
"""Acquire a actor local FIFO lock meant to mutex entry to a local
|
"""Acquire a actor local FIFO lock meant to mutex entry to a local
|
||||||
debugger entry point to avoid tty clobbering by multiple processes.
|
debugger entry point to avoid tty clobbering by multiple processes.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
actor = tractor.current_actor()
|
log.error("TTY BEING ACQUIRED")
|
||||||
debug_lock = actor.statespace.setdefault(
|
await _debug_lock.acquire()
|
||||||
'_debug_lock', trio.StrictFIFOLock()
|
|
||||||
)
|
|
||||||
await debug_lock.acquire()
|
|
||||||
log.error("TTY lock acquired")
|
log.error("TTY lock acquired")
|
||||||
yield
|
yield
|
||||||
finally:
|
finally:
|
||||||
if debug_lock.locked():
|
if _debug_lock.locked():
|
||||||
debug_lock.release()
|
_debug_lock.release()
|
||||||
log.error("TTY lock released")
|
log.error("TTY lock released")
|
||||||
|
|
||||||
|
|
||||||
|
def handler(signum, frame):
|
||||||
|
"""Block SIGINT while in debug to avoid deadlocks with cancellation.
|
||||||
|
"""
|
||||||
|
print(
|
||||||
|
"tractor ignores SIGINT while in debug mode\n"
|
||||||
|
"If you have a special need for it please open an issue.\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# don't allow those stdlib mofos to mess with sigint handler
|
||||||
|
pdbpp.pdb.Pdb.sigint_handler = handler
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def _disable_sigint():
|
||||||
|
try:
|
||||||
|
# disable sigint handling while in debug
|
||||||
|
prior_handler = signal.signal(signal.SIGINT, handler)
|
||||||
|
yield
|
||||||
|
finally:
|
||||||
|
# restore SIGINT handling
|
||||||
|
signal.signal(signal.SIGINT, prior_handler)
|
||||||
|
|
||||||
|
|
||||||
async def _hijack_stdin_relay_to_child(
|
async def _hijack_stdin_relay_to_child(
|
||||||
subactor_uid: Tuple[str, str]
|
subactor_uid: Tuple[str, str]
|
||||||
) -> None:
|
) -> None:
|
||||||
|
actor = tractor.current_actor()
|
||||||
|
nursery = actor._actoruid2nursery[subactor_uid]
|
||||||
|
print(f'NURSERY: {nursery}')
|
||||||
|
print(f'nursery is cancelled {nursery.cancelled}')
|
||||||
|
if actor._is_cancelled or nursery.cancelled:
|
||||||
|
raise RuntimeError(
|
||||||
|
"Can not engage debugger actor is already cancelled")
|
||||||
|
|
||||||
|
await trio.sleep(0)
|
||||||
|
|
||||||
# TODO: when we get to true remote debugging
|
# TODO: when we get to true remote debugging
|
||||||
# this will deliver stdin data
|
# this will deliver stdin data
|
||||||
log.debug(f"Actor {subactor_uid} is waiting on stdin hijack lock")
|
log.warning(f"Actor {subactor_uid} is WAITING on stdin hijack lock")
|
||||||
async with _acquire_debug_lock():
|
async with _acquire_debug_lock():
|
||||||
log.warning(f"Actor {subactor_uid} acquired stdin hijack lock")
|
log.warning(f"Actor {subactor_uid} ACQUIRED stdin hijack lock")
|
||||||
# indicate to child that we've locked stdio
|
|
||||||
yield 'Locked'
|
|
||||||
|
|
||||||
# wait for cancellation of stream by child
|
with _disable_sigint():
|
||||||
await trio.sleep_forever()
|
# indicate to child that we've locked stdio
|
||||||
|
yield 'Locked'
|
||||||
|
|
||||||
log.debug(f"Actor {subactor_uid} released stdin hijack lock")
|
# wait for cancellation of stream by child
|
||||||
|
await trio.sleep_forever()
|
||||||
|
|
||||||
|
log.debug(f"Actor {subactor_uid} RELEASED stdin hijack lock")
|
||||||
|
|
||||||
|
|
||||||
# XXX: We only make this sync in case someone wants to
|
# XXX: We only make this sync in case someone wants to
|
||||||
|
@ -137,6 +183,7 @@ def _breakpoint(debug_func) -> Awaitable[None]:
|
||||||
"""``tractor`` breakpoint entry for engaging pdb machinery
|
"""``tractor`` breakpoint entry for engaging pdb machinery
|
||||||
in subactors.
|
in subactors.
|
||||||
"""
|
"""
|
||||||
|
global _in_debug
|
||||||
actor = tractor.current_actor()
|
actor = tractor.current_actor()
|
||||||
do_unlock = trio.Event()
|
do_unlock = trio.Event()
|
||||||
|
|
||||||
|
@ -147,7 +194,7 @@ def _breakpoint(debug_func) -> Awaitable[None]:
|
||||||
async with tractor._portal.open_portal(
|
async with tractor._portal.open_portal(
|
||||||
actor._parent_chan,
|
actor._parent_chan,
|
||||||
start_msg_loop=False,
|
start_msg_loop=False,
|
||||||
shield=True,
|
# shield=True,
|
||||||
) as portal:
|
) as portal:
|
||||||
with trio.fail_after(1):
|
with trio.fail_after(1):
|
||||||
agen = await portal.run(
|
agen = await portal.run(
|
||||||
|
@ -156,34 +203,48 @@ def _breakpoint(debug_func) -> Awaitable[None]:
|
||||||
subactor_uid=actor.uid,
|
subactor_uid=actor.uid,
|
||||||
)
|
)
|
||||||
async with aclosing(agen):
|
async with aclosing(agen):
|
||||||
|
|
||||||
|
# block until first yield above
|
||||||
async for val in agen:
|
async for val in agen:
|
||||||
|
|
||||||
assert val == 'Locked'
|
assert val == 'Locked'
|
||||||
task_status.started()
|
task_status.started()
|
||||||
with trio.CancelScope(shield=True):
|
|
||||||
await do_unlock.wait()
|
|
||||||
|
|
||||||
# trigger cancellation of remote stream
|
# with trio.CancelScope(shield=True):
|
||||||
break
|
await do_unlock.wait()
|
||||||
|
|
||||||
|
# trigger cancellation of remote stream
|
||||||
|
break
|
||||||
finally:
|
finally:
|
||||||
log.debug(f"Exiting debugger for actor {actor}")
|
log.debug(f"Exiting debugger for actor {actor}")
|
||||||
actor.statespace['_in_debug'] = False
|
global _in_debug
|
||||||
|
_in_debug = False
|
||||||
|
# actor.statespace['_in_debug'] = False
|
||||||
log.debug(f"Child {actor} released parent stdio lock")
|
log.debug(f"Child {actor} released parent stdio lock")
|
||||||
|
|
||||||
async def _bp():
|
async def _bp():
|
||||||
"""Async breakpoint which schedules a parent stdio lock, and once complete
|
"""Async breakpoint which schedules a parent stdio lock, and once complete
|
||||||
enters the ``pdbpp`` debugging console.
|
enters the ``pdbpp`` debugging console.
|
||||||
"""
|
"""
|
||||||
in_debug = actor.statespace.setdefault('_in_debug', False)
|
global _in_debug
|
||||||
|
# in_debug = actor.statespace.setdefault('_in_debug', False)
|
||||||
|
|
||||||
if in_debug:
|
if _in_debug:
|
||||||
log.warning(f"Actor {actor} already has a debug lock, skipping...")
|
# if **this** actor is already in debug mode block here
|
||||||
return
|
# waiting for the control to be released - this allows
|
||||||
|
# support for recursive entries to `tractor.breakpoint()`
|
||||||
|
log.warning(
|
||||||
|
f"Actor {actor.uid} already has a debug lock, waiting...")
|
||||||
|
await do_unlock.wait()
|
||||||
|
await trio.sleep(0.1)
|
||||||
|
# return
|
||||||
|
|
||||||
# assign unlock callback for debugger teardown hooks
|
# assign unlock callback for debugger teardown hooks
|
||||||
global _pdb_release_hook
|
global _pdb_release_hook
|
||||||
_pdb_release_hook = do_unlock.set
|
_pdb_release_hook = do_unlock.set
|
||||||
|
|
||||||
actor.statespace['_in_debug'] = True
|
# actor.statespace['_in_debug'] = True
|
||||||
|
_in_debug = True
|
||||||
|
|
||||||
# TODO: need a more robust check for the "root" actor
|
# TODO: need a more robust check for the "root" actor
|
||||||
if actor._parent_chan:
|
if actor._parent_chan:
|
||||||
|
@ -231,6 +292,7 @@ breakpoint = partial(
|
||||||
|
|
||||||
def _post_mortem(actor):
|
def _post_mortem(actor):
|
||||||
log.error(f"\nAttaching to pdb in crashed actor: {actor.uid}\n")
|
log.error(f"\nAttaching to pdb in crashed actor: {actor.uid}\n")
|
||||||
|
# custom Pdb post-mortem entry
|
||||||
pdbpp.xpm(Pdb=PdbwTeardown)
|
pdbpp.xpm(Pdb=PdbwTeardown)
|
||||||
|
|
||||||
|
|
||||||
|
@ -250,6 +312,7 @@ async def _maybe_enter_pm(err):
|
||||||
# there's races between when the parent actor has killed all
|
# there's races between when the parent actor has killed all
|
||||||
# comms and when the child tries to contact said parent to
|
# comms and when the child tries to contact said parent to
|
||||||
# acquire the tty lock.
|
# acquire the tty lock.
|
||||||
|
|
||||||
# Really we just want to mostly avoid catching KBIs here so there
|
# Really we just want to mostly avoid catching KBIs here so there
|
||||||
# might be a simpler check we can do?
|
# might be a simpler check we can do?
|
||||||
and trio.MultiError.filter(
|
and trio.MultiError.filter(
|
||||||
|
|
Loading…
Reference in New Issue