Refine the handler for child vs. root cases

This gets very close to avoiding any possible hangs to do with tty
locking and SIGINT handling minus a special case that will be detailed
below.

Summary of implementation changes:

- convert `_mk_pdb()` -> `with _open_pdb() as pdb:` which implicitly
  handles the `bdb.BdbQuit` case such that debugger teardown hooks are
  always called.
- rename the handler to `shield_sigint()` and handle a variety of new
  cases:
  * the root is in debug but hasn't been cancelled -> call
    `Actor.cancel_soon()`
  * the root is in debug but *has* been called (`Actor.cancel_soon()`
    already called) -> raise KBI
  * a child is in debug *and* has a task locking the debugger -> ignore
    SIGINT in child *and* the root actor.
- if the debugger instance is provided to the handler at acquire time,
  on SIGINT handling completion re-print the last pdb++ REPL output so
  that the user realizes they are still actively in debug.
- ignore the unlock case where a race condition of "no task" holding the
  lock causes the `RuntimeError` normally associated with the "wrong
  task" doing so (not sure if this is a `trio` bug?).
- change debug logs to runtime level.

Unhandled case(s):

- a child is maybe in debug mode but does not itself have any task using
  the debugger.
    * ToDo: we need a way to decide what to do with
      "intermediate" child actors who themselves either are not in
      `debug_mode=True` but have children who *are* such that a SIGINT
      won't cause cancellation of that child-as-parent-of-another-child
      **iff** any of their children are in in debug mode.
sigintsaviour_ci_worked
Tyler Goodlet 2022-01-23 17:04:49 -05:00
parent 1e789ecad2
commit 688e0b9ebe
1 changed files with 95 additions and 40 deletions

View File

@ -20,6 +20,7 @@ Multi-core debugging for da peeps!
""" """
import bdb import bdb
import sys import sys
import signal
from functools import partial from functools import partial
from contextlib import asynccontextmanager as acm from contextlib import asynccontextmanager as acm
from contextlib import contextmanager as cm from contextlib import contextmanager as cm
@ -163,7 +164,7 @@ async def _acquire_debug_lock(
task_name = trio.lowlevel.current_task().name task_name = trio.lowlevel.current_task().name
log.debug( log.runtime(
f"Attempting to acquire TTY lock, remote task: {task_name}:{uid}" f"Attempting to acquire TTY lock, remote task: {task_name}:{uid}"
) )
@ -176,14 +177,14 @@ async def _acquire_debug_lock(
_no_remote_has_tty = trio.Event() _no_remote_has_tty = trio.Event()
try: try:
log.debug( log.runtime(
f"entering lock checkpoint, remote task: {task_name}:{uid}" f"entering lock checkpoint, remote task: {task_name}:{uid}"
) )
we_acquired = True we_acquired = True
await _debug_lock.acquire() await _debug_lock.acquire()
_global_actor_in_debug = uid _global_actor_in_debug = uid
log.debug(f"TTY lock acquired, remote task: {task_name}:{uid}") log.runtime(f"TTY lock acquired, remote task: {task_name}:{uid}")
# NOTE: critical section: this yield is unshielded! # NOTE: critical section: this yield is unshielded!
@ -199,7 +200,10 @@ async def _acquire_debug_lock(
finally: finally:
# if _global_actor_in_debug == uid: # if _global_actor_in_debug == uid:
if we_acquired and _debug_lock.locked(): if (
we_acquired
and _debug_lock.locked()
):
_debug_lock.release() _debug_lock.release()
# IFF there are no more requesting tasks queued up fire, the # IFF there are no more requesting tasks queued up fire, the
@ -211,13 +215,13 @@ async def _acquire_debug_lock(
if ( if (
not stats.owner not stats.owner
): ):
log.debug(f"No more tasks waiting on tty lock! says {uid}") log.runtime(f"No more tasks waiting on tty lock! says {uid}")
_no_remote_has_tty.set() _no_remote_has_tty.set()
_no_remote_has_tty = None _no_remote_has_tty = None
_global_actor_in_debug = None _global_actor_in_debug = None
log.debug(f"TTY lock released, remote task: {task_name}:{uid}") log.runtime(f"TTY lock released, remote task: {task_name}:{uid}")
@tractor.context @tractor.context
@ -262,9 +266,6 @@ async def _hijack_stdin_for_child(
async with ctx.open_stream() as stream: async with ctx.open_stream() as stream:
assert await stream.receive() == 'pdb_unlock' assert await stream.receive() == 'pdb_unlock'
# try:
# assert await stream.receive() == 'pdb_unlock'
except ( except (
# BaseException, # BaseException,
trio.MultiError, trio.MultiError,
@ -272,6 +273,7 @@ async def _hijack_stdin_for_child(
trio.Cancelled, # by local cancellation trio.Cancelled, # by local cancellation
trio.ClosedResourceError, # by self._rx_chan trio.ClosedResourceError, # by self._rx_chan
) as err: ) as err:
# XXX: there may be a race with the portal teardown # XXX: there may be a race with the portal teardown
# with the calling actor which we can safely ignore. # with the calling actor which we can safely ignore.
# The alternative would be sending an ack message # The alternative would be sending an ack message
@ -282,10 +284,12 @@ async def _hijack_stdin_for_child(
if isinstance(err, trio.Cancelled): if isinstance(err, trio.Cancelled):
raise raise
finally: finally:
log.debug( log.runtime(
"TTY lock released, remote task:" "TTY lock released, remote task:"
f"{task_name}:{subactor_uid}") f"{task_name}:{subactor_uid}"
)
return "pdb_unlock_complete" return "pdb_unlock_complete"
@ -358,8 +362,9 @@ async def _breakpoint(
# shield: bool = False # shield: bool = False
) -> None: ) -> None:
'''``tractor`` breakpoint entry for engaging pdb machinery '''
in the root or a subactor. breakpoint entry for engaging pdb machinery in the root or
a subactor.
''' '''
__tracebackhide__ = True __tracebackhide__ = True
@ -497,7 +502,13 @@ def _open_pdb() -> PdbwTeardown:
raise raise
def disable_sigint_in_pdb(signum, frame, *args): def shield_sigint(
signum: int,
frame: 'frame', # type: ignore # noqa
pdb: Optional[PdbwTeardown] = None,
*args,
) -> None:
''' '''
Specialized debugger compatible SIGINT handler. Specialized debugger compatible SIGINT handler.
@ -506,39 +517,86 @@ def disable_sigint_in_pdb(signum, frame, *args):
is always cancelled on ctrl-c. is always cancelled on ctrl-c.
''' '''
actor = tractor.current_actor() global _local_task_in_debug, _global_actor_in_debug
if not actor._cancel_called:
log.pdb(
f"{actor.uid} is in debug and has not been cancelled, "
"ignoring SIGINT\n"
)
else:
log.pdb(
f"{actor.uid} is already cancelling.."
)
global _global_actor_in_debug
in_debug = _global_actor_in_debug in_debug = _global_actor_in_debug
actor = tractor.current_actor()
# root actor branch that reports whether or not a child
# has locked debugger.
if ( if (
is_root_process() is_root_process()
and in_debug and in_debug
): ):
log.pdb(f'Root SIGINT disabled while {_global_actor_in_debug} is debugging') name = in_debug[0]
if name != 'root':
if in_debug[0] != 'root': log.pdb(
pass f"Ignoring SIGINT while child in debug mode: `{in_debug}`"
)
else:
log.pdb(
"Ignoring SIGINT while in debug mode"
)
# child actor that has locked the debugger
elif (
not is_root_process()
):
task = _local_task_in_debug
if task:
log.pdb(
f"Ignoring SIGINT while task in debug mode: `{task}`"
)
# TODO: how to handle the case of an intermediary-child actor
# that **is not** marked in debug mode?
# elif debug_mode():
else:
log.pdb(
"Ignoring SIGINT since debug mode is enabled"
)
# noone has the debugger so raise KBI
else:
# If we haven't tried to cancel the runtime then do that instead
# of raising a KBI (which may non-gracefully destroy
# a ``trio.run()``).
if not actor._cancel_called:
actor.cancel_soon()
# If the runtime is already cancelled it likely means the user
# hit ctrl-c again because teardown didn't full take place in
# which case we do the "hard" raising of a local KBI.
else: else:
# actor.cancel_soon()
raise KeyboardInterrupt raise KeyboardInterrupt
# maybe redraw/print last REPL output to console
if pdb:
# TODO: make this work like sticky mode where if there is output
# detected as written to the tty we redraw this part underneath
# and erase the past draw of this same bit above?
# pdb.sticky = True
# pdb._print_if_sticky()
# also see these links for an approach from ``ptk``:
# https://github.com/goodboy/tractor/issues/130#issuecomment-663752040
# https://github.com/prompt-toolkit/python-prompt-toolkit/blob/c2c6af8a0308f9e5d7c0e28cb8a02963fe0ce07a/prompt_toolkit/patch_stdout.py
pdb.do_longlist(None)
print(pdb.prompt, end='', flush=True)
@cm @cm
def disable_sigint(): def disable_sigint(
pdb: Optional[PdbwTeardown] = None
) -> None:
__tracebackhide__ = True __tracebackhide__ = True
# ensure the ``contextlib.contextmanager`` frame inside the wrapping # ensure the ``contextlib.contextmanager`` frame inside the wrapping
# ``.__exit__()`` method isn't shown either. # ``.__exit__()`` method isn't shown either.
import sys
frame = sys._getframe() frame = sys._getframe()
frame.f_back.f_globals['__tracebackhide__'] = True frame.f_back.f_globals['__tracebackhide__'] = True
# NOTE: this seems like a form of cpython bug wherein # NOTE: this seems like a form of cpython bug wherein
@ -548,10 +606,9 @@ def disable_sigint():
# for manual debugging if necessary # for manual debugging if necessary
# pdb.set_trace() # pdb.set_trace()
import signal
orig_handler = signal.signal( orig_handler = signal.signal(
signal.SIGINT, signal.SIGINT,
disable_sigint_in_pdb partial(shield_sigint, pdb=pdb),
) )
try: try:
yield yield
@ -564,10 +621,9 @@ def disable_sigint():
def _set_trace(actor=None): def _set_trace(actor=None):
__tracebackhide__ = True __tracebackhide__ = True
# pdb = _open_pdb()
with ( with (
_open_pdb() as pdb, _open_pdb() as pdb,
disable_sigint(), disable_sigint(pdb=pdb),
): ):
if actor is not None: if actor is not None:
log.pdb(f"\nAttaching pdb to actor: {actor.uid}\n") log.pdb(f"\nAttaching pdb to actor: {actor.uid}\n")
@ -601,10 +657,9 @@ breakpoint = partial(
def _post_mortem(actor): def _post_mortem(actor):
__tracebackhide__ = True __tracebackhide__ = True
# pdb = _mk_pdb()
with ( with (
_open_pdb() as pdb, _open_pdb() as pdb,
disable_sigint(), disable_sigint(pdb=pdb),
): ):
log.pdb(f"\nAttaching to pdb in crashed actor: {actor.uid}\n") log.pdb(f"\nAttaching to pdb in crashed actor: {actor.uid}\n")