forked from goodboy/tractor
1
0
Fork 0

Support entering post mortem on crashes in root actor

debug_tests
Tyler Goodlet 2020-09-12 11:47:14 -04:00
parent 291ecec070
commit 150179bfe4
1 changed files with 85 additions and 44 deletions

View File

@ -1,8 +1,10 @@
""" """
Multi-core debugging for da peeps! Multi-core debugging for da peeps!
""" """
import bdb
import sys import sys
from functools import partial from functools import partial
from contextlib import asynccontextmanager, AsyncExitStack
from typing import Awaitable, Tuple, Optional, Callable from typing import Awaitable, Tuple, Optional, Callable
from async_generator import aclosing from async_generator import aclosing
@ -10,6 +12,7 @@ import tractor
import trio import trio
from .log import get_logger from .log import get_logger
from . import _state
try: try:
# wtf: only exported when installed in dev mode? # wtf: only exported when installed in dev mode?
@ -92,36 +95,39 @@ class PdbwTeardown(pdbpp.Pdb):
# if bmsg in _pdb_exit_patterns: # if bmsg in _pdb_exit_patterns:
# log.info("Closing stdin hijack") # log.info("Closing stdin hijack")
# break # break
@asynccontextmanager
async def _acquire_debug_lock():
    """Acquire an actor-local FIFO lock meant to mutex entry to a local
    debugger entry point to avoid tty clobbering by multiple processes.

    The lock lives under the ``'_debug_lock'`` key of the current
    actor's ``statespace`` and is created on first use. It is held for
    the duration of the ``async with`` body and released on exit, but
    only when this task actually acquired it.
    """
    actor = tractor.current_actor()
    debug_lock = actor.statespace.setdefault(
        '_debug_lock', trio.StrictFIFOLock()
    )
    # Acquire *outside* the ``try``: if the acquire is cancelled (or
    # fails) we must not reach the release below. Only the owning task
    # may release a trio lock, and ``.locked()`` cannot tell us whether
    # it is *this* task that holds it.
    await debug_lock.acquire()
    try:
        # NOTE(review): logged at error level although nothing failed --
        # presumably for visibility while debugging; confirm.
        log.error("TTY lock acquired")
        yield
    finally:
        debug_lock.release()
        log.error("TTY lock released")
async def _hijack_stdin_relay_to_child( async def _hijack_stdin_relay_to_child(
subactor_uid: Tuple[str, str] subactor_uid: Tuple[str, str]
) -> None: ) -> None:
actor = tractor.current_actor()
debug_lock = actor.statespace.setdefault(
'_debug_lock', trio.StrictFIFOLock()
)
log.debug(f"Actor {subactor_uid} is waiting on stdin hijack lock")
await debug_lock.acquire()
log.warning(f"Actor {subactor_uid} acquired stdin hijack lock")
# TODO: when we get to true remote debugging # TODO: when we get to true remote debugging
# this will deliver stdin data # this will deliver stdin data
try: log.debug(f"Actor {subactor_uid} is waiting on stdin hijack lock")
async with _acquire_debug_lock():
log.warning(f"Actor {subactor_uid} acquired stdin hijack lock")
# indicate to child that we've locked stdio # indicate to child that we've locked stdio
yield 'Locked' yield 'Locked'
# wait for cancellation of stream by child # wait for cancellation of stream by child
await trio.sleep_forever() await trio.sleep_forever()
# TODO: for remote debugging schedule hijacking in root scope
# (see above)
# actor._root_nursery.start_soon(hijack_stdin)
finally:
if debug_lock.locked():
debug_lock.release()
log.debug(f"Actor {subactor_uid} released stdin hijack lock") log.debug(f"Actor {subactor_uid} released stdin hijack lock")
@ -137,16 +143,13 @@ def _breakpoint(debug_func) -> Awaitable[None]:
async def wait_for_parent_stdin_hijack( async def wait_for_parent_stdin_hijack(
task_status=trio.TASK_STATUS_IGNORED task_status=trio.TASK_STATUS_IGNORED
): ):
# TODO: need a more robust check for the "root" actor
if actor._parent_chan:
try: try:
async with tractor._portal.open_portal( async with tractor._portal.open_portal(
actor._parent_chan, actor._parent_chan,
start_msg_loop=False, start_msg_loop=False,
shield=True, shield=True,
) as portal: ) as portal:
# with trio.fail_after(1): with trio.fail_after(1):
agen = await portal.run( agen = await portal.run(
'tractor._debug', 'tractor._debug',
'_hijack_stdin_relay_to_child', '_hijack_stdin_relay_to_child',
@ -182,11 +185,28 @@ def _breakpoint(debug_func) -> Awaitable[None]:
actor.statespace['_in_debug'] = True actor.statespace['_in_debug'] = True
# TODO: need a more robust check for the "root" actor
if actor._parent_chan:
# this **must** be awaited by the caller and is done using the # this **must** be awaited by the caller and is done using the
# root nursery so that the debugger can continue to run without # root nursery so that the debugger can continue to run without
# being restricted by the scope of a new task nursery. # being restricted by the scope of a new task nursery.
await actor._service_n.start(wait_for_parent_stdin_hijack) await actor._service_n.start(wait_for_parent_stdin_hijack)
# block here one (at the appropriate frame *up* where
# ``breakpoint()`` was awaited and begin handling stdio
# debug_func(actor)
else:
# we also wait in the root-parent for any child that
# may have the tty locked prior
async def _lock(
task_status=trio.TASK_STATUS_IGNORED
):
async with _acquire_debug_lock():
task_status.started()
await do_unlock.wait()
await actor._service_n.start(_lock)
# block here one (at the appropriate frame *up* where # block here one (at the appropriate frame *up* where
# ``breakpoint()`` was awaited and begin handling stdio # ``breakpoint()`` was awaited and begin handling stdio
debug_func(actor) debug_func(actor)
@ -218,3 +238,24 @@ post_mortem = partial(
_breakpoint, _breakpoint,
_post_mortem, _post_mortem,
) )
async def _maybe_enter_pm(err):
    """Enter the post-mortem debugger for ``err`` when appropriate.

    The debugger is entered only when all of the following hold, checked
    in this order:

    - ``_state.debug_mode()`` is truthy,
    - ``err`` is not a ``bdb.BdbQuit``,
    - something is left of ``err`` after ``trio.Cancelled`` exceptions
      have been filtered out of it.
    """
    if not _state.debug_mode():
        return

    if isinstance(err, bdb.BdbQuit):
        return

    def _drop_cancelled(exc):
        # returning ``None`` removes the exception from the result
        if isinstance(exc, trio.Cancelled):
            return None
        return exc

    # XXX: if the error is the likely result of runtime-wide
    # cancellation, we don't want to enter the debugger since
    # there's races between when the parent actor has killed all
    # comms and when the child tries to contact said parent to
    # acquire the tty lock.
    # Really we just want to mostly avoid catching KBIs here so there
    # might be a simpler check we can do?
    remaining = trio.MultiError.filter(_drop_cancelled, err)
    if not remaining:
        return

    log.warning("Actor crashed, entering debug mode")
    await post_mortem()