# tractor: structured concurrent "actors".
# Copyright 2018-eternity Tyler Goodlet.
# This program is free software: you can redistribute it and/or
# modify it under the terms of the GNU Affero General Public License
# as published by the Free Software Foundation, either version 3 of
# the License, or (at your option) any later version.
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Affero General Public License for more details.
# You should have received a copy of the GNU Affero General Public
# License along with this program. If not, see
# <https://www.gnu.org/licenses/>.
"""
Multi-core debugging for da peeps!
"""
from __future__ import annotations
import bdb
from contextlib import (
asynccontextmanager as acm,
contextmanager as cm,
nullcontext,
_GeneratorContextManager,
_AsyncGeneratorContextManager,
)
from functools import (
partial,
cached_property,
)
import inspect
import os
import signal
import sys
import textwrap
import threading
import traceback
from typing import (
Any,
Callable,
AsyncIterator,
AsyncGenerator,
TypeAlias,
TYPE_CHECKING,
)
from types import (
FrameType,
ModuleType,
TracebackType,
)
from msgspec import Struct
import pdbp
import sniffio
import trio
from trio import CancelScope
from trio.lowlevel import (
current_task,
Task,
)
from trio import (
TaskStatus,
)
import tractor
from tractor.log import get_logger
from tractor._state import (
current_actor,
is_root_process,
debug_mode,
current_ipc_ctx,
)
# from .pformat import pformat_caller_frame
if TYPE_CHECKING:
from tractor._ipc import Channel
from tractor._context import Context
from tractor._runtime import (
Actor,
)
from tractor.msg import (
_codec,
)
log = get_logger(__name__)
# XXX HACKZONE XXX
# hide exit stack frames on nurseries and cancel-scopes!
# |_ so avoid seeing it when the `pdbp` REPL is first engaged from
# inside a `trio.open_nursery()` scope (with no line after it
# before the block end??).
#
# TODO: FINALLY got this workin originally with
# `@pdbp.hideframe` around the `wrapper()` def embedded inside
# `_ki_protection_decorator()`.. which is in the module:
# /home/goodboy/.virtualenvs/tractor311/lib/python3.11/site-packages/trio/_core/_ki.py
#
# -[ ] make an issue and patch for `trio` core? maybe linked
# to the long outstanding `pdb` one below?
# |_ it's funny that there's frame hiding throughout `._run.py`
# but not where it matters on the below exit funcs..
#
# -[ ] provide a patchset for the longstanding
# |_ https://github.com/python-trio/trio/issues/1155
#
# -[ ] make a linked issue to ^ and propose allowing all the
# `._core._run` code to have their `__tracebackhide__` value
# configurable by a `RunVar` to allow getting scheduler frames
# if desired through configuration?
#
# -[ ] maybe dig into the core `pdb` issue why the extra frame is shown
# at all?
#
pdbp.hideframe(trio._core._run.NurseryManager.__aexit__)
pdbp.hideframe(trio._core._run.CancelScope.__exit__)
pdbp.hideframe(_GeneratorContextManager.__exit__)
pdbp.hideframe(_AsyncGeneratorContextManager.__aexit__)
pdbp.hideframe(trio.Event.wait)
__all__ = [
'breakpoint',
'post_mortem',
]
class LockStatus(
Struct,
tag=True,
tag_field='msg_type',
):
subactor_uid: tuple[str, str]
cid: str
locked: bool
class LockRelease(
Struct,
tag=True,
tag_field='msg_type',
):
subactor_uid: tuple[str, str]
cid: str
__pld_spec__: TypeAlias = LockStatus|LockRelease
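# Illustrative sketch (not part of the runtime): the 2 msgs above form
# a `msgspec` tagged union keyed on `msg_type`, so a decoder limited
# to `__pld_spec__` can round-trip either variant, eg.
#
# import msgspec
# status = LockStatus(
#     subactor_uid=('sub', '<uuid>'),
#     cid='<ctx-id>',
#     locked=True,
# )
# wire: bytes = msgspec.msgpack.encode(status)
# decoded = msgspec.msgpack.Decoder(__pld_spec__).decode(wire)
# assert isinstance(decoded, LockStatus) and decoded.locked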
class Lock:
'''
Actor-tree-global debug lock state, exists only in a root process.
Mostly to avoid a lot of global declarations for now XD.
'''
# XXX local ref to the `PdbREPL` instance, ONLY set in the
# actor-process that currently has activated a REPL
# i.e. it will be `None` (unset) in any other actor-process
# that does not have this lock acquired in the root proc.
repl: PdbREPL|None = None
@staticmethod
def get_locking_task_cs() -> CancelScope|None:
if not is_root_process():
raise RuntimeError(
'`Lock.locking_task_cs` is invalid in subactors!'
)
if ctx := Lock.ctx_in_debug:
return ctx._scope
return None
# ROOT ONLY
# ------ - -------
# the root-actor-ONLY singletons for,
#
# - the uid of the actor whose task is using a REPL
# - a literal task-lock,
# - a shielded-cancel-scope around the acquiring task*,
# - a broadcast event to signal no-actor using a REPL in tree,
# - a filter list to block subs-by-uid from locking.
#
# * in case it needs to be manually cancelled in root due to
# a stale lock condition (eg. IPC failure with the locking
# child)
ctx_in_debug: Context|None = None
no_remote_has_tty: trio.Event|None = None
_debug_lock: trio.StrictFIFOLock = trio.StrictFIFOLock()
_blocked: set[
tuple[str, str] # `Actor.uid` for per actor
|str # Context.cid for per task
] = set()
@classmethod
def repr(cls) -> str:
# both root and subs
fields: str = (
f'repl: {cls.repl}\n'
)
if is_root_process():
lock_stats: trio.LockStatistics = cls._debug_lock.statistics()
fields += (
f'no_remote_has_tty: {cls.no_remote_has_tty}\n'
f'_blocked: {cls._blocked}\n\n'
f'ctx_in_debug: {cls.ctx_in_debug}\n\n'
f'_debug_lock: {cls._debug_lock}\n'
f'lock_stats: {lock_stats}\n'
)
body: str = textwrap.indent(
fields,
prefix=' |_',
)
return (
f'<{cls.__name__}(\n'
f'{body}'
')>'
)
@classmethod
@pdbp.hideframe
def release(
cls,
force: bool = False,
):
lock: trio.StrictFIFOLock = cls._debug_lock
try:
if lock.locked():
if not DebugStatus.is_main_trio_thread():
trio.from_thread.run_sync(
cls._debug_lock.release
)
else:
cls._debug_lock.release()
message: str = 'TTY lock released for child\n'
else:
message: str = 'TTY lock not held by any child\n'
except RuntimeError as rte:
message: str = 'TTY lock FAILED to release for child??\n'
log.exception(message)
# uhhh makes no sense but been seeing the non-owner
# release error even though this is definitely the task
# that locked?
owner = cls._debug_lock.statistics().owner
# if (
# owner
# and
# cls.remote_task_in_debug is None
# ):
# raise RuntimeError(
# 'Stale `Lock` detected, no remote task active!?\n'
# f'|_{owner}\n'
# # f'{cls}'
# ) from rte
if owner:
raise rte
# OW suppress, can't member why tho .. XD
# something somethin corrupts a cancel-scope
# somewhere..
finally:
# IFF there are no more requesting tasks queued up, fire the
# "tty-unlocked" event thereby alerting any monitors of the lock that
# we are now back in the "tty unlocked" state. This is basically
# an edge triggered signal around an empty queue of sub-actor
# tasks that may have tried to acquire the lock.
stats = cls._debug_lock.statistics()
if (
not stats.owner
or force
# and cls.no_remote_has_tty is not None
):
message += '-> No more child ctx tasks hold the TTY lock!\n'
# set and release
if cls.no_remote_has_tty is not None:
cls.no_remote_has_tty.set()
cls.no_remote_has_tty = None
# cls.remote_task_in_debug = None
else:
message += (
f'-> Not signalling `Lock.no_remote_has_tty` since it has value:{cls.no_remote_has_tty}\n'
)
else:
# wakeup any waiters since the lock was released
# (presumably) temporarily.
if no_remote_has_tty := cls.no_remote_has_tty:
no_remote_has_tty.set()
# NOTE: assign the class attr (not the local) so the fresh
# event is actually visible to future waiters.
cls.no_remote_has_tty = trio.Event()
message += (
f'-> A child ctx task still owns the `Lock` ??\n'
f' |_owner task: {stats.owner}\n'
)
cls.ctx_in_debug = None
@classmethod
@acm
async def acquire(
cls,
ctx: Context,
# subactor_uid: tuple[str, str],
# remote_task_uid: str,
) -> AsyncIterator[trio.StrictFIFOLock]:
'''
Acquire a root-actor local FIFO lock which tracks mutex access of
the process tree's global debugger breakpoint.
This lock avoids tty clobbering (by preventing multiple processes
reading from stdstreams) and ensures multi-actor, sequential access
to the ``pdb`` repl.
'''
if not is_root_process():
raise RuntimeError('Only callable by a root actor task!')
# subactor_uid: tuple[str, str] = ctx.chan.uid
we_acquired: bool = False
log.runtime(
f'Attempting to acquire TTY lock for sub-actor\n'
f'{ctx}'
)
try:
pre_msg: str = (
f'Entering lock checkpoint for sub-actor\n'
f'{ctx}'
)
stats = cls._debug_lock.statistics()
if owner := stats.owner:
# and cls.no_remote_has_tty is not None
pre_msg += (
f'\n'
f'`Lock` already held by local task?\n'
f'{owner}\n\n'
# f'On behalf of task: {cls.remote_task_in_debug!r}\n'
f'On behalf of IPC ctx\n'
f'{ctx}'
)
log.runtime(pre_msg)
# NOTE: if the surrounding cancel scope from the
# `lock_tty_for_child()` caller is cancelled, this line should
# unblock and NOT leave us in some kind of
# a "child-locked-TTY-but-child-is-uncontactable-over-IPC"
# condition.
await cls._debug_lock.acquire()
cls.ctx_in_debug = ctx
we_acquired = True
if cls.no_remote_has_tty is None:
# mark the tty lock as being in use so that the runtime
# can try to avoid clobbering any connection from a child
# that's currently relying on it.
cls.no_remote_has_tty = trio.Event()
# cls.remote_task_in_debug = remote_task_uid
log.runtime(
f'TTY lock acquired for sub-actor\n'
f'{ctx}'
)
# NOTE: critical section: this yield is unshielded!
# IF we received a cancel during the shielded lock entry of some
# next-in-queue requesting task, then the resumption here will
# result in that ``trio.Cancelled`` being raised to our caller
# (likely from ``lock_tty_for_child()`` below)! In
# this case the ``finally:`` below should trigger and the
# surrounding caller side context should cancel normally
# relaying back to the caller.
yield cls._debug_lock
finally:
message: str = 'Exiting `Lock.acquire()` on behalf of sub-actor\n'
if (
we_acquired
# and
# cls._debug_lock.locked()
):
message += '-> TTY lock released by child\n'
cls.release()
else:
message += '-> TTY lock never acquired by child??\n'
log.runtime(
f'{message}\n'
f'{ctx}'
)
@tractor.context
async def lock_tty_for_child(
ctx: Context,
subactor_task_uid: tuple[str, int],
) -> LockStatus|LockRelease:
'''
Lock the TTY in the root process of an actor tree in a new
inter-actor-context-task such that the ``pdbp`` debugger console
can be mutex-allocated to the calling sub-actor for REPL control
without interference by other processes / threads.
NOTE: this task must be invoked in the root process of the actor
tree. It is meant to be invoked as an rpc-task and should be
highly reliable at releasing the mutex on completion!
'''
subactor_uid: tuple[str, str] = ctx.chan.uid
# NOTE: we use the IPC ctx's cancel scope directly in order to
# ensure that on any transport failure, or cancellation request
# from the child we expect
# `Context._maybe_cancel_and_set_remote_error()` to cancel this
# scope despite the shielding we apply below.
debug_lock_cs: CancelScope = ctx._scope
try:
if ctx.cid in Lock._blocked:
raise RuntimeError(
f'Double lock request!?\n'
f'The same remote task already has an active request for TTY lock ??\n\n'
f'subactor uid: {subactor_uid}\n\n'
'This might mean that the requesting task '
'in `request_root_stdio_lock()` may have crashed?\n'
'Consider that an internal bug exists given the TTY '
'`Lock`ing IPC dialog..\n'
)
root_task_name: str = current_task().name
if tuple(subactor_uid) in Lock._blocked:
log.warning(
f'Subactor is blocked from acquiring debug lock..\n'
f'subactor_uid: {subactor_uid}\n'
f'remote task: {subactor_task_uid}\n'
)
ctx._enter_debugger_on_cancel: bool = False
await ctx.cancel(f'Debug lock blocked for {subactor_uid}')
# TODO: remove right?
# return LockStatus(
# subactor_uid=subactor_uid,
# cid=ctx.cid,
# locked=False,
# )
# TODO: when we get to true remote debugging
# this will deliver stdin data?
log.debug(
'Subactor attempting to acquire TTY lock\n'
f'root task: {root_task_name}\n'
f'subactor_uid: {subactor_uid}\n'
f'remote task: {subactor_task_uid}\n'
)
DebugStatus.shield_sigint()
Lock._blocked.add(ctx.cid)
with (
# enable the locking msgspec
apply_debug_pldec(),
):
async with Lock.acquire(ctx=ctx):
debug_lock_cs.shield = True
# indicate to child that we've locked stdio
await ctx.started(
LockStatus(
subactor_uid=subactor_uid,
cid=ctx.cid,
locked=True,
)
)
log.debug(f'Actor {subactor_uid} acquired TTY lock')
# wait for unlock pdb by child
async with ctx.open_stream() as stream:
release_msg: LockRelease = await stream.receive()
# TODO: security around only releasing if
# these match?
log.pdb(
f'TTY lock release requested\n\n'
f'{release_msg}\n'
)
assert release_msg.cid == ctx.cid
assert release_msg.subactor_uid == tuple(subactor_uid)
log.debug(f'Actor {subactor_uid} released TTY lock')
return LockStatus(
subactor_uid=subactor_uid,
cid=ctx.cid,
locked=False,
)
except BaseException:
log.exception(
'Errored during root TTY-lock dialog?\n'
'Forcing release since an internal error caused this!\n'
)
Lock.release(force=True)
raise
finally:
Lock._blocked.remove(ctx.cid)
if (no_locker := Lock.no_remote_has_tty):
no_locker.set()
DebugStatus.unshield_sigint()
# TODO: rename to ReplState or somethin?
# DebugRequest, make it a singleton instance?
class DebugStatus:
'''
Singleton-state for debugging machinery in a subactor.
Composes conc primitives for syncing with a root actor to
acquire the tree-global (TTY) `Lock` such that only ever one
actor's task can have the REPL active at a given time.
Methods to shield the process' `SIGINT` handler are used
whenever a local task has an active REPL.
'''
repl: PdbREPL|None = None
repl_task: Task|None = None
req_ctx: Context|None = None
req_cs: CancelScope|None = None
repl_release: trio.Event|None = None
req_finished: trio.Event|None = None
lock_status: LockStatus|None = None
_orig_sigint_handler: Callable|None = None
_trio_handler: (
Callable[[int, FrameType|None], Any]
|int
| None
) = None
@classmethod
def repr(cls) -> str:
fields: str = (
f'repl: {cls.repl}\n'
f'repl_task: {cls.repl_task}\n'
f'repl_release: {cls.repl_release}\n'
f'req_ctx: {cls.req_ctx}\n'
)
body: str = textwrap.indent(
fields,
prefix=' |_',
)
return (
f'<{cls.__name__}(\n'
f'{body}'
')>'
)
# TODO: how do you get this to work on a non-inited class?
# __repr__ = classmethod(repr)
# __str__ = classmethod(repr)
@classmethod
def shield_sigint(cls):
'''
Shield out SIGINT handling (which by default triggers
`trio.Task` cancellation) in subactors when a `pdb` REPL
is active.
Avoids cancellation of the current actor (task) when the user
mistakenly sends ctl-c or via a received signal (from an
external request). Explicit runtime cancel requests are
allowed until the current REPL-session (the blocking call
`Pdb.interaction()`) exits, normally via the 'continue' or
'quit' command - at which point the orig SIGINT handler is
restored via `.unshield_sigint()` below.
Impl notes:
-----------
- we prefer that `trio`'s default handler is always used when
SIGINT is unshielded (hence disabling the `pdb.Pdb`
defaults in `mk_pdb()`) such that reliable KBI cancellation
is always enforced.
- we always detect whether we're running from a non-main
thread, in which case schedule the SIGINT shielding override
to run in the main thread as per,
https://docs.python.org/3/library/signal.html#signals-and-threads
'''
#
# XXX detect whether we're running from a non-main thread
# in which case schedule the SIGINT shielding override
# to run in the main thread.
# https://docs.python.org/3/library/signal.html#signals-and-threads
if not cls.is_main_trio_thread():
cls._orig_sigint_handler: Callable = trio.from_thread.run_sync(
signal.signal,
signal.SIGINT,
shield_sigint_handler,
)
else:
cls._orig_sigint_handler = signal.signal(
signal.SIGINT,
shield_sigint_handler,
)
@classmethod
@pdbp.hideframe # XXX NOTE XXX see below in `.pause_from_sync()`
def unshield_sigint(cls):
'''
Un-shield SIGINT for REPL-active (sub)actor.
See details in `.shield_sigint()`.
'''
# always restore ``trio``'s sigint handler. see notes below in
# the pdb factory about the nightmare that is that code swapping
# out the handler when the repl activates...
if not cls.is_main_trio_thread():
trio.from_thread.run_sync(
signal.signal,
signal.SIGINT,
cls._trio_handler,
)
else:
trio_h: Callable = cls._trio_handler
# XXX should never really happen XXX
if not trio_h:
mk_pdb().set_trace()
signal.signal(
signal.SIGINT,
cls._trio_handler,
)
cls._orig_sigint_handler = None
@classmethod
def is_main_trio_thread(cls) -> bool:
'''
Check if we're the "main" thread (as in the first one
started by cpython) AND that it is ALSO the thread that
called `trio.run()` and not some thread spawned with
`trio.to_thread.run_sync()`.
'''
is_trio_main = (
# TODO: since this is private, @oremanj says
# we should just copy the impl for now..
(is_main_thread := trio._util.is_main_thread())
and
(async_lib := sniffio.current_async_library()) == 'trio'
)
if (
not is_trio_main
and is_main_thread
):
log.warning(
f'Current async-lib detected by `sniffio`: {async_lib}\n'
)
return is_trio_main
# XXX apparently unreliable..see ^
# (
# threading.current_thread()
# is not threading.main_thread()
# )
@classmethod
@pdbp.hideframe
def release(
cls,
cancel_req_task: bool = True,
):
try:
# sometimes the task might already be terminated in
# which case this call will raise an RTE?
if cls.repl_release is not None:
cls.repl_release.set()
finally:
# if req_ctx := cls.req_ctx:
# req_ctx._scope.cancel()
if (
cancel_req_task
and
(req_cs := cls.req_cs)
):
req_cs.cancel()
# restore original sigint handler
cls.unshield_sigint()
# actor-local state, irrelevant for non-root.
cls.repl_task = None
cls.repl = None
class TractorConfig(pdbp.DefaultConfig):
'''
Custom `pdbp` config which tries to use the best tradeoff
between pretty and minimal.
'''
use_pygments: bool = True
sticky_by_default: bool = False
enable_hidden_frames: bool = True
# much thanks @mdmintz for the hot tip!
# fixes line spacing issue when resizing terminal B)
truncate_long_lines: bool = False
# ------ - ------
# our own custom config vars mostly
# for syncing with the actor tree's singleton
# TTY `Lock`.
class PdbREPL(pdbp.Pdb):
'''
Add teardown hooks and local state describing any
ongoing TTY `Lock` request dialog.
'''
# override the pdbp config with our coolio one
# NOTE: this is only loaded when no `~/.pdbrc` exists
# so we should prolly pass it into the .__init__() instead?
# i dunno, see the `DefaultFactory` and `pdb.Pdb` impls.
DefaultConfig = TractorConfig
status = DebugStatus
# def preloop(self):
# print('IN PRELOOP')
# super().preloop()
# TODO: cleaner re-wrapping of all this?
# -[ ] figure out how to disallow recursive .set_trace() entry
# since that'll cause deadlock for us.
# -[ ] maybe a `@cm` to call `super().<same_meth_name>()`?
# -[ ] look at hooking into the `pp` hook specially with our
# own set of pretty-printers?
# * `.pretty_struct.Struct.pformat()`
# * `.pformat(MsgType.pld)`
# * `.pformat(Error.tb_str)`?
# * .. maybe more?
#
def set_continue(self):
try:
super().set_continue()
finally:
DebugStatus.release()
# NOTE: for subactors the stdio lock is released via the
# allocated RPC locker task, so for root we have to do it
# manually.
if is_root_process():
Lock.release()
def set_quit(self):
try:
super().set_quit()
finally:
DebugStatus.release(
cancel_req_task=False,
)
if is_root_process():
Lock.release()
# TODO: special handling where we just want the next LOC and
# not to resume to the next pause/crash point?
# def set_next(
# self,
# frame: FrameType
# ) -> None:
# try:
# super().set_next(frame)
# finally:
# Lock.release()
# XXX NOTE: we only override this because apparently the stdlib pdb
# bois likes to touch the SIGINT handler as much as i like to touch
# my d$%&.
def _cmdloop(self):
self.cmdloop()
@cached_property
def shname(self) -> str | None:
'''
Attempt to return the login shell name with a special check for
the infamous `xonsh` since it seems to have some issues quite
different from std shells when it comes to flushing the prompt?
'''
# SUPER HACKY and only really works if `xonsh` is not used
# before spawning further sub-shells..
shpath = os.getenv('SHELL', None)
if shpath:
if (
os.getenv('XONSH_LOGIN', default=False)
or 'xonsh' in shpath
):
return 'xonsh'
return os.path.basename(shpath)
return None
@cm
def apply_debug_pldec() -> _codec.MsgCodec:
'''
Apply the subactor TTY `Lock`-ing protocol's msgspec temporarily
(only in the current task).
'''
from tractor.msg import (
_ops as msgops,
)
orig_plrx: msgops.PldRx = msgops.current_pldrx()
orig_pldec: msgops.MsgDec = orig_plrx.pld_dec
try:
with msgops.limit_plds(
spec=__pld_spec__,
) as debug_dec:
assert debug_dec is msgops.current_pldrx().pld_dec
log.runtime(
'Applied `.devx._debug` pld-spec\n\n'
f'{debug_dec}\n'
)
yield debug_dec
finally:
assert (
(plrx := msgops.current_pldrx()) is orig_plrx
and
plrx.pld_dec is orig_pldec
)
log.runtime(
'Reverted to previous pld-spec\n\n'
f'{orig_pldec}\n'
)
# TODO: add this formatter to `.devx.pformat()`!
def pformat_cs(
cs: CancelScope,
var_name: str = 'cs',
) -> str:
return (
f'{var_name}: {cs}\n'
f'{var_name}.cancel_called = {cs.cancel_called}\n'
f'{var_name}.cancelled_caught = {cs.cancelled_caught}\n'
f'{var_name}._cancel_status = {cs._cancel_status}\n'
f'{var_name}.shield = {cs.shield}\n'
)
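# Usage sketch: render a scope's state for a log emission, eg.
#
# with trio.CancelScope(shield=True) as cs:
#     log.debug(pformat_cs(cs, var_name='shield_cs'))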
async def request_root_stdio_lock(
actor_uid: tuple[str, str],
task_uid: tuple[str, int],
task_status: TaskStatus[CancelScope] = trio.TASK_STATUS_IGNORED
):
'''
Connect to the root actor of this process tree and RPC-invoke
a task which acquires a std-streams global `Lock`: an actor-tree
global mutex which prevents other subactors from entering
a `PdbREPL` at the same time as any other.
The actual `Lock` singleton exists ONLY in the root actor's
memory and does nothing more than set process-tree global state.
The actual `PdbREPL` interaction is completely isolated to each
sub-actor with the `Lock` merely providing the multi-process
syncing mechanism to prevent any subactor (or the root itself)
from entering the REPL at the same time.
'''
# TODO: likely we can implement this mutex more generally as
# a `._sync.Lock`?
# -[ ] simply add the wrapping needed for the debugger specifics?
# - the `__pld_spec__` impl and maybe better APIs for the client
# vs. server side state tracking? (`Lock` + `DebugStatus`)
# -[ ] for eg. `mp` has a multi-proc lock via the manager
# - https://docs.python.org/3.8/library/multiprocessing.html#synchronization-primitives
# -[ ] technically we need a `RLock` since re-acquire should be a noop
# - https://docs.python.org/3.8/library/multiprocessing.html#multiprocessing.RLock
DebugStatus.req_finished = trio.Event()
try:
from tractor._discovery import get_root
with (
# NOTE: we need this to ensure that this task exits
# BEFORE the REPL instance raises an error like
# `bdb.BdbQuit` directly, OW you get a trio cs stack
# corruption!
# Further, since this task is spawned inside the
# `Context._scope_nursery: trio.Nursery`, once an RPC
# task errors that cs is cancel_called and so if we want
# to debug the RPC task that failed we need to shield
# against that expected `.cancel()` call and instead
# expect all of the `PdbREPL`.set_[continue/quit/]()`
# methods to unblock this task by setting the
# `.repl_release: trio.Event`.
trio.CancelScope(shield=True) as req_cs,
# NOTE: set it here in the locker request task bc it's
# possible for multiple such requests for the lock in any
# single sub-actor AND there will be a race between when the
# root locking task delivers the `Started(pld=LockStatus)`
# and when the REPL is actually entered by the requesting
# application task who called
# `.pause()`/`.post_mortem()`.
#
# SO, applying the pld-spec here means it is only applied to
# this IPC-ctx request task, NOT any other task(s)
# including the one that actually enters the REPL. This
# is oc desired bc ow the debugged task will msg-type-error.
#
apply_debug_pldec() as debug_dec,
):
log.critical(
'Request cancel-scope is:\n\n'
f'{pformat_cs(req_cs, var_name="req_cs")}\n\n'
)
DebugStatus.req_cs = req_cs
try:
# TODO: merge into single async with ?
async with get_root() as portal:
async with portal.open_context(
lock_tty_for_child,
subactor_task_uid=task_uid,
) as (ctx, status):
DebugStatus.req_ctx = ctx
from tractor.msg import (
_ops as msgops,
)
assert (
msgops.current_pldrx().pld_dec is debug_dec
)
log.debug(
'Subactor locked TTY with msg\n\n'
f'{status}\n'
)
# mk_pdb().set_trace()
assert status.subactor_uid == actor_uid
assert status.cid
# set last rxed lock dialog status.
DebugStatus.lock_status = status
async with ctx.open_stream() as stream:
assert DebugStatus.repl_release
task_status.started(ctx)
# wait for local task to exit its
# `PdbREPL.interaction()`, call
# `DebugStatus.release()` and then
# unblock here.
await DebugStatus.repl_release.wait()
await stream.send(
LockRelease(
subactor_uid=actor_uid,
cid=status.cid,
)
)
# sync with child-side root locker task
# completion
status: LockStatus = await ctx.result()
assert not status.locked
DebugStatus.lock_status = status
log.pdb(
'TTY lock was released for subactor with msg\n\n'
f'{status}\n\n'
f'Exiting {ctx.side!r}-side of locking ctx'
)
except (
tractor.ContextCancelled,
trio.Cancelled,
):
log.exception(
'Debug lock request CANCELLED?\n\n'
f'{pformat_cs(req_cs, var_name="req_cs")}\n\n'
f'{pformat_cs(ctx._scope, var_name="ctx._scope")}\n\n'
f'{ctx}'
)
raise
except (
BaseException,
):
log.exception(
'Failed during root TTY-lock dialog?\n'
f'{ctx}\n'
f'Cancelling IPC ctx!\n'
)
await ctx.cancel()
raise
except (
tractor.ContextCancelled,
trio.Cancelled,
):
log.cancel(
'Debug lock request CANCELLED?\n'
f'{ctx}\n'
)
raise
except BaseException:
log.exception('Errored during root TTY-lock dialog?')
raise
finally:
log.debug('Exiting debugger TTY lock request func from child')
# signal request task exit
DebugStatus.req_finished.set()
def mk_pdb() -> PdbREPL:
'''
Deliver a new `PdbREPL`: a multi-process safe `pdbp.Pdb`-variant
using the magic of `tractor`'s SC-safe IPC.
B)
Our `pdb.Pdb` subtype accomplishes multi-process safe debugging
by:
- mutexing access to the root process' std-streams (& thus parent
process TTY) via an IPC managed `Lock` singleton per
actor-process tree.
- temporarily overriding any subactor's SIGINT handler to shield
during live REPL sessions in sub-actors such that cancellation
is never (mistakenly) triggered by a ctrl-c and instead only by
explicit runtime API requests or after the
`pdb.Pdb.interaction()` call has returned.
FURTHER, the `pdbp.Pdb` instance is configured to be `trio`
"compatible" from a SIGINT handling perspective; we mask out
the default `pdb` handler and instead apply `trio`s default
which mostly addresses all issues described in:
- https://github.com/python-trio/trio/issues/1155
The instance returned from this factory should always be
preferred over the default `pdb[p].set_trace()` whenever using
a `pdb` REPL inside a `trio` based runtime.
'''
pdb = PdbREPL()
# XXX: These are the important flags mentioned in
# https://github.com/python-trio/trio/issues/1155
# which resolve the traceback spews to console.
pdb.allow_kbdint = True
pdb.nosigint = True
return pdb
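# Usage sketch (assumes a surrounding `trio`/`tractor` runtime):
# prefer this factory over the stdlib default whenever manually
# engaging a REPL, eg.
#
# repl: PdbREPL = mk_pdb()
# repl.set_trace(frame=inspect.currentframe().f_back)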
def any_connected_locker_child() -> bool:
'''
Predicate to determine if a reported child subactor in debug
is actually connected.
Useful to detect stale `Lock` requests after IPC failure.
'''
actor: Actor = current_actor()
if not is_root_process():
raise RuntimeError('This is a root-actor only API!')
if (
(ctx := Lock.ctx_in_debug)
and
(uid_in_debug := ctx.chan.uid)
):
chans: list[tractor.Channel] = actor._peers.get(
tuple(uid_in_debug)
)
if chans:
return any(
chan.connected()
for chan in chans
)
return False
def shield_sigint_handler(
signum: int,
frame: 'frame', # type: ignore # noqa
*args,
) -> None:
'''
Specialized, debugger-aware SIGINT handler.
In children we always ignore/shield for SIGINT to avoid
deadlocks since cancellation should always be managed by the
supervising parent actor. The root actor-process is always
cancelled on ctrl-c.
'''
__tracebackhide__: bool = True
actor: Actor = current_actor()
def do_cancel():
# If we haven't tried to cancel the runtime then do that instead
# of raising a KBI (which may non-gracefully destroy
# a ``trio.run()``).
if not actor._cancel_called:
actor.cancel_soon()
# If the runtime is already cancelled it likely means the user
# hit ctrl-c again because teardown didn't fully take place in
# which case we do the "hard" raising of a local KBI.
else:
raise KeyboardInterrupt
# only set in the actor actually running the REPL
repl: PdbREPL|None = DebugStatus.repl
# TODO: maybe we should flatten out all these cases using
# a match/case?
#
# root actor branch that reports whether or not a child
# has locked debugger.
if is_root_process():
# try to see if the supposed (sub)actor in debug still
# has an active connection to *this* actor, and if not
# it's likely they aren't using the TTY lock / debugger
# and we should propagate SIGINT normally.
any_connected: bool = any_connected_locker_child()
# if not any_connected:
# return do_cancel()
problem = (
f'root {actor.uid} handling SIGINT\n'
f'any_connected: {any_connected}\n\n'
f'{Lock.repr()}\n'
)
if (
(ctx := Lock.ctx_in_debug)
and
(uid_in_debug := ctx.chan.uid) # "someone" is (ostensibly) using debug `Lock`
):
name_in_debug: str = uid_in_debug[0]
assert not repl
# if not repl: # but it's NOT us, the root actor.
# sanity: since no repl ref is set, we def shouldn't
# be the lock owner!
assert name_in_debug != 'root'
# IDEAL CASE: child has REPL as expected
if any_connected: # there are subactors we can contact
# XXX: only if there is an existing connection to the
# (sub-)actor in debug do we ignore SIGINT in this
# parent! Otherwise we may hang waiting for an actor
# which has already terminated to unlock.
#
# NOTE: don't emit this with `.pdb()` level in
# root without a higher level.
log.runtime(
f'Ignoring SIGINT while debug REPL in use by child '
f'{uid_in_debug}\n'
)
problem = None
else:
problem += (
'\n'
f'A `pdb` REPL is SUPPOSEDLY in use by child {uid_in_debug}\n'
f'BUT, no child actors are IPC contactable!?!?\n'
)
# IDEAL CASE: root has REPL as expected
else:
# root actor still has this SIGINT handler active without
# an actor using the `Lock` (a bug state) ??
# => so immediately cancel any stale lock cs and revert
# the handler!
if not repl:
# TODO: WHEN should we revert back to ``trio``
# handler if this one is stale?
# -[ ] maybe after a count of ctl-c mashes?
# -[ ] use a state var like `stale_handler: bool`?
problem += (
'\n'
'No subactor is using a `pdb` REPL according to `Lock.ctx_in_debug`?\n'
'BUT, the root should be using it, WHY this handler ??\n'
)
else:
log.pdb(
'Ignoring SIGINT while pdb REPL in use by root actor..\n'
)
problem = None
# XXX if one is set it means we ARE NOT operating an ideal
# case where a child subactor or us (the root) has the
# lock without any other detected problems.
if problem:
# detect, report and maybe clear a stale lock request
# cancel scope.
lock_cs: trio.CancelScope = Lock.get_locking_task_cs()
maybe_stale_lock_cs: bool = (
lock_cs is not None
and not lock_cs.cancel_called
)
if maybe_stale_lock_cs:
problem += (
'\n'
'Stale `Lock.ctx_in_debug._scope: CancelScope` detected?\n'
f'{Lock.ctx_in_debug}\n\n'
'-> Calling ctx._scope.cancel()!\n'
)
lock_cs.cancel()
# TODO: when do we actually want/need this, see above.
# DebugStatus.unshield_sigint()
log.warning(problem)
# child actor that has locked the debugger
elif not is_root_process():
log.warning(
f'Subactor {actor.uid} handling SIGINT\n\n'
f'{Lock.repr()}\n'
)
rent_chan: Channel = actor._parent_chan
if (
rent_chan is None
or
not rent_chan.connected()
):
log.warning(
'This sub-actor thinks it is debugging '
'but it has no connection to its parent ??\n'
f'{actor.uid}\n'
'Allowing SIGINT propagation..'
)
DebugStatus.unshield_sigint()
# do_cancel()
task: str|None = DebugStatus.repl_task
if (
task
and
repl
):
log.pdb(
f'Ignoring SIGINT while local task using debug REPL\n'
f'|_{task}\n'
f' |_{repl}\n'
)
else:
msg: str = (
'SIGINT shield handler still active BUT, \n\n'
)
if task is None:
msg += (
f'- No local task claims to be in debug?\n'
f' |_{task}\n\n'
)
if repl is None:
msg += (
f'- No local REPL is currently active?\n'
f' |_{repl}\n\n'
)
log.warning(
msg
+
'Reverting handler to `trio` default!\n'
)
DebugStatus.unshield_sigint()
# XXX ensure that the reverted-to-handler actually is
# able to rx what should have been **this** KBI ;)
do_cancel()
# raise KeyboardInterrupt
# TODO: how to handle the case of an intermediary-child actor
# that **is not** marked in debug mode? See outstanding issue:
# https://github.com/goodboy/tractor/issues/320
# elif debug_mode():
# NOTE: currently (at least on ``fancycompleter`` 0.9.2)
# it looks to be that the last command that was run (eg. ll)
# will be repeated by default.
# maybe redraw/print last REPL output to console since
# we want to alert the user that more input is expected since
# nothing has been done due to ignoring sigint.
if (
repl # only when current actor has a REPL engaged
):
# XXX: yah, mega hack, but how else do we catch this madness XD
if repl.shname == 'xonsh':
repl.stdout.write(repl.prompt)
repl.stdout.flush()
# TODO: make this work like sticky mode where if there is output
# detected as written to the tty we redraw this part underneath
# and erase the past draw of this same bit above?
# repl.sticky = True
# repl._print_if_sticky()
# also see these links for an approach from ``ptk``:
# https://github.com/goodboy/tractor/issues/130#issuecomment-663752040
# https://github.com/prompt-toolkit/python-prompt-toolkit/blob/c2c6af8a0308f9e5d7c0e28cb8a02963fe0ce07a/prompt_toolkit/patch_stdout.py
# XXX only for tracing this handler
# log.warning('exiting SIGINT')
_pause_msg: str = 'Attaching to pdb REPL in actor'
async def _pause(
debug_func: Callable|None,
# NOTE: must be passed in the `.pause_from_sync()` case!
repl: PdbREPL|None = None,
# TODO: allow caller to pause despite task cancellation,
# exactly the same as wrapping with:
# with CancelScope(shield=True):
# await pause()
# => the REMAINING ISSUE is that the scope's .__exit__() frame
# is always shown in the debugger on entry.. and there seems to
# be no way to override it?..
#
# shield: bool = False,
hide_tb: bool = True,
# bc, `debug_func()`, `_enter_repl_sync()` and `_pause()`
# extra_frames_up_when_async: int = 3,
task_status: TaskStatus[trio.Event] = trio.TASK_STATUS_IGNORED,
**debug_func_kwargs,
) -> None:
'''
Inner impl for `pause()` to avoid the `trio.CancelScope.__exit__()`
stack frame when not shielded (since apparently i can't figure out
how to hide it using the normal mechanisms..)
Hopefully we won't need this in the long run.
'''
__tracebackhide__: bool = hide_tb
actor: Actor = current_actor()
try:
# TODO: use the `Task` instance instead for `is` checks
# below!
task: Task = trio.lowlevel.current_task()
except RuntimeError as rte:
if actor.is_infected_aio():
raise RuntimeError(
'`tractor.pause[_from_sync]()` not yet supported '
'for infected `asyncio` mode!'
) from rte
# TODO: this should be created as part of `DebugRequest()` init
# which should instead be a one-shot-use singleton much like
# the `PdbREPL`.
if (
not DebugStatus.repl_release
or
DebugStatus.repl_release.is_set()
):
DebugStatus.repl_release = trio.Event()
if debug_func is not None:
debug_func = partial(debug_func)
repl: PdbREPL = repl or mk_pdb()
# TODO: maybe make this a `PdbREPL` method or mod func?
# -[ ] factor out better, main reason for it is common logic for
# both root and sub repl entry
def _enter_repl_sync(
debug_func: Callable,
) -> None:
__tracebackhide__: bool = hide_tb
# TODO: do we want to support using this **just** for the
# locking / common code (prolly to help address #320)?
#
if debug_func is None:
task_status.started(DebugStatus)
else:
# block here (at the appropriate frame *up*) where
# ``breakpoint()`` was awaited and begin handling stdio.
log.debug('Entering sync world of the `pdb` REPL..')
# XXX used by the SIGINT handler to check if
# THIS actor is in REPL interaction
try:
# TODO: move this into a `open_debug_request()` @acm?
# -[ ] prolly makes the most sense to do the request
# task spawn as part of an `@acm` api which
# delivers the `DebugRequest` instance and ensures
# encapsulating all the pld-spec and debug-nursery?
#
# set local actor task to avoid recurrent
# entries/requests from the same local task
# (to the root process).
DebugStatus.repl_task = task
DebugStatus.repl = repl
DebugStatus.shield_sigint()
# enter `PdbREPL` specific method
debug_func(
repl=repl,
hide_tb=hide_tb,
**debug_func_kwargs,
)
except trio.Cancelled:
log.exception(
'Cancelled during invoke of internal `debug_func = '
f'{debug_func.func.__name__}`\n'
)
# NOTE: DON'T release lock yet
raise
except BaseException:
log.exception(
'Failed to invoke internal `debug_func = '
f'{debug_func.func.__name__}`\n'
)
# NOTE: OW this is ONLY called from the
# `.set_continue/next` hooks!
DebugStatus.release()
raise
try:
if is_root_process():
# we also wait in the root-parent for any child that
# may have the tty locked prior
# TODO: wait, what about multiple root tasks acquiring it though?
ctx: Context|None = Lock.ctx_in_debug
if (
ctx is None
and
DebugStatus.repl
and
DebugStatus.repl_task is task
):
# re-entrant root process already has it: noop.
log.warning(
f'{task.name}@{actor.uid} already has TTY lock\n'
f'ignoring..'
)
await trio.lowlevel.checkpoint()
return
# XXX: since we need to enter pdb synchronously below,
# we have to release the lock manually from pdb completion
# callbacks. Can't think of a nicer way than this atm.
if Lock._debug_lock.locked():
log.warning(
'attempting to shield-acquire active TTY lock owned by\n'
f'{ctx}'
)
# must shield here to avoid hitting a ``Cancelled`` and
# a child getting stuck bc we clobbered the tty
with trio.CancelScope(shield=True):
await Lock._debug_lock.acquire()
else:
# may be cancelled
await Lock._debug_lock.acquire()
# enter REPL from root, no TTY locking IPC ctx necessary
_enter_repl_sync(debug_func)
return # next branch is mutex and for subactors
# TODO: need a more robust check for the "root" actor
elif (
not is_root_process()
and actor._parent_chan # a connected child
):
if DebugStatus.repl_task:
# Recurrence entry case: this task already has the lock and
# is likely recurrently entering a breakpoint
#
# NOTE: noop on recurrent entry case but we want to trigger
# a checkpoint to allow other actors error-propagate and
# potentially avoid infinite re-entries in some
# subactor that would otherwise not bubble until the
# next checkpoint was hit.
if (
(repl_task := DebugStatus.repl_task)
and
repl_task is task
):
log.warning(
f'{task.name}@{actor.uid} already has TTY lock\n'
f'ignoring..'
)
await trio.lowlevel.checkpoint()
return
# if **this** actor is already in debug REPL we want
# to maintain actor-local-task mutex access, so block
# here waiting for the control to be released - this
# -> allows for recursive entries to `tractor.pause()`
log.warning(
f'{task.name}@{actor.uid} already has TTY lock\n'
f'waiting for release..'
)
await DebugStatus.repl_release.wait()
await trio.sleep(0.1)
# this **must** be awaited by the caller and is done using the
# root nursery so that the debugger can continue to run without
# being restricted by the scope of a new task nursery.
# TODO: if we want to debug a trio.Cancelled triggered exception
# we have to figure out how to avoid having the service nursery
# cancel on this task start? I *think* this works below:
# ```python
# actor._service_n.cancel_scope.shield = shield
# ```
# but not entirely sure if that's a sane way to implement it?
try:
# NOTE spawn the stdio locker request task inside the
# current `Context._scope_nursery` to ensure that
# the request never can outlive the task's (parent)
# lifetime.
curr_ctx: Context = current_ipc_ctx()
# TODO: see `_errors_relayed_via_ipc()` where we
# should dynamically open a `debug_tn` for use here,
# BUT it needs to be outside the normal error
# catching and `_maybe_enter_debugger()` call!
# ctx: Context = await curr_ctx._debug_tn.start(
ctx: Context = await actor._service_n.start(
request_root_stdio_lock,
actor.uid,
(task.name, id(task)), # task uuid (effectively)
)
# our locker task should be the one in ctx
# with the root actor
assert (
ctx
is
DebugStatus.req_ctx
is not
curr_ctx
)
# enter REPL
_enter_repl_sync(debug_func)
except RuntimeError:
if actor._cancel_called:
# service nursery won't be usable and we
# don't want to lock up the root either way since
# we're in (the midst of) cancellation.
return
raise
# TODO: prolly factor this plus the similar block from
# `_enter_repl_sync()` into a common @cm?
except BaseException as repl_err:
if isinstance(repl_err, bdb.BdbQuit):
log.devx(
'REPL for pdb was quit!\n'
)
else:
log.exception(
'Failed to engage debugger via `_pause()` ??\n'
)
DebugStatus.release()
# sanity checks for ^ on request/status teardown
assert DebugStatus.repl is None
assert DebugStatus.repl_task is None
req_ctx: Context = DebugStatus.req_ctx
if req_ctx:
assert req_ctx._scope.cancel_called
raise
def _set_trace(
repl: PdbREPL, # passed by `_pause()`
hide_tb: bool,
# partial-ed in by `.pause()`
api_frame: FrameType,
):
__tracebackhide__: bool = hide_tb
actor: tractor.Actor = current_actor()
# else:
# TODO: maybe print the actor supervision tree up to the
# root here? Bo
log.pdb(
f'{_pause_msg}\n'
'|\n'
# TODO: make an `Actor.__repr()__`
f'|_ {current_task()} @ {actor.uid}\n'
)
# presuming the caller passed in the "api frame"
# (the last frame before user code - like `.pause()`)
# then we only step up one frame to where the user
# called our API.
caller_frame: FrameType = api_frame.f_back # type: ignore
# engage ze REPL
# B~()
repl.set_trace(frame=caller_frame)
async def pause(
*,
hide_tb: bool = True,
api_frame: FrameType|None = None,
# TODO: figure out how to still make this work:
# -[ ] pass it direct to `_pause()`?
# -[ ] use it to set the `debug_nursery.cancel_scope.shield`
shield: bool = False,
**_pause_kwargs,
) -> None:
'''
A pause point (more commonly known as a "breakpoint") interrupt
instruction for engaging a blocking debugger instance to
conduct manual console-based-REPL-interaction from within
`tractor`'s async runtime, normally from some single-threaded
and currently executing actor-hosted-`trio`-task in some
(remote) process.
NOTE: we use the semantics "pause" since it better encompasses
the entirety of the necessary global-runtime-state-mutation any
actor-task must access and lock in order to get full isolated
control over the process tree's root TTY:
https://en.wikipedia.org/wiki/Breakpoint
'''
__tracebackhide__: bool = True
# always start 1 level up from THIS in user code since normally
# `tractor.pause()` is called explicitly by user-app code thus
# making it the highest up @api_frame.
api_frame: FrameType = api_frame or inspect.currentframe()
# XXX TODO: this was causing cs-stack corruption in trio due to
# usage within the `Context._scope_nursery` (which won't work
# based on scoping of it versus call to `_maybe_enter_debugger()`
# from `._rpc._invoke()`)
# with trio.CancelScope(
# shield=shield,
# ) as cs:
# NOTE: so the caller can always manually cancel even
# if shielded!
# task_status.started(cs)
# log.critical(
# '`.pause() cancel-scope is:\n\n'
# f'{pformat_cs(cs, var_name="pause_cs")}\n\n'
# )
await _pause(
debug_func=partial(
_set_trace,
api_frame=api_frame,
),
# task_status=task_status,
**_pause_kwargs
)
# XXX avoid cs stack corruption when `PdbREPL.interaction()`
# raises `BdbQuit`.
# await DebugStatus.req_finished.wait()
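# Usage sketch: an async "breakpoint" from any actor-task, eg.
#
# async def my_task():
#     ...
#     # blocks here with the tree-wide TTY `Lock` held until the
#     # user continues/quits the REPL.
#     await tractor.pause()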
_gb_mod: None|ModuleType|False = None
def maybe_import_greenback(
raise_not_found: bool = True,
force_reload: bool = False,
) -> ModuleType|False:
# be cached-fast on module-already-inited
global _gb_mod
if _gb_mod is False:
return False
elif (
_gb_mod is not None
and not force_reload
):
return _gb_mod
try:
import greenback
_gb_mod = greenback
return greenback
except ModuleNotFoundError as mnf:
log.debug(
'`greenback` is not installed.\n'
'No sync debug support!\n'
)
_gb_mod = False
if raise_not_found:
raise RuntimeError(
'The `greenback` lib is required to use `tractor.pause_from_sync()`!\n'
'https://github.com/oremanj/greenback\n'
) from mnf
return False
async def maybe_init_greenback(
**kwargs,
) -> None|ModuleType:
if mod := maybe_import_greenback(**kwargs):
await mod.ensure_portal()
log.info(
'`greenback` portal opened!\n'
'Sync debug support activated!\n'
)
return mod
return None
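# Usage sketch (assumes `greenback` is installed): init the portal
# early in an actor's main task so that later sync code can call
# `pause_from_sync()`, eg.
#
# async def main():
#     await maybe_init_greenback()
#     some_sync_code_that_eventually_pauses()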
# TODO: allow pausing from sync code.
# normally by remapping python's builtin breakpoint() hook to this
# runtime aware version which takes care of all of it.
def pause_from_sync(
hide_tb: bool = False,
) -> None:
__tracebackhide__: bool = hide_tb
actor: tractor.Actor = current_actor(
err_on_no_runtime=False,
)
log.debug(
f'{actor.uid}: JUST ENTERED `tractor.pause_from_sync()`'
f'|_{actor}\n'
)
if not actor:
raise RuntimeError(
'Not inside the `tractor`-runtime?\n'
'`tractor.pause_from_sync()` is not functional without a wrapping\n'
'- `async with tractor.open_nursery()` or,\n'
'- `async with tractor.open_root_actor()`\n'
)
# NOTE: once supported, remove this AND the one
# inside `._pause()`!
if actor.is_infected_aio():
raise RuntimeError(
'`tractor.pause[_from_sync]()` not yet supported '
'for infected `asyncio` mode!'
)
# raises on not-found by default
greenback: ModuleType = maybe_import_greenback()
mdb: PdbREPL = mk_pdb()
# run async task which will lock out the root proc's TTY.
if not DebugStatus.is_main_trio_thread():
# TODO: we could also check for a non-`.to_thread` context
# using `trio.from_thread.check_cancelled()` (says
# oremanj) wherein we get the following outputs:
#
# `RuntimeError`: non-`.to_thread` spawned thread
# noop: non-cancelled `.to_thread`
# `trio.Cancelled`: cancelled `.to_thread`
#
# NOTE: call the inner `_pause()` directly here since (unlike
# the public `pause()`) it accepts the `debug_func`/`repl`
# kwargs needed to request the lock w/o entering the REPL
# from this task.
trio.from_thread.run(
partial(
_pause,
debug_func=None,
repl=mdb,
hide_tb=hide_tb,
)
)
# TODO: maybe the `trio.current_task()` id/name if avail?
DebugStatus.repl_task: str = str(threading.current_thread())
else: # we are presumably the `trio.run()` + main thread
greenback.await_(
_pause(
debug_func=None,
repl=mdb,
hide_tb=hide_tb,
)
)
DebugStatus.repl_task: str = current_task()
# TODO: ensure we aggressively make the user aware about
# entering the global ``breakpoint()`` built-in from sync
# code?
_set_trace(
api_frame=inspect.currentframe(),
repl=mdb,
hide_tb=hide_tb,
# TODO? will we ever need it?
# -> the gb._await() won't be affected by cancellation?
# shield=shield,
)
# LEGACY NOTE on next LOC's frame showing weirdness..
#
# XXX NOTE XXX no other LOC can be here without it
# showing up in the REPL's last stack frame !?!
# -[ ] tried to use `@pdbp.hideframe` decoration but
# still doesn't work
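# Usage sketch: a sync-code "breakpoint" from inside an actor's
# `trio` runtime (with `greenback` inited as per above), eg.
#
# def sync_helper():
#     ...
#     pause_from_sync()  # REPL engages here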
# NOTE prefer a new "pause" semantic since it better describes
# "pausing the actor's runtime" for this particular
# paralell task to do debugging in a REPL.
async def breakpoint(**kwargs):
log.warning(
'`tractor.breakpoint()` is deprecated!\n'
'Please use `tractor.pause()` instead!\n'
)
__tracebackhide__: bool = True
await pause(
api_frame=inspect.currentframe(),
**kwargs,
)
_crash_msg: str = (
'Attaching to pdb REPL in crashed actor'
)
def _post_mortem(
# provided and passed by `_pause()`
repl: PdbREPL,
# XXX all `partial`-ed in by `post_mortem()` below!
tb: TracebackType,
api_frame: FrameType,
shield: bool = False,
hide_tb: bool = False,
) -> None:
'''
Enter the ``pdbpp`` post mortem entrypoint using our custom
debugger instance.
'''
__tracebackhide__: bool = hide_tb
actor: tractor.Actor = current_actor()
# TODO: print the actor supervision tree up to the root
# here! Bo
log.pdb(
f'{_crash_msg}\n'
'|\n'
# f'|_ {current_task()}\n'
f'|_ {current_task()} @ {actor.uid}\n'
# f'|_ @{actor.uid}\n'
# TODO: make an `Actor.__repr()__`
# f'|_ {current_task()} @ {actor.name}\n'
)
# NOTE only replacing this from `pdbp.xpm()` to add the
# `end=''` to the print XD
print(traceback.format_exc(), end='')
caller_frame: FrameType = api_frame.f_back
# NOTE: see the impl details of followings to understand usage:
# - `pdbp.post_mortem()`
# - `pdbp.xps()`
# - `bdb.interaction()`
repl.reset()
repl.interaction(
frame=caller_frame,
# frame=None,
traceback=tb,
)
async def post_mortem(
*,
tb: TracebackType|None = None,
api_frame: FrameType|None = None,
hide_tb: bool = False,
# TODO: support shield here just like in `pause()`?
# shield: bool = False,
**_pause_kwargs,
) -> None:
__tracebackhide__: bool = hide_tb
tb: TracebackType = tb or sys.exc_info()[2]
# TODO: do upward stack scan for highest @api_frame and
# use its parent frame as the expected user-app code
# interact point.
api_frame: FrameType = api_frame or inspect.currentframe()
await _pause(
debug_func=partial(
_post_mortem,
api_frame=api_frame,
tb=tb,
),
hide_tb=hide_tb,
**_pause_kwargs
)
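# Usage sketch: manual crash handling around some app code, eg.
#
# try:
#     await might_raise()
# except Exception:
#     await post_mortem()
#     raise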
async def _maybe_enter_pm(
err: BaseException,
*,
tb: TracebackType|None = None,
api_frame: FrameType|None = None,
hide_tb: bool = False,
):
from tractor._exceptions import is_multi_cancelled
if (
debug_mode()
# NOTE: don't enter debug mode recursively after quitting pdb
# Iow, don't re-enter the repl if the `quit` command was issued
# by the user.
and not isinstance(err, bdb.BdbQuit)
# XXX: if the error is the likely result of runtime-wide
# cancellation, we don't want to enter the debugger since
# there's races between when the parent actor has killed all
# comms and when the child tries to contact said parent to
# acquire the tty lock.
# Really we just want to mostly avoid catching KBIs here so there
# might be a simpler check we can do?
and not is_multi_cancelled(err)
):
api_frame: FrameType = api_frame or inspect.currentframe()
tb: TracebackType = tb or sys.exc_info()[2]
await post_mortem(
api_frame=api_frame,
tb=tb,
)
return True
else:
return False
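# Usage sketch (runtime-internal pattern): only engage the REPL when
# `debug_mode()` is set and the error isn't a REPL-quit or a
# runtime-wide cancellation, eg.
#
# try:
#     await invoke_rpc_task()
# except BaseException as err:
#     if not await _maybe_enter_pm(err):
#         raise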
@acm
async def acquire_debug_lock(
subactor_uid: tuple[str, str],
) -> AsyncGenerator[
trio.CancelScope|None,
tuple,
]:
'''
Request to acquire the TTY `Lock` in the root actor, release on exit.
This helper is for actors who don't actually need to acquire
the debugger but want to wait until the lock is free in the
process-tree root such that they don't clobber an ongoing pdb
REPL session in some peer or child!
'''
if not debug_mode():
yield None
return
async with trio.open_nursery() as n:
ctx: Context = await n.start(
request_root_stdio_lock,
subactor_uid,
)
yield ctx
ctx.cancel()
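# Usage sketch: block until no peer/child holds the TTY `Lock`
# before doing stdio-sensitive work, eg.
#
# async with acquire_debug_lock(current_actor().uid):
#     ...  # safe from clobbering an ongoing REPL session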
async def maybe_wait_for_debugger(
poll_steps: int = 2,
poll_delay: float = 0.1,
child_in_debug: bool = False,
header_msg: str = '',
) -> bool: # was locked and we polled?
if (
not debug_mode()
and not child_in_debug
):
return False
msg: str = header_msg
if (
is_root_process()
):
# If we error in the root but the debugger is
# engaged we don't want to prematurely kill (and
# thus clobber access to) the local tty since it
# will make the pdb repl unusable.
# Instead try to wait for pdb to be released before
# tearing down.
ctx_in_debug: Context|None = Lock.ctx_in_debug
in_debug: tuple[str, str]|None = ctx_in_debug.chan.uid if ctx_in_debug else None
if in_debug == current_actor().uid:
log.debug(
msg
+
'Root already owns the TTY LOCK'
)
return True
elif in_debug:
msg += (
f'Debug `Lock` in use by subactor\n|\n|_{in_debug}\n'
)
# TODO: could this make things more deterministic?
# wait to see if a sub-actor task will be
# scheduled and grab the tty lock on the next
# tick?
# XXX => but it doesn't seem to work..
# await trio.testing.wait_all_tasks_blocked(cushion=0)
else:
log.debug(
msg
+
'Root immediately acquired debug TTY LOCK'
)
return False
for istep in range(poll_steps):
if (
Lock.no_remote_has_tty is not None
and not Lock.no_remote_has_tty.is_set()
and in_debug is not None
):
# caller_frame_info: str = pformat_caller_frame()
log.debug(
msg
+
'\nRoot is waiting on tty lock to release from\n\n'
# f'{caller_frame_info}\n'
)
if not any_connected_locker_child():
Lock.get_locking_task_cs().cancel()
with trio.CancelScope(shield=True):
await Lock.no_remote_has_tty.wait()
log.pdb(
f'Subactor released debug lock\n'
f'|_{in_debug}\n'
)
break
# is no subactor locking debugger currently?
if (
in_debug is None
and (
Lock.no_remote_has_tty is None
or Lock.no_remote_has_tty.is_set()
)
):
log.pdb(
msg
+
'Root acquired tty lock!'
)
break
else:
# TODO: don't need this right?
# await trio.lowlevel.checkpoint()
log.debug(
'Root polling for debug:\n'
f'poll step: {istep}\n'
f'poll delay: {poll_delay}'
)
with CancelScope(shield=True):
await trio.sleep(poll_delay)
continue
# fallthrough on failure to acquire..
# else:
# raise RuntimeError(
# msg
# +
# 'Root actor failed to acquire debug lock?'
# )
return True
# else:
# # TODO: non-root call for #320?
# this_uid: tuple[str, str] = current_actor().uid
# async with acquire_debug_lock(
# subactor_uid=this_uid,
# ):
# pass
return False
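# Usage sketch: call from root-side teardown/cancellation paths to
# avoid killing the TTY out from under a child's active REPL, eg.
#
# if await maybe_wait_for_debugger(child_in_debug=True):
#     log.debug('A child held the TTY lock during teardown..')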
# TODO: better naming and what additionals?
# - [ ] optional runtime plugging?
# - [ ] detection for sync vs. async code?
# - [ ] specialized REPL entry when in distributed mode?
# - [x] allow ignoring kbi Bo
@cm
def open_crash_handler(
catch: set[BaseException] = {
Exception,
BaseException,
},
ignore: set[BaseException] = {
KeyboardInterrupt,
},
):
'''
Generic "post mortem" crash handler using `pdbp` REPL debugger.
We expose this as a CLI framework addon to both `click` and
`typer` users so they can quickly wrap cmd endpoints such that
the runtime's `debug_mode: bool` AND `pdbp.pm()` are applied
around any code that is PRE-runtime entry - i.e. any sync code
which runs BEFORE the main call to `trio.run()`.
'''
try:
yield
except tuple(catch) as err:
if type(err) not in ignore:
pdbp.xpm()
raise
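# Usage sketch (hypothetical `click` endpoint): wrap any sync,
# pre-`trio.run()` entrypoint, eg.
#
# @click.command()
# def cli_main():
#     with open_crash_handler():
#         trio.run(main)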
@cm
def maybe_open_crash_handler(pdb: bool = False):
'''
Same as `open_crash_handler()` but with bool input flag
to allow conditional handling.
Normally this is used with CLI endpoints such that if the --pdb
flag is passed the pdb REPL is engaged on any crashes B)
'''
rtctx = nullcontext
if pdb:
rtctx = open_crash_handler
with rtctx():
yield
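# Usage sketch (hypothetical CLI flag wiring), eg.
#
# @click.command()
# @click.option('--pdb', is_flag=True)
# def cli_main(pdb: bool):
#     with maybe_open_crash_handler(pdb=pdb):
#         trio.run(main)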