Route `stackscope` SIGUSR1 onto trio loop

Signal handlers fire in a non-trio stack frame; calling
`stackscope.extract(recurse_child_tasks=True)` from there
only walks the `<init>` task and misses everything inside
`async_main`'s nurseries — exactly the part you want to
see during a hang.

Fix: capture `trio.lowlevel.current_trio_token()` at
`enable_stack_on_sig()` time and stash it as a module-
level `_trio_token`. The SIGUSR1 handler then dispatches
the dump *onto* the trio loop via
`_trio_token.run_sync_soon(_safe_dump_task_tree)`, so
`stackscope.extract` runs from a real trio-task context
and walks the full nursery tree.

Late-binding: pytest's `pytest_configure` calls
`enable_stack_on_sig()` outside any `trio.run`, so token
capture there raises `RuntimeError` and `_trio_token` is
left at `None`. The runtime re-calls
`enable_stack_on_sig()` from inside `async_main`
(subactor side) where the token IS available, so
subactors get the full-tree path.
`dump_tree_on_sig` falls back to a direct call when
`_trio_token is None` (parent process pre-trio.run, or
signal delivered after `trio.run` returns).

`_safe_dump_task_tree()` is a `run_sync_soon`-friendly
wrapper that swallows any exception from
`dump_task_tree()` — trio prints + crashes on uncaught
exceptions in scheduled callbacks; better to log + keep
the run alive so the user can re-trigger.

Other changes:
- emit `capture-bypass tee: <fpath>` line + `tail -f`
  hint in the rendered dump header so users know where
  to find the artifact even when stdio is captured.
- swap the inline `f'     |_{actor}'` line for a
  `_pformat.nest_from_op` rendering of `actor_repr`
  (matches the rest of the runtime's nested-op style).
- log lines on handler install + already-installed
  branches now note `(trio_token captured: <bool>)`
  so it's obvious from the log whether the full-tree
  path is wired.

(this patch was generated in some part by [`claude-code`][claude-code-gh])
[claude-code-gh]: https://github.com/anthropics/claude-code
subint_forkserver_backend
Gud Boi 2026-04-29 12:01:03 -04:00
parent 8c730193f9
commit 2d4995e08d
1 changed files with 73 additions and 13 deletions

View File

@ -47,7 +47,9 @@ from typing import (
import trio import trio
from tractor.runtime import _state from tractor.runtime import _state
from tractor import log as logmod from tractor import log as logmod
from tractor.devx import debug from tractor.devx import (
debug,
)
log = logmod.get_logger() log = logmod.get_logger()
@ -109,16 +111,29 @@ def dump_task_tree() -> None:
# |_{Supervisor/Scope # |_{Supervisor/Scope
# |_[Storage/Memory/IPC-Stream/Data-Struct # |_[Storage/Memory/IPC-Stream/Data-Struct
fpath: str = f'/tmp/tractor-stackscope-{os.getpid()}.log'
from . import _pformat
actor_repr: str = _pformat.nest_from_op(
input_op='|_',
text=f'{actor}',
nest_prefilx='|_',
nest_indent=3,
)
full_dump: str = ( full_dump: str = (
f'Dumping `stackscope` tree for actor\n' f'Dumping `stackscope` tree for actor\n'
f'(>: {actor.uid!r}\n' f'(>: {actor.uid!r}\n'
f' |_{mp.current_process()}\n' f' |_{mp.current_process()}\n'
f' |_{thr}\n' f' |_{thr}\n'
f' |_{actor}\n' # TODO, use the nest_from_op
f'{actor_repr}'
# f' |_{actor}'
f'\n' f'\n'
f'{sigint_handler_report}\n' f'{sigint_handler_report}\n'
f'signal.getsignal(SIGINT) -> {current_sigint_handler!r}\n' f'signal.getsignal(SIGINT) -> {current_sigint_handler!r}\n'
f'\n' f'\n'
f'capture-bypass tee: {fpath}\n'
f'(`tail -f {fpath}` to follow across signals)\n'
f'\n'
f'------ start-of-{actor.uid!r} ------\n' f'------ start-of-{actor.uid!r} ------\n'
f'|\n' f'|\n'
f'{tree_str}' f'{tree_str}'
@ -131,7 +146,6 @@ def dump_task_tree() -> None:
# `--capture=fd` swallows `log.devx()` above; the # `--capture=fd` swallows `log.devx()` above; the
# following two writes guarantee the dump reaches the # following two writes guarantee the dump reaches the
# human even when stdio is captured. # human even when stdio is captured.
fpath: str = f'/tmp/tractor-stackscope-{os.getpid()}.log'
try: try:
with open(fpath, 'a') as f: with open(fpath, 'a') as f:
f.write(full_dump + '\n') f.write(full_dump + '\n')
@ -151,6 +165,34 @@ def dump_task_tree() -> None:
_handler_lock = RLock() _handler_lock = RLock()
_tree_dumped: bool = False _tree_dumped: bool = False
# Captured at `enable_stack_on_sig()` time when running
# inside a trio task. `dump_tree_on_sig` uses this to
# schedule `dump_task_tree` ON the trio loop via
# `token.run_sync_soon` so stackscope sees a real current
# task and can recurse into nursery children. Without
# it (signal handler running in a non-trio stack frame),
# `stackscope.extract` only walks the `<init>` task and
# misses everything inside `async_main`'s nurseries.
_trio_token: trio.lowlevel.TrioToken|None = None
def _safe_dump_task_tree() -> None:
    '''
    Never-raising shim around `dump_task_tree()`, meant to
    be handed to `TrioToken.run_sync_soon()`.

    Anything `dump_task_tree()` raises is caught and
    reported via `log.exception()` instead of propagating
    out of the scheduled callback, so the surrounding run
    stays alive and the dump can be re-triggered.

    '''
    try:
        dump_task_tree()
    except BaseException:
        # deliberately catch-all: nothing may escape from
        # a loop-scheduled callback.
        report: str = (
            '`dump_task_tree()` raised '
            '(scheduled via `run_sync_soon`); '
            'continuing.\n'
        )
        log.exception(report)
def dump_tree_on_sig( def dump_tree_on_sig(
sig: int, sig: int,
@ -174,16 +216,17 @@ def dump_tree_on_sig(
'Trying to dump `stackscope` tree..\n' 'Trying to dump `stackscope` tree..\n'
) )
try: try:
dump_task_tree() # Prefer scheduling on the trio loop — runs the
# await actor._service_n.start_soon( # dump from a real trio-task context so
# partial( # `stackscope.extract(recurse_child_tasks=True)`
# trio.to_thread.run_sync, # walks every nursery child instead of seeing
# dump_task_tree, # only the `<init>` task. Falls back to a direct
# ) # call when no token was captured (e.g. signal
# ) # delivered outside a trio.run).
# trio.lowlevel.current_trio_token().run_sync_soon( if _trio_token is not None:
# dump_task_tree _trio_token.run_sync_soon(_safe_dump_task_tree)
# ) else:
dump_task_tree()
except RuntimeError: except RuntimeError:
log.exception( log.exception(
@ -269,11 +312,27 @@ def enable_stack_on_sig(
) )
return None return None
# Capture the trio token if we're inside `trio.run`
# so SIGUSR1 dispatches the dump *onto* the trio loop
# (full task-tree visibility). When called outside trio
# (e.g. from `pytest_configure`), token capture fails
# silently and `dump_tree_on_sig` falls back to the
# direct-call path.
global _trio_token
try:
_trio_token = trio.lowlevel.current_trio_token()
except RuntimeError:
# not in a `trio.run` — leave None; runtime can
# re-call `enable_stack_on_sig()` later from
# inside `async_main` to capture it.
_trio_token = None
handler: Callable|int = getsignal(sig) handler: Callable|int = getsignal(sig)
if handler is dump_tree_on_sig: if handler is dump_tree_on_sig:
log.devx( log.devx(
'A `SIGUSR1` handler already exists?\n' 'A `SIGUSR1` handler already exists?\n'
f'|_ {handler!r}\n' f'|_ {handler!r}\n'
f'(trio_token captured: {_trio_token is not None})\n'
) )
return return
@ -287,5 +346,6 @@ def enable_stack_on_sig(
f'{stackscope!r}\n\n' f'{stackscope!r}\n\n'
f'With `SIGUSR1` handler\n' f'With `SIGUSR1` handler\n'
f'|_{dump_tree_on_sig}\n' f'|_{dump_tree_on_sig}\n'
f'(trio_token captured: {_trio_token is not None})\n'
) )
return stackscope return stackscope