Fix `SIGUSR1` tree-dump ordering in `_stackscope`

Factor the sub-actor relay loop out of
`dump_tree_on_sig()` into `_relay_sig_to_subactors()`
and chain both dump + relay in a single
`run_sync_soon` callback (`_dump_then_relay`) so the
parent's task-tree flushes BEFORE any sub receives
the signal — fixes a hierarchical-ordering race
where subs could dump ahead of the parent in the
muxed pty stream.

Also,
- gate file/tty sink writes behind `write_file` +
  `write_tty` params on `dump_task_tree()`.
- use `actor.aid.uid` instead of deprecated `.uid`.
- update `test_shield_pause` expects to match the
  new sequential parent -> relay-log -> sub ordering.

(this commit msg was generated in part by [`claude-code`][claude-code-gh])
[claude-code-gh]: https://github.com/anthropics/claude-code
subint_forkserver_backend
Gud Boi 2026-04-30 19:35:55 -04:00
parent 61d4525137
commit e2b790a70d
3 changed files with 179 additions and 69 deletions

View File

@ -49,9 +49,11 @@ async def main(
tractor.open_nursery(
debug_mode=True,
enable_stack_on_sig=True,
# maybe_enable_greenback=False,
loglevel='devx',
loglevel='devx', # XXX REQUIRED log level!
enable_transports=[tpt],
# maybe_enable_greenback=True,
# ^TODO? maybe a "smarter" way todo all this is how
# `modden` does with a rtv serialized through the osenv?
) as an,
):
ptl: tractor.Portal = await an.start_actor(
@ -63,7 +65,9 @@ async def main(
start_n_shield_hang,
) as (ctx, cpid):
_, proc, _ = an._children[ptl.chan.uid]
_, proc, _ = an._children[
ptl.chan.aid.uid
]
assert cpid == proc.pid
print(

View File

@ -21,6 +21,7 @@ import os
import signal
import time
from typing import (
Callable,
TYPE_CHECKING,
)
@ -47,7 +48,10 @@ if TYPE_CHECKING:
@no_macos
def test_shield_pause(
spawn: PexpectSpawner,
spawn: Callable[
...,
PexpectSpawner,
],
):
'''
Verify the `tractor.pause()/.post_mortem()` API works inside an
@ -55,8 +59,10 @@ def test_shield_pause(
next checkpoint wherein the cancelled will get raised.
'''
child = spawn(
'shield_hang_in_sub'
child: PexpectSpawner = spawn(
'shield_hang_in_sub',
loglevel='devx',
# ^XXX REQUIRED for below patt matching!
)
expect(
child,
@ -86,31 +92,54 @@ def test_shield_pause(
# end-of-tree delimiter
"end-of-\('root'",
)
assert_before(
_before: str = assert_before(
child,
[
# 'Srying to dump `stackscope` tree..',
# 'Dumping `stackscope` tree for actor',
"('root'", # uid line
# TODO!? this used to show?
# TODO!? this in-task-code used to show??
# -[ ] mk reproducable for @oremanj?
# => SOLVED? by our `trio_token.run_sync_soon()`
# approach?
#
# parent block point (non-shielded)
# 'await trio.sleep_forever() # in root',
]
)
# NOTE, hierarchical-ordering invariant restored by
# `_dump_then_relay` (co-scheduled dump+relay on the
# trio loop, see `tractor.devx._stackscope`): the
# parent's full task-tree prints BEFORE the 'Relaying
# `SIGUSR1`' log msg, which prints BEFORE any sub-
# actor receives the signal and dumps its own tree.
# So the relay log appears BETWEEN `end-of-('root'`
# (above) and `end-of-('hanger'` (below).
handle_out_of_order: bool = False
if (
handle_out_of_order
and
"end-of-('hanger'" in _before
):
assert "('hanger'" in _before
assert 'Relaying `SIGUSR1`[10] to sub-actor' in _before
else:
expect(
child,
# end-of-tree delimiter
'Relaying `SIGUSR1`\\[10\\] to sub-actor',
)
expect(
child,
# end-of-subactor's-tree delimiter
"end-of-\('hanger'",
)
assert_before(
_before: str = assert_before(
child,
[
# relay to the sub should be reported
'Relaying `SIGUSR1`[10] to sub-actor',
"('hanger'", # uid line
# TODO!? SEE ABOVE
@ -119,6 +148,7 @@ def test_shield_pause(
]
)
# simulate the user sending a ctl-c to the hanging program.
# this should result in the terminator kicking in since
# the sub is shield blocking and can't respond to SIGINT.

View File

@ -24,7 +24,7 @@ disjoint, parallel executing tasks in separate actors.
'''
from __future__ import annotations
# from functools import partial
from functools import partial
from threading import (
current_thread,
Thread,
@ -63,7 +63,10 @@ if TYPE_CHECKING:
@trio.lowlevel.disable_ki_protection
def dump_task_tree() -> None:
def dump_task_tree(
write_file: bool = False,
write_tty: bool = False,
) -> None:
'''
Do a classic `stackscope.extract()` task-tree dump to console at
`.devx()` level.
@ -121,7 +124,7 @@ def dump_task_tree() -> None:
)
full_dump: str = (
f'Dumping `stackscope` tree for actor\n'
f'(>: {actor.uid!r}\n'
f'(>: {actor.aid.uid!r}\n'
f' |_{mp.current_process()}\n'
f' |_{thr}\n'
# TODO, use the nest_from_op
@ -134,11 +137,11 @@ def dump_task_tree() -> None:
f'capture-bypass tee: {fpath}\n'
f'(`tail -f {fpath}` to follow across signals)\n'
f'\n'
f'------ start-of-{actor.uid!r} ------\n'
f'------ start-of-{actor.aid.uid!r} ------\n'
f'|\n'
f'{tree_str}'
f'|\n'
f'|_____ end-of-{actor.uid!r} ______\n'
f'|_____ end-of-{actor.aid.uid!r} ______\n'
)
log.devx(full_dump)
@ -146,6 +149,7 @@ def dump_task_tree() -> None:
# `--capture=fd` swallows `log.devx()` above; the
# following two writes guarantee the dump reaches the
# human even when stdio is captured.
if write_file:
try:
with open(fpath, 'a') as f:
f.write(full_dump + '\n')
@ -154,6 +158,7 @@ def dump_task_tree() -> None:
f'Failed to tee stackscope dump to {fpath!r}'
)
if write_tty:
try:
with open('/dev/tty', 'w') as tty:
tty.write(full_dump + '\n')
@ -167,7 +172,7 @@ _tree_dumped: bool = False
# Captured at `enable_stack_on_sig()` time when running
# inside a trio task. `dump_tree_on_sig` uses this to
# schedule `dump_task_tree` ON the trio loop via
# schedule `dump_task_tree()` ON the trio loop via
# `token.run_sync_soon` so stackscope sees a real current
# task and can recurse into nursery children. Without
# it (signal handler running in a non-trio stack frame),
@ -176,13 +181,70 @@ _tree_dumped: bool = False
_trio_token: trio.lowlevel.TrioToken|None = None
def _safe_dump_task_tree() -> None:
def _relay_sig_to_subactors(sig: int) -> None:
    '''
    Forward `sig` to every live sub-actor's underlying
    process so each runs its own `dump_tree_on_sig`
    handler.

    Factored out of `dump_tree_on_sig` so the
    `run_sync_soon`-deferred path can call it AFTER
    the parent's `dump_task_tree()` completes — see
    `_dump_then_relay` below for why ordering matters.

    '''
    # Walk every actor-nursery this (parent) actor has opened and
    # signal each still-tracked child's process.
    an: ActorNursery
    for an in _state.current_actor()._actoruid2nursery.values():
        subproc: ProcessType
        subactor: Actor
        for (
            subactor,
            subproc,
            _,  # third child-entry slot unused here
        ) in an._children.values():
            # Log BEFORE sending so the relay msg lands between the
            # parent's and the sub's dumps in the muxed output stream.
            log.warning(
                f'Relaying `SIGUSR1`[{sig}] to sub-actor\n'
                f'{subactor}\n'
                f' |_{subproc}\n'
            )
            # bc of course stdlib can't have a std API.. XD
            # `trio.Process` exposes public `.send_signal()`;
            # the `mp` (multiprocessing-backed) proc type only has a
            # private `._send_signal()` — NOTE(review): presumably
            # provided by the stored `ProcessType`; confirm, since
            # stdlib `multiprocessing.Process` itself lacks it.
            match subproc:
                case trio.Process():
                    subproc.send_signal(sig)
                case mp.Process():
                    subproc._send_signal(sig)
def _dump_then_relay(
sig: int|None,
) -> None:
'''
`run_sync_soon`-friendly callback: dump THIS actor's
task tree first, THEN relay `sig` to subactors so
their dumps can't race ahead of ours.
Hierarchical-ordering preservation: the legacy
direct-call path (pre-`run_sync_soon`) ran the dump
synchronously inside the signal handler, then
relayed — guaranteeing parent-output-before-child
in the multiplexed pty stream. The pure-deferred
path (schedule dump only, relay sync from handler)
inverts that: relay fires while the parent's
dump is still queued, subs receive SIGUSR1 and
schedule their own dumps, all dumps then race in
arbitrary order through stdio.
Co-scheduling fixes that: by chaining relay AFTER
`dump_task_tree()` inside the same trio-loop
callback, parent output flushes before any sub
receives the signal, restoring the
parent → relay-log → sub-dump ordering humans
expect when reading hang-investigation traces.
Trio prints + crashes on uncaught exceptions in
scheduled callbacks; we swallow + log so the test
keeps running and the user can re-trigger.
'''
try:
@ -193,6 +255,17 @@ def _safe_dump_task_tree() -> None:
'`run_sync_soon`); continuing.\n'
)
if sig is None:
return
try:
_relay_sig_to_subactors(sig)
except BaseException:
log.exception(
f'`_relay_sig_to_subactors({sig})` raised '
f'(scheduled via `run_sync_soon`); continuing.\n'
)
def dump_tree_on_sig(
sig: int,
@ -223,8 +296,23 @@ def dump_tree_on_sig(
# only the `<init>` task. Falls back to a direct
# call when no token was captured (e.g. signal
# delivered outside a trio.run).
#
# Co-schedule the relay-to-subs in the SAME
# callback so parent's dump prints BEFORE any
# sub receives SIGUSR1 — see `_dump_then_relay`
# for the full hierarchical-ordering rationale.
if _trio_token is not None:
_trio_token.run_sync_soon(_safe_dump_task_tree)
_trio_token.run_sync_soon(
partial(
_dump_then_relay,
sig=sig if relay_to_subs else None,
)
)
# NOTE, `_dump_then_relay` handles the relay
# internally; bail out before the
# direct-path relay below.
return
else:
dump_task_tree()
@ -246,27 +334,15 @@ def dump_tree_on_sig(
# 'Supposedly we dumped just fine..?'
# )
# Direct-path relay (only reached when `_trio_token`
# was None — the run_sync_soon path returned above
# to let `_dump_then_relay` handle the relay
# in-callback).
if not relay_to_subs:
log.devx(f'Skipping {sig!r} relay to subactors..')
return
an: ActorNursery
for an in _state.current_actor()._actoruid2nursery.values():
subproc: ProcessType
subactor: Actor
for subactor, subproc, _ in an._children.values():
log.warning(
f'Relaying `SIGUSR1`[{sig}] to sub-actor\n'
f'{subactor}\n'
f' |_{subproc}\n'
)
# bc of course stdlib can't have a std API.. XD
match subproc:
case trio.Process():
subproc.send_signal(sig)
case mp.Process():
subproc._send_signal(sig)
_relay_sig_to_subactors(sig)
def enable_stack_on_sig(