Fix `SIGUSR1` tree-dump ordering in `_stackscope`

Factor the sub-actor relay loop out of
`dump_tree_on_sig()` into `_relay_sig_to_subactors()`
and chain both dump + relay in a single
`run_sync_soon` callback (`_dump_then_relay`) so the
parent's task-tree flushes BEFORE any sub receives
the signal. This fixes a hierarchical-ordering race
where subs could dump ahead of the parent in the
muxed pty stream.
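
In sketch form the chained deferral looks like the
following (a minimal standalone program with stand-in
prints; only `trio`'s documented
`TrioToken.run_sync_soon` API is assumed, none of the
actual `tractor` internals):

```python
import signal
import trio

_token: trio.lowlevel.TrioToken | None = None

def _dump_then_relay(sig: int) -> None:
    # Runs ON the trio loop: the parent's dump completes (and
    # flushes) before any child ever receives the signal.
    print('<parent task-tree dump>')  # stand-in for dump_task_tree()
    print(f'<relaying sig {sig} to sub-procs>')  # stand-in for relay loop

def _on_sigusr1(signum, frame) -> None:
    # Signal handlers run outside any trio task; defer the
    # CHAINED dump+relay instead of relaying right here, which
    # would let child dumps race ahead of the parent's.
    if _token is not None:
        _token.run_sync_soon(_dump_then_relay, signum)

async def main() -> None:
    global _token
    _token = trio.lowlevel.current_trio_token()
    signal.signal(signal.SIGUSR1, _on_sigusr1)
    await trio.sleep_forever()

if __name__ == '__main__':
    trio.run(main)  # then: kill -USR1 <pid>
```

Because both steps live in one scheduled callback, no
other callback (in particular no sub-actor output) can
interleave between them.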

Also,
- gate file/tty sink writes behind new `write_file` +
  `write_tty` params on `dump_task_tree()` (usage
  sketched below).
- use `actor.aid.uid` instead of the deprecated
  `actor.uid`.
- update `test_shield_pause`'s `expect()` patterns to
  match the new sequential parent -> relay-log -> sub
  ordering.
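
The sink gating reduces to the following shape (a
self-contained approximation; `fpath` and the `log`
wiring here are stand-ins for the real module state
visible in the diff below):

```python
import logging

log = logging.getLogger('stackscope-sketch')

def dump_task_tree(
    write_file: bool = False,
    write_tty: bool = False,
) -> None:
    full_dump: str = '<stackscope tree render>'  # real impl: stackscope.extract()
    fpath: str = '/tmp/stackscope-tree.log'      # hypothetical tee path
    log.debug(full_dump)  # the real code logs at `.devx()` level
    # Both sinks are opt-in: pytest's `--capture=fd` swallows the
    # log call, so callers decide where the dump must still land.
    if write_file:
        with open(fpath, 'a') as f:
            f.write(full_dump + '\n')
    if write_tty:
        with open('/dev/tty', 'w') as tty:
            tty.write(full_dump + '\n')
```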

(this commit msg was generated in part by [`claude-code`][claude-code-gh])
[claude-code-gh]: https://github.com/anthropics/claude-code
subint_forkserver_backend
Gud Boi 2026-04-30 19:35:55 -04:00
parent 61d4525137
commit e2b790a70d
3 changed files with 179 additions and 69 deletions

View File

@@ -49,9 +49,11 @@ async def main(
         tractor.open_nursery(
             debug_mode=True,
             enable_stack_on_sig=True,
-            # maybe_enable_greenback=False,
-            loglevel='devx',
+            loglevel='devx',  # XXX REQUIRED log level!
             enable_transports=[tpt],
+            # maybe_enable_greenback=True,
+            # ^TODO? maybe a "smarter" way todo all this is how
+            # `modden` does with a rtv serialized through the osenv?
         ) as an,
     ):
         ptl: tractor.Portal = await an.start_actor(
@@ -63,7 +65,9 @@ async def main(
                 start_n_shield_hang,
             ) as (ctx, cpid):

-                _, proc, _ = an._children[ptl.chan.uid]
+                _, proc, _ = an._children[
+                    ptl.chan.aid.uid
+                ]
                 assert cpid == proc.pid
                 print(

View File

@@ -21,6 +21,7 @@ import os
 import signal
 import time
 from typing import (
+    Callable,
     TYPE_CHECKING,
 )
@@ -47,7 +48,10 @@ if TYPE_CHECKING:

 @no_macos
 def test_shield_pause(
-    spawn: PexpectSpawner,
+    spawn: Callable[
+        ...,
+        PexpectSpawner,
+    ],
 ):
     '''
     Verify the `tractor.pause()/.post_mortem()` API works inside an
@@ -55,8 +59,10 @@ def test_shield_pause(
     next checkpoint wherein the cancelled will get raised.

     '''
-    child = spawn(
-        'shield_hang_in_sub'
+    child: PexpectSpawner = spawn(
+        'shield_hang_in_sub',
+        loglevel='devx',
+        # ^XXX REQUIRED for below patt matching!
     )
     expect(
         child,
@@ -86,31 +92,54 @@ def test_shield_pause(
         # end-of-tree delimiter
         "end-of-\('root'",
     )
-    assert_before(
+    _before: str = assert_before(
         child,
         [
             # 'Srying to dump `stackscope` tree..',
             # 'Dumping `stackscope` tree for actor',
             "('root'",  # uid line

-            # TODO!? this used to show?
+            # TODO!? this in-task-code used to show??
             # -[ ] mk reproducable for @oremanj?
+            # => SOLVED? by our `trio_token.run_sync_soon()`
+            #    approach?
             #
             # parent block point (non-shielded)
             # 'await trio.sleep_forever()  # in root',
         ]
     )
-    expect(
-        child,
-        # end-of-tree delimiter
-        "end-of-\('hanger'",
-    )
-    assert_before(
-        child,
-        [
-            # relay to the sub should be reported
-            'Relaying `SIGUSR1`[10] to sub-actor',
-            "('hanger'",  # uid line
-            # TODO!? SEE ABOVE
+
+    # NOTE, hierarchical-ordering invariant restored by
+    # `_dump_then_relay` (co-scheduled dump+relay on the
+    # trio loop, see `tractor.devx._stackscope`): the
+    # parent's full task-tree prints BEFORE the 'Relaying
+    # `SIGUSR1`' log msg, which prints BEFORE any sub-
+    # actor receives the signal and dumps its own tree.
+    # So the relay log appears BETWEEN `end-of-('root'`
+    # (above) and `end-of-('hanger'` (below).
+    handle_out_of_order: bool = False
+    if (
+        handle_out_of_order
+        and
+        "end-of-('hanger'" in _before
+    ):
+        assert "('hanger'" in _before
+        assert 'Relaying `SIGUSR1`[10] to sub-actor' in _before
+    else:
+        expect(
+            child,
+            'Relaying `SIGUSR1`\\[10\\] to sub-actor',
+        )
+        expect(
+            child,
+            # end-of-subactor's-tree delimiter
+            "end-of-\('hanger'",
+        )
+        _before: str = assert_before(
+            child,
+            [
+                "('hanger'",  # uid line
+
+                # TODO!? SEE ABOVE
@@ -119,6 +148,7 @@ def test_shield_pause(
             ]
         )

     # simulate the user sending a ctl-c to the hanging program.
     # this should result in the terminator kicking in since
     # the sub is shield blocking and can't respond to SIGINT.
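
The ordering checks above lean on pexpect's sequential
matching; a generic standalone equivalent (hypothetical
spawn target, plain `pexpect` API only, not the test
suite's `spawn`/`expect`/`assert_before` fixtures)
would be:

```python
import pexpect

def expect_in_order(child: pexpect.spawn, patterns: list[str]) -> None:
    # Each `expect()` consumes output up to (and including) its
    # match, so a sequence of expects asserts strict ordering of
    # the parent-dump, relay-log and sub-dump markers.
    for patt in patterns:
        child.expect(patt)

# hypothetical target script; the real test uses its `spawn` fixture
# child = pexpect.spawn('python shield_hang_in_sub.py')
# expect_in_order(child, [
#     r"end-of-\('root'",
#     r'Relaying `SIGUSR1`\[10\] to sub-actor',
#     r"end-of-\('hanger'",
# ])
```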

View File

@@ -24,7 +24,7 @@ disjoint, parallel executing tasks in separate actors.

 '''
 from __future__ import annotations
-# from functools import partial
+from functools import partial
 from threading import (
     current_thread,
     Thread,
@@ -63,7 +63,10 @@ if TYPE_CHECKING:

 @trio.lowlevel.disable_ki_protection
-def dump_task_tree() -> None:
+def dump_task_tree(
+    write_file: bool = False,
+    write_tty: bool = False,
+) -> None:
     '''
     Do a classic `stackscope.extract()` task-tree dump to console at
     `.devx()` level.
@@ -121,7 +124,7 @@ def dump_task_tree() -> None:
     )
     full_dump: str = (
         f'Dumping `stackscope` tree for actor\n'
-        f'(>: {actor.uid!r}\n'
+        f'(>: {actor.aid.uid!r}\n'
         f' |_{mp.current_process()}\n'
         f' |_{thr}\n'
         # TODO, use the nest_from_op
@@ -134,11 +137,11 @@ def dump_task_tree() -> None:
         f'capture-bypass tee: {fpath}\n'
         f'(`tail -f {fpath}` to follow across signals)\n'
         f'\n'
-        f'------ start-of-{actor.uid!r} ------\n'
+        f'------ start-of-{actor.aid.uid!r} ------\n'
         f'|\n'
         f'{tree_str}'
         f'|\n'
-        f'|_____ end-of-{actor.uid!r} ______\n'
+        f'|_____ end-of-{actor.aid.uid!r} ______\n'
     )
     log.devx(full_dump)
@@ -146,6 +149,7 @@ def dump_task_tree() -> None:
     # `--capture=fd` swallows `log.devx()` above; the
     # following two writes guarantee the dump reaches the
     # human even when stdio is captured.
+    if write_file:
         try:
             with open(fpath, 'a') as f:
                 f.write(full_dump + '\n')
@@ -154,6 +158,7 @@ def dump_task_tree() -> None:
             f'Failed to tee stackscope dump to {fpath!r}'
         )

+    if write_tty:
         try:
             with open('/dev/tty', 'w') as tty:
                 tty.write(full_dump + '\n')
@@ -167,7 +172,7 @@ _tree_dumped: bool = False

 # Captured at `enable_stack_on_sig()` time when running
 # inside a trio task. `dump_tree_on_sig` uses this to
-# schedule `dump_task_tree` ON the trio loop via
+# schedule `dump_task_tree()` ON the trio loop via
 # `token.run_sync_soon` so stackscope sees a real current
 # task and can recurse into nursery children. Without
 # it (signal handler running in a non-trio stack frame),
@ -176,13 +181,70 @@ _tree_dumped: bool = False
_trio_token: trio.lowlevel.TrioToken|None = None _trio_token: trio.lowlevel.TrioToken|None = None
def _safe_dump_task_tree() -> None: def _relay_sig_to_subactors(sig: int) -> None:
''' '''
`run_sync_soon`-friendly wrapper that swallows any Forward `sig` to every live sub-actor's underlying
exception from `dump_task_tree`. Trio prints process so each runs its own `dump_tree_on_sig`
+ crashes on uncaught exceptions in scheduled handler.
callbacks; we'd rather log + keep the test running so
the user can re-trigger the dump. Factored out of `dump_tree_on_sig` so the
`run_sync_soon`-deferred path can call it AFTER
the parent's `dump_task_tree()` completes — see
`_dump_then_relay` below for why ordering matters.
'''
an: ActorNursery
for an in _state.current_actor()._actoruid2nursery.values():
subproc: ProcessType
subactor: Actor
for (
subactor,
subproc,
_,
) in an._children.values():
log.warning(
f'Relaying `SIGUSR1`[{sig}] to sub-actor\n'
f'{subactor}\n'
f' |_{subproc}\n'
)
# bc of course stdlib can't have a std API.. XD
match subproc:
case trio.Process():
subproc.send_signal(sig)
case mp.Process():
subproc._send_signal(sig)
def _dump_then_relay(
sig: int|None,
) -> None:
'''
`run_sync_soon`-friendly callback: dump THIS actor's
task tree first, THEN relay `sig` to subactors so
their dumps can't race ahead of ours.
Hierarchical-ordering preservation: the legacy
direct-call path (pre-`run_sync_soon`) ran the dump
synchronously inside the signal handler, then
relayed guaranteeing parent-output-before-child
in the multiplexed pty stream. The pure-deferred
path (schedule dump only, relay sync from handler)
inverts that: relay fires while the parent's
dump is still queued, subs receive SIGUSR1 and
schedule their own dumps, all dumps then race in
arbitrary order through stdio.
Co-scheduling fixes that: by chaining relay AFTER
`dump_task_tree()` inside the same trio-loop
callback, parent output flushes before any sub
receives the signal, restoring the
parent relay-log sub-dump ordering humans
expect when reading hang-investigation traces.
Trio prints + crashes on uncaught exceptions in
scheduled callbacks; we swallow + log so the test
keeps running and the user can re-trigger.
''' '''
try: try:
@@ -193,6 +255,17 @@ def _safe_dump_task_tree() -> None:
             '`run_sync_soon`); continuing.\n'
         )

+    if sig is None:
+        return
+
+    try:
+        _relay_sig_to_subactors(sig)
+    except BaseException:
+        log.exception(
+            f'`_relay_sig_to_subactors({sig})` raised '
+            f'(scheduled via `run_sync_soon`); continuing.\n'
+        )
+

 def dump_tree_on_sig(
     sig: int,
@@ -223,8 +296,23 @@ def dump_tree_on_sig(
     # only the `<init>` task. Falls back to a direct
     # call when no token was captured (e.g. signal
     # delivered outside a trio.run).
+    #
+    # Co-schedule the relay-to-subs in the SAME
+    # callback so parent's dump prints BEFORE any
+    # sub receives SIGUSR1 — see `_dump_then_relay`
+    # for the full hierarchical-ordering rationale.
     if _trio_token is not None:
-        _trio_token.run_sync_soon(_safe_dump_task_tree)
+        _trio_token.run_sync_soon(
+            partial(
+                _dump_then_relay,
+                sig=sig if relay_to_subs else None,
+            )
+        )
+        # NOTE, `_dump_then_relay` handles the relay
+        # internally; bail out before the
+        # direct-path relay below.
+        return
     else:
         dump_task_tree()
@@ -246,27 +334,15 @@ def dump_tree_on_sig(
     #     'Supposedly we dumped just fine..?'
     # )

+    # Direct-path relay (only reached when `_trio_token`
+    # was None — the run_sync_soon path returned above
+    # to let `_dump_then_relay` handle the relay
+    # in-callback).
     if not relay_to_subs:
+        log.devx(f'Skipping {sig!r} relay to subactors..')
         return

-    an: ActorNursery
-    for an in _state.current_actor()._actoruid2nursery.values():
-        subproc: ProcessType
-        subactor: Actor
-        for subactor, subproc, _ in an._children.values():
-            log.warning(
-                f'Relaying `SIGUSR1`[{sig}] to sub-actor\n'
-                f'{subactor}\n'
-                f' |_{subproc}\n'
-            )
-            # bc of course stdlib can't have a std API.. XD
-            match subproc:
-                case trio.Process():
-                    subproc.send_signal(sig)
-                case mp.Process():
-                    subproc._send_signal(sig)
+    _relay_sig_to_subactors(sig)


 def enable_stack_on_sig(
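
Finally, to poke a live tree by hand, the ROOT process
must be the one signaled since the relay now flows
top-down (a hedged sketch; assumes a POSIX host where
`SIGUSR1` is the `[10]` seen in the relay log, and a
tree started with `enable_stack_on_sig=True` as in the
example diff above):

```python
import os
import signal

def trigger_tree_dump(root_pid: int) -> None:
    # Signal the root actor's process only: its handler relays to
    # subs itself (via `_dump_then_relay`), preserving the
    # parent-before-children dump ordering; signaling subs
    # directly would bypass the parent's dump entirely.
    os.kill(root_pid, signal.SIGUSR1)
```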