Fix `SIGUSR1` tree-dump ordering in `_stackscope`
Factor the sub-actor relay loop out of `dump_tree_on_sig()` into `_relay_sig_to_subactors()` and chain both dump + relay in a single `run_sync_soon` callback (`_dump_then_relay`) so the parent's task-tree flushes BEFORE any sub receives the signal — fixes a hierarchical-ordering race where subs could dump ahead of the parent in the muxed pty stream. Also, - gate file/tty sink writes behind `write_file` + `write_tty` params on `dump_task_tree()`. - use `actor.aid.uid` instead of deprecated `.uid`. - update `test_shield_pause` expects to match the new sequential parent -> relay-log -> sub ordering. (this commit msg was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-codesubint_forkserver_backend
parent
61d4525137
commit
e2b790a70d
|
|
@ -49,9 +49,11 @@ async def main(
|
|||
tractor.open_nursery(
|
||||
debug_mode=True,
|
||||
enable_stack_on_sig=True,
|
||||
# maybe_enable_greenback=False,
|
||||
loglevel='devx',
|
||||
loglevel='devx', # XXX REQUIRED log level!
|
||||
enable_transports=[tpt],
|
||||
# maybe_enable_greenback=True,
|
||||
# ^TODO? maybe a "smarter" way todo all this is how
|
||||
# `modden` does with a rtv serialized through the osenv?
|
||||
) as an,
|
||||
):
|
||||
ptl: tractor.Portal = await an.start_actor(
|
||||
|
|
@ -63,7 +65,9 @@ async def main(
|
|||
start_n_shield_hang,
|
||||
) as (ctx, cpid):
|
||||
|
||||
_, proc, _ = an._children[ptl.chan.uid]
|
||||
_, proc, _ = an._children[
|
||||
ptl.chan.aid.uid
|
||||
]
|
||||
assert cpid == proc.pid
|
||||
|
||||
print(
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@ import os
|
|||
import signal
|
||||
import time
|
||||
from typing import (
|
||||
Callable,
|
||||
TYPE_CHECKING,
|
||||
)
|
||||
|
||||
|
|
@ -47,7 +48,10 @@ if TYPE_CHECKING:
|
|||
|
||||
@no_macos
|
||||
def test_shield_pause(
|
||||
spawn: PexpectSpawner,
|
||||
spawn: Callable[
|
||||
...,
|
||||
PexpectSpawner,
|
||||
],
|
||||
):
|
||||
'''
|
||||
Verify the `tractor.pause()/.post_mortem()` API works inside an
|
||||
|
|
@ -55,8 +59,10 @@ def test_shield_pause(
|
|||
next checkpoint wherein the cancelled will get raised.
|
||||
|
||||
'''
|
||||
child = spawn(
|
||||
'shield_hang_in_sub'
|
||||
child: PexpectSpawner = spawn(
|
||||
'shield_hang_in_sub',
|
||||
loglevel='devx',
|
||||
# ^XXX REQUIRED for below patt matching!
|
||||
)
|
||||
expect(
|
||||
child,
|
||||
|
|
@ -86,31 +92,54 @@ def test_shield_pause(
|
|||
# end-of-tree delimiter
|
||||
"end-of-\('root'",
|
||||
)
|
||||
assert_before(
|
||||
_before: str = assert_before(
|
||||
child,
|
||||
[
|
||||
# 'Srying to dump `stackscope` tree..',
|
||||
# 'Dumping `stackscope` tree for actor',
|
||||
"('root'", # uid line
|
||||
|
||||
# TODO!? this used to show?
|
||||
# TODO!? this in-task-code used to show??
|
||||
# -[ ] mk reproducable for @oremanj?
|
||||
# => SOLVED? by our `trio_token.run_sync_soon()`
|
||||
# approach?
|
||||
#
|
||||
# parent block point (non-shielded)
|
||||
# 'await trio.sleep_forever() # in root',
|
||||
]
|
||||
)
|
||||
|
||||
# NOTE, hierarchical-ordering invariant restored by
|
||||
# `_dump_then_relay` (co-scheduled dump+relay on the
|
||||
# trio loop, see `tractor.devx._stackscope`): the
|
||||
# parent's full task-tree prints BEFORE the 'Relaying
|
||||
# `SIGUSR1`' log msg, which prints BEFORE any sub-
|
||||
# actor receives the signal and dumps its own tree.
|
||||
# So the relay log appears BETWEEN `end-of-('root'`
|
||||
# (above) and `end-of-('hanger'` (below).
|
||||
handle_out_of_order: bool = False
|
||||
|
||||
if (
|
||||
handle_out_of_order
|
||||
and
|
||||
"end-of-('hanger'" in _before
|
||||
):
|
||||
assert "('hanger'" in _before
|
||||
assert 'Relaying `SIGUSR1`[10] to sub-actor' in _before
|
||||
|
||||
else:
|
||||
expect(
|
||||
child,
|
||||
# end-of-tree delimiter
|
||||
'Relaying `SIGUSR1`\\[10\\] to sub-actor',
|
||||
)
|
||||
expect(
|
||||
child,
|
||||
# end-of-subactor's-tree delimiter
|
||||
"end-of-\('hanger'",
|
||||
)
|
||||
assert_before(
|
||||
_before: str = assert_before(
|
||||
child,
|
||||
[
|
||||
# relay to the sub should be reported
|
||||
'Relaying `SIGUSR1`[10] to sub-actor',
|
||||
|
||||
"('hanger'", # uid line
|
||||
|
||||
# TODO!? SEE ABOVE
|
||||
|
|
@ -119,6 +148,7 @@ def test_shield_pause(
|
|||
]
|
||||
)
|
||||
|
||||
|
||||
# simulate the user sending a ctl-c to the hanging program.
|
||||
# this should result in the terminator kicking in since
|
||||
# the sub is shield blocking and can't respond to SIGINT.
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@ disjoint, parallel executing tasks in separate actors.
|
|||
|
||||
'''
|
||||
from __future__ import annotations
|
||||
# from functools import partial
|
||||
from functools import partial
|
||||
from threading import (
|
||||
current_thread,
|
||||
Thread,
|
||||
|
|
@ -63,7 +63,10 @@ if TYPE_CHECKING:
|
|||
|
||||
|
||||
@trio.lowlevel.disable_ki_protection
|
||||
def dump_task_tree() -> None:
|
||||
def dump_task_tree(
|
||||
write_file: bool = False,
|
||||
write_tty: bool = False,
|
||||
) -> None:
|
||||
'''
|
||||
Do a classic `stackscope.extract()` task-tree dump to console at
|
||||
`.devx()` level.
|
||||
|
|
@ -121,7 +124,7 @@ def dump_task_tree() -> None:
|
|||
)
|
||||
full_dump: str = (
|
||||
f'Dumping `stackscope` tree for actor\n'
|
||||
f'(>: {actor.uid!r}\n'
|
||||
f'(>: {actor.aid.uid!r}\n'
|
||||
f' |_{mp.current_process()}\n'
|
||||
f' |_{thr}\n'
|
||||
# TODO, use the nest_from_op
|
||||
|
|
@ -134,11 +137,11 @@ def dump_task_tree() -> None:
|
|||
f'capture-bypass tee: {fpath}\n'
|
||||
f'(`tail -f {fpath}` to follow across signals)\n'
|
||||
f'\n'
|
||||
f'------ start-of-{actor.uid!r} ------\n'
|
||||
f'------ start-of-{actor.aid.uid!r} ------\n'
|
||||
f'|\n'
|
||||
f'{tree_str}'
|
||||
f'|\n'
|
||||
f'|_____ end-of-{actor.uid!r} ______\n'
|
||||
f'|_____ end-of-{actor.aid.uid!r} ______\n'
|
||||
)
|
||||
log.devx(full_dump)
|
||||
|
||||
|
|
@ -146,6 +149,7 @@ def dump_task_tree() -> None:
|
|||
# `--capture=fd` swallows `log.devx()` above; the
|
||||
# following two writes guarantee the dump reaches the
|
||||
# human even when stdio is captured.
|
||||
if write_file:
|
||||
try:
|
||||
with open(fpath, 'a') as f:
|
||||
f.write(full_dump + '\n')
|
||||
|
|
@ -154,6 +158,7 @@ def dump_task_tree() -> None:
|
|||
f'Failed to tee stackscope dump to {fpath!r}'
|
||||
)
|
||||
|
||||
if write_tty:
|
||||
try:
|
||||
with open('/dev/tty', 'w') as tty:
|
||||
tty.write(full_dump + '\n')
|
||||
|
|
@ -167,7 +172,7 @@ _tree_dumped: bool = False
|
|||
|
||||
# Captured at `enable_stack_on_sig()` time when running
|
||||
# inside a trio task. `dump_tree_on_sig` uses this to
|
||||
# schedule `dump_task_tree` ON the trio loop via
|
||||
# schedule `dump_task_tree()` ON the trio loop via
|
||||
# `token.run_sync_soon` so stackscope sees a real current
|
||||
# task and can recurse into nursery children. Without
|
||||
# it (signal handler running in a non-trio stack frame),
|
||||
|
|
@ -176,13 +181,70 @@ _tree_dumped: bool = False
|
|||
_trio_token: trio.lowlevel.TrioToken|None = None
|
||||
|
||||
|
||||
def _safe_dump_task_tree() -> None:
|
||||
def _relay_sig_to_subactors(sig: int) -> None:
|
||||
'''
|
||||
`run_sync_soon`-friendly wrapper that swallows any
|
||||
exception from `dump_task_tree`. Trio prints
|
||||
+ crashes on uncaught exceptions in scheduled
|
||||
callbacks; we'd rather log + keep the test running so
|
||||
the user can re-trigger the dump.
|
||||
Forward `sig` to every live sub-actor's underlying
|
||||
process so each runs its own `dump_tree_on_sig`
|
||||
handler.
|
||||
|
||||
Factored out of `dump_tree_on_sig` so the
|
||||
`run_sync_soon`-deferred path can call it AFTER
|
||||
the parent's `dump_task_tree()` completes — see
|
||||
`_dump_then_relay` below for why ordering matters.
|
||||
|
||||
'''
|
||||
an: ActorNursery
|
||||
for an in _state.current_actor()._actoruid2nursery.values():
|
||||
subproc: ProcessType
|
||||
subactor: Actor
|
||||
for (
|
||||
subactor,
|
||||
subproc,
|
||||
_,
|
||||
) in an._children.values():
|
||||
log.warning(
|
||||
f'Relaying `SIGUSR1`[{sig}] to sub-actor\n'
|
||||
f'{subactor}\n'
|
||||
f' |_{subproc}\n'
|
||||
)
|
||||
# bc of course stdlib can't have a std API.. XD
|
||||
match subproc:
|
||||
case trio.Process():
|
||||
subproc.send_signal(sig)
|
||||
|
||||
case mp.Process():
|
||||
subproc._send_signal(sig)
|
||||
|
||||
|
||||
def _dump_then_relay(
|
||||
sig: int|None,
|
||||
) -> None:
|
||||
'''
|
||||
`run_sync_soon`-friendly callback: dump THIS actor's
|
||||
task tree first, THEN relay `sig` to subactors so
|
||||
their dumps can't race ahead of ours.
|
||||
|
||||
Hierarchical-ordering preservation: the legacy
|
||||
direct-call path (pre-`run_sync_soon`) ran the dump
|
||||
synchronously inside the signal handler, then
|
||||
relayed — guaranteeing parent-output-before-child
|
||||
in the multiplexed pty stream. The pure-deferred
|
||||
path (schedule dump only, relay sync from handler)
|
||||
inverts that: relay fires while the parent's
|
||||
dump is still queued, subs receive SIGUSR1 and
|
||||
schedule their own dumps, all dumps then race in
|
||||
arbitrary order through stdio.
|
||||
|
||||
Co-scheduling fixes that: by chaining relay AFTER
|
||||
`dump_task_tree()` inside the same trio-loop
|
||||
callback, parent output flushes before any sub
|
||||
receives the signal, restoring the
|
||||
parent → relay-log → sub-dump ordering humans
|
||||
expect when reading hang-investigation traces.
|
||||
|
||||
Trio prints + crashes on uncaught exceptions in
|
||||
scheduled callbacks; we swallow + log so the test
|
||||
keeps running and the user can re-trigger.
|
||||
|
||||
'''
|
||||
try:
|
||||
|
|
@ -193,6 +255,17 @@ def _safe_dump_task_tree() -> None:
|
|||
'`run_sync_soon`); continuing.\n'
|
||||
)
|
||||
|
||||
if sig is None:
|
||||
return
|
||||
|
||||
try:
|
||||
_relay_sig_to_subactors(sig)
|
||||
except BaseException:
|
||||
log.exception(
|
||||
f'`_relay_sig_to_subactors({sig})` raised '
|
||||
f'(scheduled via `run_sync_soon`); continuing.\n'
|
||||
)
|
||||
|
||||
|
||||
def dump_tree_on_sig(
|
||||
sig: int,
|
||||
|
|
@ -223,8 +296,23 @@ def dump_tree_on_sig(
|
|||
# only the `<init>` task. Falls back to a direct
|
||||
# call when no token was captured (e.g. signal
|
||||
# delivered outside a trio.run).
|
||||
#
|
||||
# Co-schedule the relay-to-subs in the SAME
|
||||
# callback so parent's dump prints BEFORE any
|
||||
# sub receives SIGUSR1 — see `_dump_then_relay`
|
||||
# for the full hierarchical-ordering rationale.
|
||||
if _trio_token is not None:
|
||||
_trio_token.run_sync_soon(_safe_dump_task_tree)
|
||||
_trio_token.run_sync_soon(
|
||||
partial(
|
||||
_dump_then_relay,
|
||||
sig=sig if relay_to_subs else None,
|
||||
)
|
||||
)
|
||||
# NOTE, `_dump_then_relay` handles the relay
|
||||
# internally; bail out before the
|
||||
# direct-path relay below.
|
||||
return
|
||||
|
||||
else:
|
||||
dump_task_tree()
|
||||
|
||||
|
|
@ -246,27 +334,15 @@ def dump_tree_on_sig(
|
|||
# 'Supposedly we dumped just fine..?'
|
||||
# )
|
||||
|
||||
# Direct-path relay (only reached when `_trio_token`
|
||||
# was None — the run_sync_soon path returned above
|
||||
# to let `_dump_then_relay` handle the relay
|
||||
# in-callback).
|
||||
if not relay_to_subs:
|
||||
log.devx(f'Skipping {sig!r} relay to subactors..')
|
||||
return
|
||||
|
||||
an: ActorNursery
|
||||
for an in _state.current_actor()._actoruid2nursery.values():
|
||||
subproc: ProcessType
|
||||
subactor: Actor
|
||||
for subactor, subproc, _ in an._children.values():
|
||||
log.warning(
|
||||
f'Relaying `SIGUSR1`[{sig}] to sub-actor\n'
|
||||
f'{subactor}\n'
|
||||
f' |_{subproc}\n'
|
||||
)
|
||||
|
||||
# bc of course stdlib can't have a std API.. XD
|
||||
match subproc:
|
||||
case trio.Process():
|
||||
subproc.send_signal(sig)
|
||||
|
||||
case mp.Process():
|
||||
subproc._send_signal(sig)
|
||||
_relay_sig_to_subactors(sig)
|
||||
|
||||
|
||||
def enable_stack_on_sig(
|
||||
|
|
|
|||
Loading…
Reference in New Issue