diff --git a/examples/debugging/shield_hang_in_sub.py b/examples/debugging/shield_hang_in_sub.py index 280757ea..530f26db 100644 --- a/examples/debugging/shield_hang_in_sub.py +++ b/examples/debugging/shield_hang_in_sub.py @@ -49,9 +49,11 @@ async def main( tractor.open_nursery( debug_mode=True, enable_stack_on_sig=True, - # maybe_enable_greenback=False, - loglevel='devx', + loglevel='devx', # XXX REQUIRED log level! enable_transports=[tpt], + # maybe_enable_greenback=True, + # ^TODO? maybe a "smarter" way todo all this is how + # `modden` does with a rtv serialized through the osenv? ) as an, ): ptl: tractor.Portal = await an.start_actor( @@ -63,7 +65,9 @@ async def main( start_n_shield_hang, ) as (ctx, cpid): - _, proc, _ = an._children[ptl.chan.uid] + _, proc, _ = an._children[ + ptl.chan.aid.uid + ] assert cpid == proc.pid print( diff --git a/tests/devx/test_tooling.py b/tests/devx/test_tooling.py index 3678854a..d00bee4b 100644 --- a/tests/devx/test_tooling.py +++ b/tests/devx/test_tooling.py @@ -21,6 +21,7 @@ import os import signal import time from typing import ( + Callable, TYPE_CHECKING, ) @@ -47,7 +48,10 @@ if TYPE_CHECKING: @no_macos def test_shield_pause( - spawn: PexpectSpawner, + spawn: Callable[ + ..., + PexpectSpawner, + ], ): ''' Verify the `tractor.pause()/.post_mortem()` API works inside an @@ -55,8 +59,10 @@ def test_shield_pause( next checkpoint wherein the cancelled will get raised. ''' - child = spawn( - 'shield_hang_in_sub' + child: PexpectSpawner = spawn( + 'shield_hang_in_sub', + loglevel='devx', + # ^XXX REQUIRED for below patt matching! ) expect( child, @@ -86,38 +92,62 @@ def test_shield_pause( # end-of-tree delimiter "end-of-\('root'", ) - assert_before( + _before: str = assert_before( child, [ # 'Srying to dump `stackscope` tree..', # 'Dumping `stackscope` tree for actor', "('root'", # uid line - # TODO!? this used to show? + # TODO!? this in-task-code used to show?? # -[ ] mk reproducable for @oremanj? + # => SOLVED? by our `trio_token.run_sync_soon()` + # approach? # # parent block point (non-shielded) # 'await trio.sleep_forever() # in root', ] ) - expect( - child, - # end-of-tree delimiter - "end-of-\('hanger'", - ) - assert_before( - child, - [ - # relay to the sub should be reported - 'Relaying `SIGUSR1`[10] to sub-actor', - "('hanger'", # uid line + # NOTE, hierarchical-ordering invariant restored by + # `_dump_then_relay` (co-scheduled dump+relay on the + # trio loop, see `tractor.devx._stackscope`): the + # parent's full task-tree prints BEFORE the 'Relaying + # `SIGUSR1`' log msg, which prints BEFORE any sub- + # actor receives the signal and dumps its own tree. + # So the relay log appears BETWEEN `end-of-('root'` + # (above) and `end-of-('hanger'` (below). + handle_out_of_order: bool = False + + if ( + handle_out_of_order + and + "end-of-('hanger'" in _before + ): + assert "('hanger'" in _before + assert 'Relaying `SIGUSR1`[10] to sub-actor' in _before + + else: + expect( + child, + 'Relaying `SIGUSR1`\\[10\\] to sub-actor', + ) + expect( + child, + # end-of-subactor's-tree delimiter + "end-of-\('hanger'", + ) + _before: str = assert_before( + child, + [ + "('hanger'", # uid line + + # TODO!? SEE ABOVE + # hanger LOC where it's shield-halted + # 'await trio.sleep_forever() # in subactor', + ] + ) - # TODO!? SEE ABOVE - # hanger LOC where it's shield-halted - # 'await trio.sleep_forever() # in subactor', - ] - ) # simulate the user sending a ctl-c to the hanging program. # this should result in the terminator kicking in since diff --git a/tractor/devx/_stackscope.py b/tractor/devx/_stackscope.py index 85f57098..aab313eb 100644 --- a/tractor/devx/_stackscope.py +++ b/tractor/devx/_stackscope.py @@ -24,7 +24,7 @@ disjoint, parallel executing tasks in separate actors. ''' from __future__ import annotations -# from functools import partial +from functools import partial from threading import ( current_thread, Thread, @@ -63,7 +63,10 @@ if TYPE_CHECKING: @trio.lowlevel.disable_ki_protection -def dump_task_tree() -> None: +def dump_task_tree( + write_file: bool = False, + write_tty: bool = False, +) -> None: ''' Do a classic `stackscope.extract()` task-tree dump to console at `.devx()` level. @@ -121,7 +124,7 @@ def dump_task_tree() -> None: ) full_dump: str = ( f'Dumping `stackscope` tree for actor\n' - f'(>: {actor.uid!r}\n' + f'(>: {actor.aid.uid!r}\n' f' |_{mp.current_process()}\n' f' |_{thr}\n' # TODO, use the nest_from_op @@ -134,11 +137,11 @@ def dump_task_tree() -> None: f'capture-bypass tee: {fpath}\n' f'(`tail -f {fpath}` to follow across signals)\n' f'\n' - f'------ start-of-{actor.uid!r} ------\n' + f'------ start-of-{actor.aid.uid!r} ------\n' f'|\n' f'{tree_str}' f'|\n' - f'|_____ end-of-{actor.uid!r} ______\n' + f'|_____ end-of-{actor.aid.uid!r} ______\n' ) log.devx(full_dump) @@ -146,28 +149,30 @@ def dump_task_tree() -> None: # `--capture=fd` swallows `log.devx()` above; the # following two writes guarantee the dump reaches the # human even when stdio is captured. - try: - with open(fpath, 'a') as f: - f.write(full_dump + '\n') - except OSError: - log.exception( - f'Failed to tee stackscope dump to {fpath!r}' - ) + if write_file: + try: + with open(fpath, 'a') as f: + f.write(full_dump + '\n') + except OSError: + log.exception( + f'Failed to tee stackscope dump to {fpath!r}' + ) - try: - with open('/dev/tty', 'w') as tty: - tty.write(full_dump + '\n') - except OSError: - # no controlling tty (CI / nohup / detached) — - # silently fall through; the file sink covers it. - pass + if write_tty: + try: + with open('/dev/tty', 'w') as tty: + tty.write(full_dump + '\n') + except OSError: + # no controlling tty (CI / nohup / detached) — + # silently fall through; the file sink covers it. + pass _handler_lock = RLock() _tree_dumped: bool = False # Captured at `enable_stack_on_sig()` time when running # inside a trio task. `dump_tree_on_sig` uses this to -# schedule `dump_task_tree` ON the trio loop via +# schedule `dump_task_tree()` ON the trio loop via # `token.run_sync_soon` so stackscope sees a real current # task and can recurse into nursery children. Without # it (signal handler running in a non-trio stack frame), @@ -176,13 +181,70 @@ _tree_dumped: bool = False _trio_token: trio.lowlevel.TrioToken|None = None -def _safe_dump_task_tree() -> None: +def _relay_sig_to_subactors(sig: int) -> None: ''' - `run_sync_soon`-friendly wrapper that swallows any - exception from `dump_task_tree`. Trio prints - + crashes on uncaught exceptions in scheduled - callbacks; we'd rather log + keep the test running so - the user can re-trigger the dump. + Forward `sig` to every live sub-actor's underlying + process so each runs its own `dump_tree_on_sig` + handler. + + Factored out of `dump_tree_on_sig` so the + `run_sync_soon`-deferred path can call it AFTER + the parent's `dump_task_tree()` completes — see + `_dump_then_relay` below for why ordering matters. + + ''' + an: ActorNursery + for an in _state.current_actor()._actoruid2nursery.values(): + subproc: ProcessType + subactor: Actor + for ( + subactor, + subproc, + _, + ) in an._children.values(): + log.warning( + f'Relaying `SIGUSR1`[{sig}] to sub-actor\n' + f'{subactor}\n' + f' |_{subproc}\n' + ) + # bc of course stdlib can't have a std API.. XD + match subproc: + case trio.Process(): + subproc.send_signal(sig) + + case mp.Process(): + subproc._send_signal(sig) + + +def _dump_then_relay( + sig: int|None, +) -> None: + ''' + `run_sync_soon`-friendly callback: dump THIS actor's + task tree first, THEN relay `sig` to subactors so + their dumps can't race ahead of ours. + + Hierarchical-ordering preservation: the legacy + direct-call path (pre-`run_sync_soon`) ran the dump + synchronously inside the signal handler, then + relayed — guaranteeing parent-output-before-child + in the multiplexed pty stream. The pure-deferred + path (schedule dump only, relay sync from handler) + inverts that: relay fires while the parent's + dump is still queued, subs receive SIGUSR1 and + schedule their own dumps, all dumps then race in + arbitrary order through stdio. + + Co-scheduling fixes that: by chaining relay AFTER + `dump_task_tree()` inside the same trio-loop + callback, parent output flushes before any sub + receives the signal, restoring the + parent → relay-log → sub-dump ordering humans + expect when reading hang-investigation traces. + + Trio prints + crashes on uncaught exceptions in + scheduled callbacks; we swallow + log so the test + keeps running and the user can re-trigger. ''' try: @@ -193,6 +255,17 @@ def _safe_dump_task_tree() -> None: '`run_sync_soon`); continuing.\n' ) + if sig is None: + return + + try: + _relay_sig_to_subactors(sig) + except BaseException: + log.exception( + f'`_relay_sig_to_subactors({sig})` raised ' + f'(scheduled via `run_sync_soon`); continuing.\n' + ) + def dump_tree_on_sig( sig: int, @@ -223,8 +296,23 @@ def dump_tree_on_sig( # only the `` task. Falls back to a direct # call when no token was captured (e.g. signal # delivered outside a trio.run). + # + # Co-schedule the relay-to-subs in the SAME + # callback so parent's dump prints BEFORE any + # sub receives SIGUSR1 — see `_dump_then_relay` + # for the full hierarchical-ordering rationale. if _trio_token is not None: - _trio_token.run_sync_soon(_safe_dump_task_tree) + _trio_token.run_sync_soon( + partial( + _dump_then_relay, + sig=sig if relay_to_subs else None, + ) + ) + # NOTE, `_dump_then_relay` handles the relay + # internally; bail out before the + # direct-path relay below. + return + else: dump_task_tree() @@ -246,27 +334,15 @@ def dump_tree_on_sig( # 'Supposedly we dumped just fine..?' # ) + # Direct-path relay (only reached when `_trio_token` + # was None — the run_sync_soon path returned above + # to let `_dump_then_relay` handle the relay + # in-callback). if not relay_to_subs: + log.devx(f'Skipping {sig!r} relay to subactors..') return - an: ActorNursery - for an in _state.current_actor()._actoruid2nursery.values(): - subproc: ProcessType - subactor: Actor - for subactor, subproc, _ in an._children.values(): - log.warning( - f'Relaying `SIGUSR1`[{sig}] to sub-actor\n' - f'{subactor}\n' - f' |_{subproc}\n' - ) - - # bc of course stdlib can't have a std API.. XD - match subproc: - case trio.Process(): - subproc.send_signal(sig) - - case mp.Process(): - subproc._send_signal(sig) + _relay_sig_to_subactors(sig) def enable_stack_on_sig(