10 changed files with 741 additions and 1929 deletions
--- a/ai/conc-anal/cancel_cascade_too_slow_under_main_thread_forkserver_issue.md
+++ b/ai/conc-anal/cancel_cascade_too_slow_under_main_thread_forkserver_issue.md
@ -182,118 +182,6 @@ Target: 0 failures across 5 runs ⇒ ship. 1–2 failures
 still rotating ⇒ apply (C). Same test failing twice
 ⇒ escalate to (E).

-## Snapshot evidence (2026-05-13)
-
-After landing the `fail_after_w_trace` /
-`afk_alarm_w_trace` capture-on-timeout helpers
-(`tractor._testing.trace`), `test_nested_multierrors`
-on the `main_thread_forkserver` backend produces
-**reproducible diag snapshots** at
-`$XDG_CACHE_HOME/tractor/hung-dumps/test_nested_multierrors_start_method_main_thread_forkserver__<iso-ts>/`.
-
-### Reproduction
-
-```bash
-pytest \
-  -v --verbose --durations=10 \
-  --spawn-backend=main_thread_forkserver \
-  --tpt-proto=uds \
-  --capture=sys --show-capture=stderr -rxX \
-  tests/test_cancellation.py::test_nested_multierrors
-```
-
-The test is `xfail(strict=False)` for MTF — it RUNS
-each invocation so snapshots accumulate, but doesn't
-break `--lf` workflow.
-
-### Consistent shape across runs
-
-5+ snapshots taken back-to-back show the SAME pattern:
-
- **Timing:** ~10s wall-clock total. Inner
-  `fail_after_w_trace(10)` fires at exactly T=10s;
-  cascade's `nursery.__aexit__` takes ~0.6s more to
-  gather + propagate the resulting
-  `BaseExceptionGroup`. **Trio backend completes the
-  SAME test in <6s** — so the MTF cascade is ~2x
-  slower at minimum.
-
- **`BaseExceptionGroup` shape:** mixed
-  `[RemoteActorError, Cancelled]`. The first
-  subactor's natural error-propagation (`assert 0`
-  raised → `RemoteActorError` portal-result)
-  arrives before T=10s; the OTHER subactor's
-  portal-wait is still in flight at T=10s, gets
-  cancelled by `fail_after_w_trace`'s scope-cancel
-  → returns `Cancelled` instead.
-
- **Orphan-spawn skew:** snapshot's `orphans` bucket
-  (after the `_is_tractor_subactor` cgroup-slice
-  override fix) consistently shows 2-4 init-adopted
-  procs at `depth_3` and `depth_1` levels — these
-  are the leaves whose parent (`depth_2` spawner)
-  was killed mid-cascade but who hadn't yet seen
-  the cancel signal themselves.
-
- **UDS sock-leak:** 2-6 dead-orphan socks per run
-  (varies with cascade timing). The
-  `track_orphaned_uds_per_test` fixture reaps them
-  post-test → contamination is isolated per-invocation.
-
-### Capture mechanism
-
-`fail_after_w_trace` covers two firing paths:
-
-1. **`trio.TooSlowError`** raised at scope-exit
-   (body returned cleanly past deadline) — direct
-   `except` handler captures.
-
-2. **Scope-cancel + body raises non-`Cancelled` exc**
-   (e.g. `nursery.__aexit__` wraps timeout-induced
-   `Cancelled` into a `BaseExceptionGroup` that
-   escapes before `trio.fail_after`'s exit-check
-   could fire `TooSlowError`) — body-raise `except`
-   handler checks `scope.cancel_called` and
-   captures if True. This path catches the
-   `test_nested_multierrors` shape specifically (see
-   "BaseExceptionGroup shape" above).
-
-The snapshot dir contains:
- `trace.txt` — `ptree` + `hung_state` (kernel
-  `wchan`/`stack` + `py-spy dump --locals` when
-  sudo cached), with `include_strays=True`
-  surfacing any cross-test ghost subactor trees in
-  the `orphans` bucket.
- `bindspace.txt` — UDS bindspace classification
-  (live-active / orphaned-alive / orphaned-dead).
- `meta.json` — `{pid, label, captured_at, sudo_cached}`.
-
-The end-of-session `pytest_terminal_summary` hook
-in `tractor._testing.pytest` lists every snapshot
-dir from the run so you don't have to scroll back
-through captured-stderr lines:
-
-```
-========================= tractor hang-snapshot index ==========================
-N `fail_after_w_trace` / `afk_alarm_w_trace` snapshot(s) captured this session:
-  <test-id>
-    → /home/.../.cache/tractor/hung-dumps/<label>__<ts>
-```
-
-### Caveats
-
-The snapshot fires AFTER the body-raise (not at the
-exact moment of scope-cancel), so the parent's
-py-spy frames show `_do_capture_snapshot` itself
-running, NOT the cancel-cascade hang frame. To see
-the actual hang state, manual `acli.ptree` /
-`acli.hung_dump` from a second terminal at T=10s
-would be needed — **not currently possible**
-because per-test reaper fixtures clean up ~0.6s
-post-timeout. See follow-up TODO in
-`tractor/_testing/trace.py` for a
-`TRACTOR_TRACE_HOLD=1` env-var pause mode.
-
 ## See also

 - [#452](https://github.com/goodboy/tractor/issues/452) —
--- a/pyproject.toml
+++ b/pyproject.toml
@ -83,13 +83,7 @@ testing = [
  # test suite
  # TODO: maybe some of these layout choices?
  # https://docs.pytest.org/en/8.0.x/explanation/goodpractices.html#choosing-a-test-layout-import-rules
-  # bumped 8.3.5 → 9.0 per upstream security advisory + our
-  # local-only reliance on the post-9.0 capture-machinery shape
-  # (the `sys.__stderr__`-bypass print in
-  # `tractor._testing.trace._do_capture_snapshot` works on 8.x
-  # too, but standardizing on 9.x here ensures `--show-capture`
-  # interactions stay predictable across dev installs).
-  "pytest>=9.0",
+  "pytest>=8.3.5",
  "pexpect>=4.9.0,<5",
  # per-test wall-clock bound (used via
  # `@pytest.mark.timeout(..., method='thread')` on the
--- a/tests/test_advanced_streaming.py
+++ b/tests/test_advanced_streaming.py
@ -10,10 +10,6 @@ from typing import Type
 import pytest
 import trio
 import tractor
-from tractor._testing.trace import (
-    AfkAlarmWTraceFactory,
-    FailAfterWTraceFactory,
-)


 def is_win():
@ -154,9 +150,6 @@ def test_dynamic_pub_sub(

    is_forking_spawner: bool,
    set_fork_aware_capture,
-
-    fail_after_w_trace: FailAfterWTraceFactory,
-    afk_alarm_w_trace: AfkAlarmWTraceFactory,
 ):
    failed_to_raise_report: str = (
        f'Never got a {expect_cancel_exc!r} ??'
@ -174,36 +167,42 @@ def test_dynamic_pub_sub(
    # a per-spawn cost (forkserver round-trip + IPC peer-handshake)
    # that can stack up over `cpus - 1` sequential `n.run_in_actor()`
    # calls — especially on UDS under cross-pytest contention
-    # (#451 / #452). 4s was flaking right at the edge under fork
-    # backends — bumped to 8s with diag-snapshot-on-timeout via
-    # `fail_after_w_trace` so a borderline run still fails loud
-    # but lands a ptree/wchan/py-spy dump in
-    # `$XDG_CACHE_HOME/tractor/hung-dumps/` for inspection.
+    # (#451 / #452). Empirically a flat 15s flakes on
+    # `main_thread_forkserver` for many-cpu hosts (a single bad
+    # spawn-stack puts total run-time at ~15.5s, just over) while
+    # 30s gives plenty of headroom. NOTE(review): the 4s / 12s
+    # budgets set below don't match these figures — confirm.
    #
-    # XXX caveat: this is an *inner* trio cancel — its `Cancelled`
-    # cannot reach a task parked in a shielded `await` (e.g. inside
-    # actor-nursery teardown). When the in-band cancel path is
-    # itself buggy (the bug-class-3 `raise KBI` swallow we're
-    # currently chasing) this guard does NOT fire and the test
-    # sits forever until external SIGINT. The `afk_alarm_w_trace`
-    # outer guard below is the AFK-safety counterpart (SIGALRM
-    # raises in the main thread regardless of trio scope state).
+    # XXX caveat: this is an *inner* `trio.fail_after` — its
+    # `Cancelled` cannot reach a task parked in a shielded `await`
+    # (e.g. inside actor-nursery teardown). When the in-band cancel
+    # path is itself buggy (the bug-class-3 `raise KBI` swallow we're
+    # currently chasing) this guard does NOT fire and the test sits
+    # forever until external SIGINT. The `_DIAG_CAP_S` outer guard
+    # below is the AFK-safety counterpart.
    fail_after_s: int = (
-        8
+        4
        if is_forking_spawner
-        else 20
+        else 12
    )

+    # outer guard: when the inner fail_after fails to fire because of
+    # a shielded-await deadlock, this cap aborts the run via SIGALRM
+    # (`signal.alarm()`; kills the process unless a handler is set)
+    # so AFK runs don't sit for >20min on the bug-class-3 hang. Kept
+    # larger than `fail_after_s` so the trio-native path wins first.
+    _DIAG_CAP_S: int = fail_after_s + 5
+
    async def main():
        # bug-class-3 breadcrumb: tag each level of the cancel path
        # so when the run hangs and we capture cancel-level logs, the
        # *last* breadcrumb that fired names the swallow point.
        test_log.cancel('test_dynamic_pub_sub: enter main()')
        try:
-            async with fail_after_w_trace(fail_after_s):
+            with trio.fail_after(fail_after_s):
                test_log.cancel(
                    f'test_dynamic_pub_sub: '
-                    f'enter `fail_after_w_trace({fail_after_s})` scope'
+                    f'enter `trio.fail_after({fail_after_s})` scope'
                )
                try:
                    async with tractor.open_nursery(
@ -259,7 +258,15 @@ def test_dynamic_pub_sub(
                'test_dynamic_pub_sub: leaving `main()`'
            )

-    def _run_and_match():
+    # outer signal-based guard — survives a shielded-await deadlock
+    # since SIGALRM (from `signal.alarm`) is delivered regardless of
+    # trio's scope state. ONLY armed under fork-based backends since
+    # the bug we're chasing is MTF-specific.
+    import signal
+    armed_alarm: bool = bool(is_forking_spawner)
+    if armed_alarm:
+        signal.alarm(_DIAG_CAP_S)
+    try:
        try:
            trio.run(main)
            pytest.fail(failed_to_raise_report)
@ -280,19 +287,11 @@ def test_dynamic_pub_sub(
                pytest.fail(failed_to_raise_report)

            test_log.exception('Got user-cancel exc AS EXPECTED')
-
-    # outer SIGALRM-based guard — survives a shielded-await
-    # deadlock since `signal.alarm` raises in the main thread
-    # regardless of trio's scope state, AND captures a full diag
-    # snapshot to `$XDG_CACHE_HOME/tractor/hung-dumps/` before
-    # re-raising. ONLY armed under fork-based backends since the
-    # bug we're chasing is MTF-specific. Cap = `fail_after_s + 5`
-    # so the trio-native path always wins when it works.
-    if is_forking_spawner:
-        with afk_alarm_w_trace(fail_after_s + 5):
-            _run_and_match()
-    else:
-        _run_and_match()
+    finally:
+        # always disarm so a passing test doesn't get killed
+        # post-trio.run by a stale alarm.
+        if armed_alarm:
+            signal.alarm(0)


@tractor.context
--- a/tests/test_cancellation.py
+++ b/tests/test_cancellation.py
@ -7,7 +7,6 @@ import signal
 import platform
 import time
 from itertools import repeat
-from typing  import Type

 import pytest
 import trio
@ -15,7 +14,6 @@ import tractor
 from tractor._testing import (
    tractor_test,
 )
-from tractor._testing.trace import FailAfterWTraceFactory
 from .conftest import no_windows


@ -23,44 +21,14 @@ _non_linux: bool = platform.system() != 'Linux'
 _friggin_windows: bool = platform.system() == 'Windows'


-pytestmark = [
-    # Multi-actor cancel cascades under
-    # `--spawn-backend=subint` trip the abandoned-subint
-    # GIL-hostage class — a stuck subint can starve the
-    # parent's trio loop and block cancel-delivery.
-    # Apply the skip module-wide rather than per-test
-    # since every test here exercises the same cascade.
-    pytest.mark.skipon_spawn_backend(
-        'subint',
-        reason=(
-            'XXX SUBINT GIL-CONTENTION HANGING TEST XXX\n'
-            'Cancel cascades under '
-            '`--spawn-backend=subint` trip the abandoned-subint '
-            'GIL-hostage class — see\n'
-            '  - `ai/conc-anal/subint_sigint_starvation_issue.md` '
-            '(GIL-hostage, SIGINT-unresponsive)\n'
-            '  - `ai/conc-anal/subint_cancel_delivery_hang_issue.md` '
-            '(sibling: parent parks on dead chan)\n'
-            '  - https://github.com/goodboy/tractor/issues/379 '
-            '(subint umbrella)\n'
-        )
-    ),
-    pytest.mark.usefixtures(
-        'reap_subactors_per_test',
-        # NOTE, cancellation tests stress the SIGKILL
-        # `hard_kill` path which leaks UDS sock-files when
-        # the subactor's IPC server `finally:` cleanup
-        # doesn't run. Track per-test for blame attribution.
-        'track_orphaned_uds_per_test',
-        # NOTE, cancel-cascade timing races (see
-        # `test_nested_multierrors`) can also leave a
-        # subactor spinning at 100% CPU when its cancel
-        # signal got swallowed mid-handshake. Catches the
-        # runaway-loop class that doesn't leak UDS socks
-        # but burns the box.
-        'detect_runaway_subactors_per_test',
-    ),
-]
+pytestmark = pytest.mark.skipon_spawn_backend(
+    'subint',
+    reason=(
+        'XXX SUBINT HANGING TEST XXX\n'
+        'See oustanding issue(s)\n'
+        # TODO, put issue link!
+    )
+)


 async def assert_err(delay=0):
@ -87,11 +55,7 @@ async def do_nuthin():
    ],
    ids=['no_args', 'unexpected_args'],
 )
-def test_remote_error(
-    reg_addr: tuple,
-    args_err: tuple[dict, Type[Exception]],
-    set_fork_aware_capture,
-):
+def test_remote_error(reg_addr, args_err):
    '''
    Verify an error raised in a subactor that is propagated
    to the parent nursery, contains the underlying boxed builtin
@ -156,10 +120,17 @@ def test_remote_error(
            assert exc.boxed_type == errtype


+# @pytest.mark.skipon_spawn_backend(
+#     'subint',
+#     reason=(
+#         'XXX SUBINT HANGING TEST XXX\n'
+#         'See oustanding issue(s)\n'
+#         # TODO, put issue link!
+#     )
+# )
 def test_multierror(
    reg_addr: tuple[str, int],
-    start_method: str,  # parametrized
-    set_fork_aware_capture, #: Callable,
+    start_method: str,
 ):
    '''
    Verify we raise a ``BaseExceptionGroup`` out of a nursery where
@ -204,8 +175,6 @@ def test_multierror_fast_nursery(
    start_method: str,
    num_subactors: int,
    delay: float,
-    set_fork_aware_capture,
-    fail_after_w_trace: FailAfterWTraceFactory,
 ):
    '''
    Verify we raise a ``BaseExceptionGroup`` out of a nursery where
@ -214,43 +183,21 @@ def test_multierror_fast_nursery(

    '''
    async def main():
-        # budget = 2× natural trio-backend cascade time for
-        # 25 errorer subactors (~14s observed). on-timeout
-        # diag snapshot → if the cancel cascade hangs
-        # (observed under MTF backend with N>=14 errorer
-        # subactors) we get a fresh ptree/wchan/py-spy dump
-        # on disk INSTEAD of an opaque pytest timeout-kill.
-        # See `tractor/_testing/trace.py` for the helper.
-        async with fail_after_w_trace(30.0):
-            async with tractor.open_nursery(
-                registry_addrs=[reg_addr],
-            ) as nursery:
+        async with tractor.open_nursery(
+            registry_addrs=[reg_addr],
+        ) as nursery:

-                for i in range(num_subactors):
-                    await nursery.run_in_actor(
-                        assert_err,
-                        name=f'errorer{i}',
-                        delay=delay
-                    )
+            for i in range(num_subactors):
+                await nursery.run_in_actor(
+                    assert_err,
+                    name=f'errorer{i}',
+                    delay=delay
+                )

    # with pytest.raises(trio.MultiError) as exc_info:
-    # NOTE, `trio.TooSlowError` from `fail_after_w_trace`
-    # bubbles UN-wrapped if `open_nursery.__aexit__` never
-    # gets re-entered; wrapped inside a `BaseExceptionGroup`
-    # if it did. Accept both shapes so the matcher itself
-    # doesn't lie about *what* failed.
-    with pytest.raises(
-        (BaseExceptionGroup, trio.TooSlowError),
-    ) as exc_info:
+    with pytest.raises(BaseExceptionGroup) as exc_info:
        trio.run(main)

-    if isinstance(exc_info.value, trio.TooSlowError):
-        pytest.fail(
-            f'cancel cascade hung past 12s '
-            f'(num_subactors={num_subactors}, delay={delay}); '
-            f'see stderr for `fail_after_w_trace` snapshot path'
-        )
-
    assert exc_info.type == ExceptionGroup
    err = exc_info.value
    exceptions = err.exceptions
@ -330,7 +277,6 @@ async def stream_forever():
 async def test_cancel_infinite_streamer(
    reg_addr: tuple,
    start_method: str,
-    set_fork_aware_capture,
 ):
    # stream for at most 1 seconds
    with (
@ -354,6 +300,14 @@ async def test_cancel_infinite_streamer(
    assert n.cancelled


+# @pytest.mark.skipon_spawn_backend(
+#     'subint',
+#     reason=(
+#         'XXX SUBINT HANGING TEST XXX\n'
+#         'See oustanding issue(s)\n'
+#         # TODO, put issue link!
+#     )
+# )
@pytest.mark.parametrize(
    'num_actors_and_errs',
    [
@ -391,7 +345,6 @@ async def test_some_cancels_all(
    reg_addr: tuple,
    start_method: str,
    loglevel: str,
-    set_fork_aware_capture, #: Callable,
 ):
    '''
    Verify a subset of failed subactors causes all others in
@ -509,119 +462,32 @@ async def spawn_and_error(
 #     10,
 #     method='thread',
 # )
-@pytest.mark.parametrize(
-    'depth',
-    [1, 3],
-    ids='depth={}'.format,
-)
-@tractor_test(
-    # bumped from the 30s default to cover fork-based
-    # cancel-cascade flakes; 2 spawners × 2 errorers × depth 1+
-    # cascade through 6 portal-wait_for_result paths each
-    # paying `terminate_after=1.6s` + UDS sock-unlink under
-    # MTF/UDS contention can easily blow past 30s.
-    # Trio backend is fast and won't notice the extra budget.
-    # See `ai/conc-anal/cancel_cascade_too_slow_under_main_thread_forkserver_issue.md`.
-    timeout=10,
-)
+@tractor_test
 async def test_nested_multierrors(
    reg_addr: tuple,
    loglevel: str,
    start_method: str,
-    set_fork_aware_capture,
-    fail_after_w_trace: FailAfterWTraceFactory,
-    request: pytest.FixtureRequest,
-    depth: int,
 ):
    '''
-    Test that failed actor sets are wrapped in `BaseExceptionGroup`s.
-
-    Parametrized over recursion `depth ∈ {1, 3}`:
-
-      - `depth=1`: shallow tree (2 spawners × 2 errorers, 2
-        levels). Cascade completes well within budget on ALL
-        backends including MTF — regression-safety green case.
-
-      - `depth=3`: deep tree (2 spawners × recursive depth-3
-        spawn-and-error). On `main_thread_forkserver` this
-        trips the cancel-cascade shape-mismatch bug class
-        (see `ai/conc-anal/cancel_cascade_too_slow_under_main_thread_forkserver_issue.md`)
-        — xfailed below.
+    Test that failed actor sets are wrapped in `BaseExceptionGroup`s. This
+    test goes only 2 nurseries deep but we should eventually have tests
+    for arbitrary n-depth actor trees.

    '''
-    # XXX: `multiprocessing.forkserver` can't handle nested
-    # spawning at any depth — hangs / broken-pipes. Pre-existing
-    # backend limitation, NOT depth-specific.
-    if start_method == 'forkserver':
-        pytest.skip("Forksever sux hard at nested spawning...")
+    if start_method == 'trio':
+        depth = 3
+        subactor_breadth = 2
+    else:
+        # XXX: multiprocessing can't seem to handle any more than 2 depth
+        # process trees for whatever reason.
+        # Any more process levels than this and we see bugs that cause
+        # hangs and broken pipes all over the place...
+        if start_method == 'forkserver':
+            pytest.skip("Forksever sux hard at nested spawning...")
+        depth = 1  # means an additional actor tree of spawning (2 levels deep)
+        subactor_breadth = 2

-    subactor_breadth = 2
-
-    # MTF backend trips a probabilistic timing race in the
-    # cancel-cascade — NOT depth-gated; depth amplifies the
-    # variance so depth=3 misses nearly every run while
-    # depth=1 misses occasionally. Both get the xfail mark
-    # (with `strict=False`) since the bug class can fire at
-    # either depth.
-    #
-    # The scenario in detail:
-    #
-    #     T=0      spawn spawner_0 + spawner_1 in parallel
-    #     T=t1     spawner_0's child errors →
-    #              RemoteActorError reaches root nursery
-    #     T=t1+ε   root nursery starts cancelling
-    #              spawner_1's portal-wait
-    #     T=t2     spawner_1's child errors → tries to send
-    #              RemoteActorError back
-    #
-    #     if t2 < t1+ε:  BEG = [RAE, RAE]        ← clean (xpass)
-    #     if t2 > t1+ε:  BEG = [RAE, Cancelled]  ← race tripped (xfail)
-    #
-    # i.e. the assertion below (`isinstance(_, RemoteActorError)`)
-    # fails iff cancel-delivery beats the other tree's natural
-    # error-propagation. Depth amplifies `t2-t1` variance
-    # (longer per-tree paths = more skew); under MTF the
-    # fork-spawn jitter + UDS-contention widens both `t1` and
-    # `t2` further.
-    #
-    # With `strict=False` the clean-cascade cases (most
-    # depth=1 runs, rare depth=3 runs) report as `xpassed`
-    # while the race-tripped cases report as `xfailed` —
-    # neither flakes `--lf`. When MTF cancel-cascade
-    # eventually speeds up enough to close the race even at
-    # depth=3, BOTH variants will reliably `xpass` and
-    # pytest will yell — our signal to drop the marker. See
-    # `ai/conc-anal/cancel_cascade_too_slow_under_main_thread_forkserver_issue.md`.
-    if start_method == 'main_thread_forkserver':
-        request.node.add_marker(
-            pytest.mark.xfail(
-                strict=False,
-                reason=(
-                    f'MTF cancel-cascade shape-mismatch at '
-                    f'depth={depth} (Cancelled races '
-                    f'RemoteActorError in BEG); see conc-anal/'
-                    'cancel_cascade_too_slow_under_main_thread_forkserver_issue.md'
-                ),
-            )
-        )
-
-    # 6s budget: in the non-hang case (and on the trio
-    # backend) the whole spawn + cancel-cascade should
-    # complete in well under that. On the borderline hang
-    # case the `fail_after_w_trace` fires `TooSlowError`
-    # AND captures a ptree/wchan/py-spy snapshot to
-    # `$XDG_CACHE_HOME/tractor/hung-dumps/` for offline
-    # inspection. See
-    # `ai/conc-anal/cancel_cascade_too_slow_under_main_thread_forkserver_issue.md`.
-    match (start_method, depth):
-        case ('trio', _):
-            timeout = 6
-        case ('main_thread_forkserver', 1):
-            timeout = 16
-        case ('main_thread_forkserver', 3):
-            timeout = 30
-
-    async with fail_after_w_trace(timeout):
+    with trio.fail_after(120):
        try:
            async with tractor.open_nursery() as nursery:
                for i in range(subactor_breadth):
@ -792,6 +658,14 @@ async def spawn_sub_with_sync_blocking_task():
        print('exiting first subactor layer..\n')


+# @pytest.mark.skipon_spawn_backend(
+#     'subint',
+#     reason=(
+#         'XXX SUBINT HANGING TEST XXX\n'
+#         'See oustanding issue(s)\n'
+#         # TODO, put issue link!
+#     )
+# )
@pytest.mark.parametrize(
    'man_cancel_outer',
    [
@ -811,7 +685,7 @@ async def spawn_sub_with_sync_blocking_task():
 def test_cancel_while_childs_child_in_sync_sleep(
    loglevel: str,
    start_method: str,
-    is_forking_spawner: bool,
+    spawn_backend: str,
    debug_mode: bool,
    reg_addr: tuple,
    man_cancel_outer: bool,
@ -827,10 +701,7 @@ def test_cancel_while_childs_child_in_sync_sleep(

    '''
    if start_method == 'forkserver':
-        pytest.skip(
-            "`multiprocessing`'s forkserver sux hard at "
-            "resuming from sync sleep..."
-        )
+        pytest.skip("Forksever sux hard at resuming from sync sleep...")

    async def main():
        #
@ -873,11 +744,7 @@ def test_cancel_while_childs_child_in_sync_sleep(
        # delay = 2  # is AssertionError in eg AND no TooSlowError !?
        # is AssertionError in eg AND no _cs cancellation.
        delay = (
-            6 if (
-                _non_linux
-                or
-                is_forking_spawner
-            )
+            6 if _non_linux
            else 4 
        )

--- a/tests/test_local.py
+++ b/tests/test_local.py
@ -10,22 +10,18 @@ import tractor
 from tractor._testing import tractor_test


-def test_no_runtime():
-    '''
-    A registrar must be established before any nurseries
+@pytest.mark.trio
+async def test_no_runtime():
+    """A registrar must be established before any nurseries
    can be created.

    (In other words ``tractor.open_root_actor()`` must be
    engaged at some point?)
-
-    '''
-    async def main():
+    """
+    with pytest.raises(RuntimeError) :
        async with tractor.find_actor('doggy'):
            pass

-    with pytest.raises(tractor._exceptions.NoRuntime) :
-        trio.run(main)
-

@tractor_test
 async def test_self_is_registered(reg_addr):
--- a/tractor/_testing/_reap.py
+++ b/tractor/_testing/_reap.py
@ -463,20 +463,11 @@ def reap(
    grace: float = 3.0,
    poll: float = 0.25,
    log=print,
-    include_descendants: bool = True,
 ) -> tuple[list[int], list[int]]:
    '''
-    Deliver SIGINT to each pid (AND its subtree
-    descendants when `include_descendants=True`, the
-    default), wait up to `grace` seconds for them to
-    exit, then SIGKILL any that survive.
-
-    The subtree-walk is what makes a single `acli.reap`
-    invocation tear down a *full* leaked actor-tree
-    rather than just its init-adopted top. Without it,
-    repeated calls are needed: each pass kills the
-    current `ppid==1` level, the level below becomes
-    init-adopted, next pass kills those, etc.
+    Deliver SIGINT to each pid, wait up to `grace`
+    seconds for them to exit, then SIGKILL any that
+    survive.

    Returns `(signalled, survivors_killed)` so callers
    can report / assert.
@ -489,43 +480,8 @@ def reap(
    if not pids:
        return ([], [])

-    # Expand each pid into its full subtree (descendants
-    # included) so a multi-level leaked actor-tree gets
-    # torn down in a single pass. Falls back to the
-    # original `pids` list if psutil isn't installed.
-    pids_to_signal: list[int] = list(pids)
-    if include_descendants:
-        try:
-            import psutil
-        except ImportError:
-            psutil = None
-        if psutil is not None:
-            seen: set[int] = set(pids)
-            for root in list(pids):
-                try:
-                    p = psutil.Process(root)
-                    for c in p.children(recursive=True):
-                        if c.pid not in seen:
-                            seen.add(c.pid)
-                            pids_to_signal.append(c.pid)
-                except (
-                    psutil.NoSuchProcess,
-                    psutil.AccessDenied,
-                ):
-                    # raced / unprivileged — skip silently;
-                    # the orphan-root itself still gets the
-                    # signal below.
-                    continue
-            n_extra: int = len(pids_to_signal) - len(pids)
-            if n_extra:
-                log(
-                    f'[tractor-reap] expanded {len(pids)} '
-                    f'orphan-root(s) → {len(pids_to_signal)} '
-                    f'incl. {n_extra} subtree-descendant(s)'
-                )
-
    signalled: list[int] = []
-    for pid in pids_to_signal:
+    for pid in pids:
        try:
            os.kill(pid, signal.SIGINT)
            signalled.append(pid)
@ -1155,19 +1111,6 @@ def reap_subactors_per_test() -> int:
    (`_reap_orphaned_subactors`) only kicks in at session
    end which is too late to save the cascade.

-    Reaps both:
-      1. direct descendants of `pytest` (`PPid==pytest_pid`)
-      2. NEW init-adopted tractor procs (`PPid==1` AND
-         `_is_tractor_subactor`) that appeared between
-         pre-yield and post-yield — these are the leaked
-         subactors whose mid-tier parent died during the
-         cascade, reparenting them to init.
-
-    Pre-yield snapshot of init-adopted tractor procs is
-    used to scope (2) to THIS test's leaks only — without
-    it we'd also reap orphans from concurrent unrelated
-    tractor uses on the box (piker, etc.).
-
    Apply at module-level on the topically-problematic
    test files via:

@ -1187,16 +1130,7 @@ def reap_subactors_per_test() -> int:

    '''
    parent_pid: int = os.getpid()
-    # Snapshot pre-existing init-adopted tractor procs so
-    # we can scope post-test reap to NEW orphans only.
-    pre_orphans: set[int] = set(find_orphans())
    yield parent_pid
    pids: list[int] = find_descendants(parent_pid)
-    new_orphans: list[int] = [
-        pid for pid in find_orphans()
-        if pid not in pre_orphans
-    ]
-    if new_orphans:
-        pids.extend(new_orphans)
    if pids:
        reap(pids, grace=3.0)
--- a/tractor/_testing/pytest.py
+++ b/tractor/_testing/pytest.py
@ -38,20 +38,6 @@ import tractor
 from tractor.spawn._spawn import SpawnMethodKey
 import trio

-# Re-export `_testing.trace`'s pytest fixtures so they're
-# picked up by pytest's plugin-discovery (this module is
-# loaded via `pytest_plugins` from `pyproject.toml`). The
-# `noqa: F401` annotations make linters tolerate the
-# unused-looking imports — they're load-bearing for pytest
-# discovery. The fixtures share their `name=` kw with the
-# underlying CM functions; the python-level identifiers
-# below carry the `_fixture` suffix to avoid module-scope
-# collision (see `_testing/trace.py` for details).
-from .trace import (  # noqa: F401
-    afk_alarm_w_trace_fixture,
-    fail_after_w_trace_fixture,
-)
-
 # Sub-plugin: zombie-subactor + UDS sock-file + shm
 # reaping fixtures live in `tractor._testing._reap`
 # alongside the underlying detection/cleanup helpers.
@ -765,37 +751,3 @@ def set_fork_aware_capture(
    #     request=request,
    #     start_method=start_method,
    # )
-
-
-def pytest_terminal_summary(
-    terminalreporter,
-    exitstatus: int,
-    config: pytest.Config,
-) -> None:
-    '''
-    End-of-session summary: list all
-    `fail_after_w_trace`/`afk_alarm_w_trace` snapshot dirs
-    captured during the run so the human doesn't have to scroll
-    back through captured-stderr lines to find dump paths.
-
-    Reads from `tractor._testing.trace._SNAPSHOT_INDEX` which is
-    populated by `_do_capture_snapshot()` on each successful
-    snapshot capture.
-
-    No-op when zero snapshots were captured (most sessions).
-
-    '''
-    from .trace import _SNAPSHOT_INDEX
-
-    if not _SNAPSHOT_INDEX:
-        return
-
-    tr = terminalreporter
-    tr.write_sep('=', 'tractor hang-snapshot index')
-    tr.write_line(
-        f'{len(_SNAPSHOT_INDEX)} `fail_after_w_trace` / '
-        f'`afk_alarm_w_trace` snapshot(s) captured this session:'
-    )
-    for label, path in _SNAPSHOT_INDEX:
-        tr.write_line(f'  {label}')
-        tr.write_line(f'    → {path}')
--- a/tractor/_testing/trace.py
+++ b/tractor/_testing/trace.py
--- a/uv.lock
+++ b/uv.lock
@ -559,18 +559,17 @@ wheels = [

 [[package]]
 name = "pytest"
-version = "9.0.3"
+version = "8.3.5"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
    { name = "colorama", marker = "sys_platform == 'win32'" },
    { name = "iniconfig" },
    { name = "packaging" },
    { name = "pluggy" },
-    { name = "pygments" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/7d/0d/549bd94f1a0a402dc8cf64563a117c0f3765662e2e668477624baeec44d5/pytest-9.0.3.tar.gz", hash = "sha256:b86ada508af81d19edeb213c681b1d48246c1a91d304c6c81a427674c17eb91c", size = 1572165, upload-time = "2026-04-07T17:16:18.027Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/ae/3c/c9d525a414d506893f0cd8a8d0de7706446213181570cdbd766691164e40/pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845", size = 1450891, upload-time = "2025-03-02T12:54:54.503Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/d4/24/a372aaf5c9b7208e7112038812994107bc65a84cd00e0354a88c2c77a617/pytest-9.0.3-py3-none-any.whl", hash = "sha256:2c5efc453d45394fdd706ade797c0a81091eccd1d6e4bccfcd476e2b8e0ab5d9", size = 375249, upload-time = "2026-04-07T17:16:16.13Z" },
+    { url = "https://files.pythonhosted.org/packages/30/3d/64ad57c803f1fa1e963a7946b6e0fea4a70df53c1a7fed304586539c2bac/pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820", size = 343634, upload-time = "2025-03-02T12:54:52.069Z" },
 ]

 [[package]]
@ -793,7 +792,7 @@ dev = [
    { name = "prompt-toolkit", specifier = ">=3.0.50" },
    { name = "psutil", specifier = ">=7.0.0" },
    { name = "pyperclip", specifier = ">=1.9.0" },
-    { name = "pytest", specifier = ">=9.0" },
+    { name = "pytest", specifier = ">=8.3.5" },
    { name = "pytest-timeout", specifier = ">=2.3" },
    { name = "stackscope", specifier = ">=0.2.2,<0.3" },
    { name = "typing-extensions", specifier = ">=4.14.1" },
@ -816,7 +815,7 @@ sync-pause = [{ name = "greenback", marker = "python_full_version == '3.13.*'",
 testing = [
    { name = "pexpect", specifier = ">=4.9.0,<5" },
    { name = "psutil", specifier = ">=7.0.0" },
-    { name = "pytest", specifier = ">=9.0" },
+    { name = "pytest", specifier = ">=8.3.5" },
    { name = "pytest-timeout", specifier = ">=2.3" },
 ]

--- a/xontrib/tractor_diag.xsh
+++ b/xontrib/tractor_diag.xsh
@ -6,7 +6,7 @@ prefix-completion treats them as a sub-cmd group — type
 `acli.<TAB>` to see the full set.

 Provides:
-  - `acli.ptree <pid|pgrep-pat>`        psutil-backed proc tree,
+  - `acli.ptree <pid|pgrep-pat>`       psutil-backed proc tree,
                                        live + zombies split.
  - `acli.hung_dump <pid|pat> [...]`    kernel `wchan`/`stack` +
                                        `py-spy dump` (incl `--locals`)
@ -17,10 +17,6 @@ Provides:
                                        (e.g. `piker`, `tractor`);
                                        path -> use as-is.
                                        default: `$XDG_RUNTIME_DIR/tractor`.
-  - `acli.dump_all <pid> [--out-dir]    full snapshot bundle —
-                          [--label]`    ptree + hung_dump + bindspace
-                                        written to a timestamped dir
-                                        for sharing / AI introspection.
  - `acli.reap [opts]`                  SC-polite zombie-subactor
                                        reaper + optional `/dev/shm/`
                                        + UDS sock-file sweeps.
@ -33,30 +29,188 @@ Or source directly:
  source ./xontrib/tractor_diag.xsh

 Pipe-to-paste idiom (xonsh):
-  acli.hung_dump pytest |t /tmp/hung.log
-
-The diagnostic core lives in `tractor._testing.trace` so it
-can also be invoked from inside pytest tests (e.g. via
-`fail_after_w_trace` / `afk_alarm_w_trace` capture-on-hang
-helpers) — these aliases are just thin terminal wrappers.
+  acli.hung_dump pytest |t /tmp/hung.log

 Requires `psutil` for full functionality (`ptree` and the
-`hung_dump` tree-walk). Falls back to `pgrep -P` recursion if
-missing.
+`acli.hung_dump` tree-walk). Falls back to `pgrep -P` recursion
+if missing.
 """

+import os
+import re
+import subprocess as sp
 from pathlib import Path

-from tractor._testing.trace import (
-    dump_all as _dump_all,
-    dump_hung_state,
-    dump_proc_tree,
-    resolve_pids,
-    scan_bindspace,
+try:
+    import psutil
+except ImportError:
+    psutil = None
+    print(
+        '[tractor-diag] `psutil` missing — '
+        'acli.ptree disabled, acli.hung_dump uses pgrep fallback. '
+        '`uv pip install psutil` for full functionality.'
+    )
+
+
+# matches tractor's UDS sock naming: `<actor_name>@<pid>.sock`
+_UDS_SOCK_RE = re.compile(
+    r'^(?P<name>.+)@(?P<pid>\d+)\.sock$'
 )


-# --- ptree ----------------------------------------------------
+# --- helpers --------------------------------------------------
+
+def _resolve_pids(arg: str) -> list:
+    '''
+    Map a CLI arg onto a list of pids.
+
+    An all-digit `arg` is taken as a literal pid; anything
+    else is handed to `pgrep -f` as a pattern. A non-zero
+    `pgrep` exit (e.g. nothing matched) yields `[]`.
+    '''
+    if not arg.isdigit():
+        try:
+            matched: str = sp.check_output(
+                ['pgrep', '-f', arg],
+                text=True,
+            )
+        except sp.CalledProcessError:
+            return []
+        # `str.split()` never produces empty tokens
+        return list(map(int, matched.split()))
+    return [int(arg)]
+
+
+def _walk_tree_psutil(pid: int) -> list:
+    '''
+    Flat list `[Process, *descendants]` via psutil.
+
+    Returns `[]` when `pid` does not exist. If the root
+    vanishes while its descendants are being enumerated,
+    only the (now stale) root handle is returned.
+    '''
+    try:
+        root = psutil.Process(pid)
+    except psutil.NoSuchProcess:
+        return []
+    try:
+        descendants: list = root.children(recursive=True)
+    except psutil.NoSuchProcess:
+        # root exited between lookup and enumeration — the
+        # same race `_walk_tree_with_depth()` guards against.
+        descendants = []
+    return [root] + descendants
+
+
+def _walk_tree_with_depth(pid: int):
+    '''
+    Yield `(proc, depth)` pairs walking `pid`'s tree in
+    depth-first PRE-order: every proc is yielded before any
+    of its descendants, and a proc's whole subtree is
+    yielded before its next sibling. `depth==0` is the
+    root; `depth==1` are direct children, etc. Used by
+    `ptree` to render parent/child relationships visually,
+    which relies on each child row sitting below its own
+    parent row.
+
+    Yields nothing if `pid` does not exist. A proc that
+    disappears mid-walk is still yielded but contributes no
+    children.
+    '''
+    try:
+        root = psutil.Process(pid)
+    except psutil.NoSuchProcess:
+        return
+    seen: set = {pid}
+    stack: list = [(root, 0)]
+    while stack:
+        proc, depth = stack.pop()
+        # yield on *visit* (not on discovery) so the output
+        # order matches the tree shape.
+        yield proc, depth
+        try:
+            kids = proc.children()
+        except psutil.NoSuchProcess:
+            continue
+        fresh: list = []
+        for kid in kids:
+            if kid.pid in seen:
+                continue
+            seen.add(kid.pid)
+            fresh.append(kid)
+        # LIFO stack: push in reverse so the first child is
+        # popped (and thus yielded) first.
+        stack.extend(
+            (kid, depth + 1)
+            for kid in reversed(fresh)
+        )
+
+
+def _which_cgroup_slice(pid: int) -> str|None:
+    '''
+    Name the top-level systemd cgroup slice that `pid`'s
+    `/proc/<pid>/cgroup` content places it under.
+
+    Result is one of:
+
+      - `'system'`: content mentions `/system.slice/` —
+        typically `.service` units, i.e. long-lived
+        daemons (`dbus.service`,
+        `systemd-journald.service`, ...).
+
+      - `'user'`: content mentions `/user.slice/` —
+        typically the `.scope` units systemd wraps around
+        desktop-launched apps + login-session procs
+        (`app-firefox-<id>.scope`, `session-<id>.scope`).
+
+      - `None`: neither slice matched, OR the cgroup file
+        could not be read (proc gone, perm denied, no
+        procfs, ...). Callers should treat this as
+        "unknown, classify as plain orphan"; paired with
+        `ppid==1` it is the "leaked / parent died" signal.
+
+    Procs in either slice are `ppid==1` by design (pid 1
+    manages them) so they should NOT be flagged as
+    concerning orphans; the split merely separates "real
+    services on this box" from "stuff in your login
+    session".
+
+    `/system.slice/` is tested first, so content naming
+    both slices is reported as `'system'`.
+
+    '''
+    try:
+        with open(f'/proc/{pid}/cgroup') as cg_file:
+            membership: str = cg_file.read()
+    except OSError:
+        # `FileNotFoundError`, `PermissionError` and
+        # `ProcessLookupError` are all `OSError` subclasses.
+        return None
+
+    for kind in ('system', 'user'):
+        if f'/{kind}.slice/' in membership:
+            return kind
+    return None
+
+
+def _walk_tree_pgrep(pid: int) -> list:
+    '''
+    psutil-less fallback: collect `pid` plus all of its
+    descendants by repeatedly asking `pgrep -P` for direct
+    children.
+
+    Pids come back in depth-first pre-order (a pid, then the
+    full subtree of each child in `pgrep` output order). A
+    non-zero `pgrep` exit is treated as "no children".
+    '''
+    collected: list = []
+    pending: list = [pid]
+    while pending:
+        cur: int = pending.pop()
+        collected.append(cur)
+        try:
+            listing: str = sp.check_output(
+                ['pgrep', '-P', str(cur)],
+                text=True,
+            )
+        except sp.CalledProcessError:
+            continue
+        # reversed push keeps the first-listed child on top
+        # of the LIFO stack, i.e. visited next.
+        pending.extend(
+            int(kid) for kid in reversed(listing.split())
+        )
+    return collected
+
+
+def _ensure_sudo_cached() -> bool:
+    '''
+    Ensure `sudo` credentials are cached so subsequent
+    `sudo -n` calls succeed without prompting.
+
+    Returns True if cached (or successfully refreshed),
+    False if the user cancelled, validation failed, or
+    `sudo` is not on PATH.
+
+    Tries `sudo -n true` first as a no-op probe; if that
+    fails, runs `sudo -v` which prompts interactively to
+    validate/refresh the credential timestamp.
+    '''
+    # probe — already cached? A missing `sudo` binary
+    # surfaces HERE (the first exec attempt), so it has to
+    # be caught around the probe and not only around the
+    # later `sudo -v`.
+    try:
+        probe = sp.run(
+            ['sudo', '-n', 'true'],
+            capture_output=True,
+        )
+    except FileNotFoundError:
+        print(
+            '[tractor-diag] sudo not on PATH — '
+            'proceeding without sudo'
+        )
+        return False
+    if probe.returncode == 0:
+        return True
+
+    print(
+        '[tractor-diag] needs `sudo` for /proc/<pid>/stack '
+        'and `py-spy dump`; caching creds via `sudo -v`...'
+    )
+    try:
+        rc = sp.run(['sudo', '-v']).returncode
+    except KeyboardInterrupt:
+        print('  cancelled — proceeding without sudo')
+        return False
+    except FileNotFoundError:
+        # only reachable if `sudo` vanished after the probe
+        print('  sudo not on PATH — proceeding without sudo')
+        return False
+    return rc == 0
+
+
+# --- ptree ---------------------------------------------------

 def _ptree(args):
    '''
@ -66,9 +220,36 @@ def _ptree(args):

    usage: acli.ptree [--tree|-t] <pid|pgrep-pattern> [...]

-    See `tractor._testing.trace.dump_proc_tree()` for the
-    bucket semantics + classification details.
+    classification (per-proc, not per-tree):

+      - zombies:  `status in (Z, X)` — defunct, parent
+                  hasn't reaped (or kernel-marked dead).
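+      - orphans:  `ppid == 1` — original parent exited;
+                  has been reparented to init. Includes
+                  the *root* of an abandoned tree AND
+                  any descendant that ended up reparented
+                  to init mid-flight. `ppid == 1` procs
+                  found in a `system.slice`/`user.slice`
+                  cgroup are reported separately as
+                  `system-slice`/`user-slice`.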
+      - live:     real parent (`ppid > 1`), non-defunct.
+
+    Trees of orphan roots are still walked — their
+    descendants show as `live` if they themselves still
+    have a real (non-init) parent (the orphan root), but
+    the orphan root itself appears in `orphans`.
+
+    Cross-bucket parent annotation (always emitted):
+      when a row's parent (by ppid) lives in a *different*
+      severity bucket, the row is suffixed with
+      `[parent: <pid> (in `<bucket>`)]` so the visual
+      `└─` marker still resolves to a findable parent
+      even when bucketing scatters parent and child into
+      separate sections.
+
+    `--tree` / `-t` flag (opt-in):
+      additionally emit a flat walk-order `## tree`
+      section at the top — a contiguous parent-child
+      tree shape with no severity-grouping. Same procs,
+      no annotations needed because each parent appears
+      directly above its children.
    '''
    flag_tree: bool = False
    pos_args: list = []
@ -81,19 +262,224 @@ def _ptree(args):
    if not pos_args:
        print('usage: acli.ptree [--tree|-t] <pid|pgrep-pattern> [...]')
        return 1
+    if psutil is None:
+        print('ptree requires psutil; install via `uv pip install psutil`')
+        return 1

    roots: list = []
    for a in pos_args:
-        roots.extend(resolve_pids(a))
+        roots.extend(_resolve_pids(a))
    roots = sorted(set(roots))
    if not roots:
        print(f'(no procs match: {pos_args})')
        return 1

-    print(dump_proc_tree(roots, flag_tree=flag_tree), end='')
+    # statuses considered "defunct" — STATUS_ZOMBIE is the
+    # common case (`Z`); STATUS_DEAD (`X`) is rarer but kernel-
+    # reported and equally not-coming-back.
+    defunct_statuses: set = {
+        psutil.STATUS_ZOMBIE,
+        getattr(psutil, 'STATUS_DEAD', 'dead'),
+    }
+
+    seen: set = set()
+    walk_order: list = []  # [(proc, depth)] preserved walk order
+    live: list = []        # [(proc, depth)]
+    orphans: list = []
+    # `ppid==1` AND rooted in `/system.slice/` cgroup —
+    # real systemd-managed services (e.g. `auto-cpufreq`,
+    # `NetworkManager`).
+    system_slice: list = []
+    # `ppid==1` AND rooted in `/user.slice/.../*.scope` —
+    # desktop-launched apps wrapped by systemd-user in
+    # transient `.scope` units (e.g. Firefox, browsers,
+    # editors started from a launcher).
+    user_slice: list = []
+    zombies: list = []
+    gone: list = []
+
+    # parent-bucket lookup populated post-classification so
+    # `_row()` can annotate cross-bucket parent refs.
+    pid_to_bucket: dict = {}
+
+    for r in roots:
+        for (p, depth) in _walk_tree_with_depth(r):
+            if p.pid in seen:
+                continue
+            seen.add(p.pid)
+            try:
+                status: str = p.status()
+                ppid: int = p.ppid()
+            except psutil.NoSuchProcess:
+                gone.append(p.pid)
+                continue
+            entry = (p, depth)
+            # severity order:
+            #   zombie > orphan > system-slice > user-slice > live
+            # `ppid==1` splits into:
+            #   - `system-slice` (rooted in `/system.slice/` —
+            #     real services, by-design `ppid==1`)
+            #   - `user-slice` (rooted in
+            #     `/user.slice/.../*.scope` — desktop apps
+            #     wrapped by systemd-user, by-design `ppid==1`)
+            #   - `orphans` (everything else with `ppid==1` —
+            #     genuinely concerning).
+            if status in defunct_statuses:
+                zombies.append(entry)
+                pid_to_bucket[p.pid] = 'zombies'
+            elif ppid == 1:
+                slice_kind: str|None = _which_cgroup_slice(p.pid)
+                if slice_kind == 'system':
+                    system_slice.append(entry)
+                    pid_to_bucket[p.pid] = 'system-slice'
+                elif slice_kind == 'user':
+                    user_slice.append(entry)
+                    pid_to_bucket[p.pid] = 'user-slice'
+                else:
+                    orphans.append(entry)
+                    pid_to_bucket[p.pid] = 'orphans'
+            else:
+                live.append(entry)
+                pid_to_bucket[p.pid] = 'live'
+            walk_order.append(entry)
+
+    total: int = (
+        len(live)
+        + len(orphans)
+        + len(system_slice)
+        + len(user_slice)
+        + len(zombies)
+    )
+    print(f'# ptree: {total} procs across roots {roots}')
+
+    hdr = '  ' + 'PID'.rjust(7) + '  ' + 'PPID'.rjust(7) + '  '
+    hdr += 'STATUS'.ljust(10) + '  CMD'
+
+    def _row(entry, bucket: str|None = None):
+        '''
+        Format one `(proc, depth)` entry as a column-aligned
+        table row: PID, PPID, STATUS, then CMD. Nesting is
+        shown by indenting CMD and prefixing it with `└─`, so
+        the PID/PPID/STATUS columns never shift.
+
+        If `bucket` is passed, the entry is not a root
+        (`depth > 0`) and its parent pid was classified into
+        some OTHER bucket, the row gets a trailing
+        `[parent: <pid> (in `<b>`)]` note so the parent can
+        still be located in its own section.
+        '''
+        proc, depth = entry
+        indent: str = (
+            '   ' * depth + '└─ '
+            if depth > 0
+            else ''
+        )
+
+        # work out the cross-bucket note up front; a proc
+        # whose ppid can no longer be read gets no note.
+        anno: str = ''
+        if bucket is not None and depth > 0:
+            try:
+                parent_pid: int = proc.ppid()
+            except psutil.NoSuchProcess:
+                parent_pid = 0
+            # pid 0 (lookup failed) and pid 1 (init) are
+            # never annotated.
+            if parent_pid not in (0, 1):
+                parent_bucket: str|None = pid_to_bucket.get(parent_pid)
+                if parent_bucket not in (None, bucket):
+                    anno = (
+                        f'  [parent: {parent_pid} '
+                        f'(in `{parent_bucket}`)]'
+                    )
+
+        def _fmt(ppid_txt: str, status_txt: str, cmd_txt: str) -> str:
+            # shared column layout for live + zombie rows
+            return ''.join((
+                '  ', str(proc.pid).rjust(7),
+                '  ', ppid_txt.rjust(7),
+                '  ', status_txt.ljust(10),
+                '  ', indent, cmd_txt, anno,
+            ))
+
+        # NOTE: `psutil.ZombieProcess` subclasses
+        # `psutil.NoSuchProcess`, so its handler MUST be
+        # listed first — a zombie is not gone, it only has
+        # an empty/unreadable cmdline, and we still want a
+        # row built from the fields that do work on it
+        # (pid, ppid, name).
+        try:
+            cmd: str = (
+                ' '.join(proc.cmdline())[:140]
+                or f'[{proc.name()}]'
+            )
+            return _fmt(str(proc.ppid()), proc.status(), cmd)
+        except psutil.ZombieProcess:
+            try:
+                ppid_txt, name = str(proc.ppid()), proc.name()
+            except psutil.NoSuchProcess:
+                ppid_txt = name = '?'
+            return _fmt(ppid_txt, 'zombie', f'[{name} <defunct>]')
+        except psutil.NoSuchProcess:
+            return '  ' + str(proc.pid).rjust(7) + '  (gone mid-walk)'
+
+    def _section(
+        title: str,
+        procs: list,
+        hint: str = '',
+        bucket: str|None = None,
+    ):
+        '''
+        Print one `## <title> (<count>)` section: the
+        heading (plus `hint` when given), then either the
+        column header followed by one `_row()` per entry, or
+        a `(none)` placeholder for an empty bucket.
+
+        `bucket` is forwarded to `_row()` unchanged.
+        '''
+        heading: str = f'\n## {title} ({len(procs)})'
+        if hint:
+            heading += f'  — {hint}'
+        print(heading)
+
+        if procs:
+            print(hdr)
+            for entry in procs:
+                print(_row(entry, bucket=bucket))
+        else:
+            print('  (none)')
+
+    # `--tree` opt-in: emit a flat walk-order section first
+    # so the parent-child tree shape is contiguous (no
+    # severity-grouping). No `bucket` arg → no cross-bucket
+    # annotation, since each parent appears directly above
+    # its children here.
+    if flag_tree:
+        _section(
+            'tree', walk_order,
+            'flat walk-order, parent-child preserved',
+        )
+
+    # severity-ordered: most concerning first. Each section
+    # passes its own `bucket` name so `_row()` can annotate
+    # rows whose parents live in a different section.
+    _section(
+        'zombies', zombies,
+        'status `Z`/`X`, parent has not reaped',
+        bucket='zombies',
+    )
+    _section(
+        'orphans', orphans,
+        '`ppid==1`, NOT in a `system.slice`/`user.slice` cgroup '
+        '(likely leaked / parent gone)',
+        bucket='orphans',
+    )
+    _section(
+        'system-slice', system_slice,
+        '`ppid==1`, rooted under `/system.slice/` '
+        '(real systemd-managed service — daemon, login '
+        'session manager, etc; not a leak)',
+        bucket='system-slice',
+    )
+    _section(
+        'user-slice', user_slice,
+        '`ppid==1`, rooted under `/user.slice/.../*.scope` '
+        '(desktop-launched app wrapped by systemd-user — '
+        'browser, editor, etc; not a leak)',
+        bucket='user-slice',
+    )
+    _section('live', live, bucket='live')
+
+    if gone:
+        print(f'\n## gone-during-walk ({len(gone)}): {gone}')


-# --- hung-dump -----------------------------------------------
+# --- hung-dump ------------------------------------------------

 def _hung_dump(args):
    '''
@ -103,116 +489,248 @@ def _hung_dump(args):
    usage: acli.hung_dump <pid|pgrep-pattern> [...]

    note: `/proc/<pid>/stack` and `py-spy dump` typically
-    require CAP_SYS_PTRACE — invoked via `sudo -n`. If sudo
-    isn't cached this alias prompts (via `sudo -v`); for the
-    non-interactive equivalent see
-    `tractor._testing.trace.dump_hung_state(allow_sudo_prompt=False)`.
-
+    require CAP_SYS_PTRACE — invoked via `sudo -n`. If creds
+    aren't cached, this alias prompts once up front via `sudo -v`.
    '''
    if not args:
        print('usage: acli.hung_dump <pid|pgrep-pattern> [...]')
        return 1

+    # cache sudo creds upfront so per-pid `sudo -n` calls
+    # for `cat /proc/<pid>/stack` and `py-spy dump` don't
+    # each prompt (or silently fail).
+    have_sudo: bool = _ensure_sudo_cached()
+
    roots: list = []
    for a in args:
-        roots.extend(resolve_pids(a))
+        roots.extend(_resolve_pids(a))
    roots = sorted(set(roots))
    if not roots:
        print(f'(no procs match: {args})')
        return 1

-    print(
-        dump_hung_state(roots, allow_sudo_prompt=True),
-        end='',
-    )
+    pids: list = []
+    seen: set = set()
+    for r in roots:
+        if psutil is not None:
+            walk = [p.pid for p in _walk_tree_psutil(r)]
+        else:
+            walk = _walk_tree_pgrep(r)
+        for pid in walk:
+            if pid not in seen:
+                seen.add(pid)
+                pids.append(pid)
+
+    print(f'# tree: {pids}')
+    print('\n## ps forest')
+    $[ps -o pid,ppid,pgid,stat,cmd -p @(','.join(map(str, pids)))]
+
+    for pid in pids:
+        print(f'\n## pid {pid}')
+
+        for f in ('wchan', 'stack'):
+            path = Path(f'/proc/{pid}/{f}')
+            try:
+                txt = path.read_text().rstrip()
+                print(f'-- /proc/{pid}/{f} --\n{txt}')
+            except PermissionError:
+                if not have_sudo:
+                    print(
+                        f'-- /proc/{pid}/{f}: '
+                        'PermissionError (no sudo) --'
+                    )
+                    continue
+                try:
+                    txt = sp.check_output(
+                        ['sudo', '-n', 'cat', str(path)],
+                        text=True,
+                        stderr=sp.DEVNULL,
+                    ).rstrip()
+                    print(f'-- /proc/{pid}/{f} (sudo) --\n{txt}')
+                except sp.CalledProcessError:
+                    print(
+                        f'-- /proc/{pid}/{f}: '
+                        'sudo cred expired? rerun --'
+                    )
+            except FileNotFoundError:
+                print(f'-- /proc/{pid}/{f}: proc gone --')
+
+        print(f'-- py-spy {pid} --')
+        if not have_sudo:
+            print('  (skipped — no sudo)')
+            continue
+        try:
+            $[sudo -n py-spy dump --pid @(pid) --locals]
+        except Exception as e:
+            print(f'  (py-spy failed: {e})')


-# --- bindspace-scan ------------------------------------------
+# --- bindspace-scan -------------------------------------------

 def _bindspace_scan(args):
    '''
-    Scan a tractor UDS bindspace dir for orphan sock files.
+    Scan a tractor UDS bindspace dir for orphan sock files
+    (those whose embedded `<pid>` no longer corresponds to
+    a live process).

    usage: acli.bindspace_scan [<name>|<dir>]

-    See `tractor._testing.trace.scan_bindspace()` for full arg
-    semantics + output-bucket details.
+    - no arg          -> `$XDG_RUNTIME_DIR/tractor`
+                         (or `/run/user/<uid>/tractor`)
+    - bare `<name>`   -> `$XDG_RUNTIME_DIR/<name>`,
+                         for projects like `piker` that bind
+                         their own sibling sub-dir alongside
+                         tractor's default
+    - path (abs or
+      containing `/`) -> use as-is

    '''
-    arg: str | None = args[0] if args else None
-    print(scan_bindspace(arg), end='')
-
-
-# --- dump-all (snapshot bundle) ------------------------------
-
-def _dump_all_alias(args):
-    '''
-    Capture a full diag snapshot bundle for a hung proc-tree
-    into a timestamped directory for offline / AI inspection.
-
-    usage: acli.dump_all <pid|pgrep-pat>
-                        [--label <label>]
-                        [--out-dir <path>]
-
-    Writes:
-      <out_dir>/<label>__<ts>/{trace.txt, bindspace.txt, meta.json}
-
-    Defaults:
-      --label   = `manual`
-      --out-dir = `$XDG_CACHE_HOME/tractor/hung-dumps/`
-                  (fallback `~/.cache/tractor/hung-dumps/`)
-
-    '''
-    import argparse
-    parser = argparse.ArgumentParser(
-        prog='acli.dump_all',
-        description=_dump_all_alias.__doc__,
-        formatter_class=argparse.RawDescriptionHelpFormatter,
+    runtime: str = os.environ.get(
+        'XDG_RUNTIME_DIR',
+        f'/run/user/{os.getuid()}',
    )
-    parser.add_argument(
-        'target',
-        help='pid or pgrep -f pattern',
-    )
-    parser.add_argument(
-        '--label', '-l',
-        default='manual',
-        help='snapshot dir label prefix (default: `manual`)',
-    )
-    parser.add_argument(
-        '--out-dir', '-o',
-        type=Path,
-        default=None,
-        help='snapshot root dir (default: '
-             '$XDG_CACHE_HOME/tractor/hung-dumps/)',
-    )
-    try:
-        ns = parser.parse_args(args)
-    except SystemExit as se:
-        return int(se.code) if se.code is not None else 0
+    if args:
+        arg: str = args[0]
+        if (
+            arg.startswith('/')
+            or
+            '/' in arg
+        ):
+            bs_dir = Path(arg)
+        else:
+            # bare name -> `$XDG_RUNTIME_DIR/<name>` so
+            # callers can say `acli.bindspace_scan piker`
+            bs_dir = Path(runtime) / arg
+    else:
+        bs_dir = Path(runtime) / 'tractor'

-    pids: list = resolve_pids(ns.target)
-    if not pids:
-        print(f'(no procs match: {ns.target})')
+    if not bs_dir.exists():
+        print(f'(no bindspace at {bs_dir})')
        return 1

-    # snapshot scoped to ONE root — pick the first matched
-    # pid. Multi-root snapshots can be done by invoking
-    # `acli.dump_all <pid>` per root.
-    root_pid: int = pids[0]
-    if len(pids) > 1:
-        print(
-            f'[acli.dump_all] {len(pids)} pids matched '
-            f'{ns.target!r}; snapshotting tree from {root_pid} '
-            f'(re-run per-pid for others: {pids[1:]})'
-        )
+    socks = sorted(bs_dir.glob('*.sock'))
+    print(f'## bindspace {bs_dir} ({len(socks)} sock file(s))')

-    dump_dir = _dump_all(
-        root_pid,
-        out_dir=ns.out_dir,
-        label=ns.label,
-        allow_sudo_prompt=True,  # CLI: ok to prompt
+    live_active: list = []  # PID alive AND ppid != 1
+    live_orphaned: list = []  # PID alive AND ppid == 1 (init-adopted)
+    dead_orphans: list = []  # PID gone, sock stale
+    bogus: list = []
+
+    def _ppid(pid: int) -> int | None:
+        '''
+        Look up the parent pid of `pid` from
+        `/proc/<pid>/stat`.
+
+        Returns `None` whenever the file can't be read: the
+        proc died after the `os.kill(pid, 0)` liveness check,
+        access was denied, or there is no procfs (non-linux).
+        '''
+        try:
+            with open(f'/proc/{pid}/stat') as stat_file:
+                raw: str = stat_file.read()
+            # `man 5 proc`: field 2 is `(comm)`, which may
+            # itself contain spaces and parens, so split on
+            # the LAST `)` and count fields from there:
+            # state is index 0, ppid is index 1.
+            fields: list = raw.rsplit(')', 1)[1].split()
+            return int(fields[1])
+        except OSError:
+            # also covers `FileNotFoundError`,
+            # `PermissionError` and `ProcessLookupError`.
+            return None
+
+    for s in socks:
+        m = _UDS_SOCK_RE.match(s.name)
+        if not m:
+            bogus.append(s)
+            continue
+        pid = int(m['pid'])
+        name = m['name']
+        try:
+            os.kill(pid, 0)
+        except ProcessLookupError:
+            dead_orphans.append((s, pid, name))
+            continue
+        except PermissionError:
+            # exists but owned by another user — treat as live-active
+            # (we can't read its /proc/<pid>/stat to check ppid)
+            live_active.append((s, pid, name, None))
+            continue
+
+        # PID is alive in our euid view; classify by ppid
+        ppid: int | None = _ppid(pid)
+        if ppid == 1:
+            # adopted by init -> the original parent reaped
+            # without cleaning up this sub. Same class as
+            # what `acli.reap` detects.
+            live_orphaned.append((s, pid, name, ppid))
+        else:
+            live_active.append((s, pid, name, ppid))
+
+    print(f'\n## live-active ({len(live_active)})  — PID alive, parent still own it')
+    if not live_active:
+        print('  (none)')
+    for s, pid, name, ppid in live_active:
+        row = '  ' + str(pid).rjust(7)
+        row += '  ' + name.ljust(32)
+        row += '  ' + s.name
+        if ppid is not None:
+            row += f'  (ppid={ppid})'
+        print(row)
+
+    print(
+        f'\n## orphaned-alive ({len(live_orphaned)})  '
+        f'— PID alive but `ppid==1`, parent reaped; '
+        f'`acli.reap` candidate'
    )
-    print(f'[acli.dump_all] snapshot written to: {dump_dir}')
+    if not live_orphaned:
+        print('  (none)')
+    for s, pid, name, ppid in live_orphaned:
+        row = '  ' + str(pid).rjust(7)
+        row += '  ' + name.ljust(32)
+        row += '  ' + s.name + '  (adopted by init)'
+        print(row)
+
+    print(f'\n## orphaned-dead ({len(dead_orphans)})  — PID gone, sock stale')
+    if not dead_orphans:
+        print('  (none)')
+    for s, pid, name in dead_orphans:
+        row = '  ' + str(pid).rjust(7)
+        row += '  ' + name.ljust(32)
+        row += '  ' + s.name + '  (no live proc)'
+        print(row)
+
+    if bogus:
+        print(
+            f'\n## non-tractor ({len(bogus)})  '
+            f'— filename lacks `@<pid>` suffix, '
+            f'cannot determine liveness intrinsically'
+        )
+        for s in bogus:
+            print(f'  {s.name}')
+        # show a copy-pastable `ss` cmd per sock so the
+        # caller can resolve listener-PID externally
+        # (e.g. for piker's `chart.sock` / `pikerd.sock`
+        # style flat names). `ss -lpx 'src = <path>'`
+        # prints `users:(("<proc>",pid=<N>,fd=<M>))` for
+        # the listening side; empty output -> nobody's
+        # listening -> safe to unlink.
+        print(
+            '\nto check liveness manually '
+            '(needs `iproute2`/`ss`):'
+        )
+        for s in bogus:
+            print(f"  ss -lpx 'src = {s}'")
+
+    if dead_orphans or live_orphaned:
+        print(
+            '\nto sweep BOTH orphaned-alive subs (graceful '
+            'SIGINT -> SIGKILL) AND dead-orphan socks in one shot:'
+        )
+        print('  acli.reap --uds')
+
+    if dead_orphans:
+        unlink_cmd = ' '.join(str(o[0]) for o in dead_orphans)
+        print(
+            '\n(or to unlink dead-orphan socks manually, '
+            "skipping `acli.reap`'s graceful-cancel ladder:)"
+        )
+        print(f'  rm {unlink_cmd}')


 # --- acli.reap ------------------------------------------------
@ -398,7 +916,6 @@ _TCLI_ALIASES: dict = {
    'acli.ptree': _ptree,
    'acli.hung_dump': _hung_dump,
    'acli.bindspace_scan': _bindspace_scan,
-    'acli.dump_all': _dump_all_alias,
    'acli.reap': _tractor_reap,
 }