Add stray-proc scan + refine `_testing.trace` capture

Details:
- `_find_tractor_strays()`: scan `/proc/*/cmdline` for
  `tractor._child` procs NOT in the walk's `seen` set — surfaces
  ghost subactor trees from prior test runs (cross-test launchpad
  contamination).
- `dump_proc_tree(include_strays=True)`: refactor classification
  into `_classify_walk()` closure, walk stray roots as additional
  trees, emit stray-root summary in header. Also: `tractor._child`
  procs reparented to init are now always classified as orphans
  regardless of cgroup-slice (leaked subactor ≠ desktop-launched
  app).
- `_do_capture_snapshot()`: use `sys.__stderr__` to bypass pytest
  `--capture=sys` redirection so snapshot paths always land on the
  real terminal.
- `fail_after_w_trace()`: capture diag snapshot on
  non-`TooSlowError` exceptions when the `fail_after` scope's
  cancel had already fired (e.g. nursery wraps `Cancelled` into a
  `BaseExceptionGroup` that escapes before `TooSlowError` can be
  raised).

(this patch was generated in some part by [`claude-code`][claude-code-gh])
[claude-code-gh]: https://github.com/anthropics/claude-code
subint_forkserver_backend
Gud Boi 2026-05-13 18:46:04 -04:00
parent 7509e313ff
commit 3a243a1fd4
1 changed files with 181 additions and 48 deletions

View File

@ -220,6 +220,47 @@ def _which_cgroup_slice(pid: int) -> str | None:
return None
def _find_tractor_strays(seen: set[int]) -> list[int]:
    '''
    Return, in ascending order, the pids of tractor subactor
    procs which are absent from the caller's `seen` set.

    Each all-digit entry under `/proc` is treated as a pid; any
    pid not already in `seen` is tested with
    `_reap._is_tractor_subactor()` and collected on a match. That
    helper owns the intrinsic-marker test (presumably the
    `tractor._child` / `tractor[<aid>]` proctitle match against
    `/proc/<pid>/cmdline`, with `/comm` as a zombie-safe
    fallback; see `_reap` to confirm), so detection here stays in
    lock-step with the reaper's own definition.

    `dump_proc_tree(include_strays=True)` uses the result to
    surface ghost subactor trees left behind by PRIOR test runs,
    i.e. procs which are not descendants of the snapshot's root
    pid (typically the pytest worker). Such leftovers are usually
    the source of cross-test launchpad contamination, e.g.
    orphaned `tractor._child` procs still squatting on UDS
    bindspace after a hung-then-killed pytest invocation.

    Only pids are returned; the caller decides what to do with
    them (typically: walk each one as an additional root and let
    the existing zombie/orphan/live classification handle it).
    An empty list is returned when `/proc` is not a directory.

    '''
    # imported lazily to dodge a module-import cycle: `_reap.py`
    # is a pytest plugin which itself imports from this module's
    # siblings.
    from ._reap import _is_tractor_subactor

    procfs = Path('/proc')
    if not procfs.is_dir():
        return []

    # only purely-numeric dir names under `/proc` are pids.
    candidate_pids = (
        int(child.name)
        for child in procfs.iterdir()
        if child.name.isdigit()
    )
    # skip anything the caller already walked *before* paying for
    # the per-pid subactor probe.
    return sorted(
        candidate
        for candidate in candidate_pids
        if candidate not in seen
        and _is_tractor_subactor(candidate)
    )
def _ppid_from_proc(pid: int) -> int | None:
'''
Read `ppid` from `/proc/<pid>/stat`. Returns None on race
@ -295,6 +336,7 @@ def dump_proc_tree(
roots: list[int],
*,
flag_tree: bool = False,
include_strays: bool = True,
) -> str:
'''
Severity-classified proc-tree rendering of `roots` and
@ -310,6 +352,14 @@ def dump_proc_tree(
`flag_tree=True` additionally prepends a flat walk-order
`## tree` section preserving parent-child shape.
`include_strays=True` (default) additionally scans
`/proc/*/cmdline` for `tractor._child` / `tractor[<aid>]`
procs that are NOT descendants of any provided root; these
are typically ghost subactor trees from PRIOR test runs
(cross-test launchpad contamination). Their subtrees are
walked and classified normally; the bucket counts then
include them. See `_find_tractor_strays()`.
'''
buf = StringIO()
@ -339,7 +389,17 @@ def dump_proc_tree(
gone: list = []
pid_to_bucket: dict = {}
for r in roots:
# lazy-imported, used to override cgroup-slice classification
# for `tractor._child` strays (they're orphans regardless of
# whether they happen to be in the user.slice / system.slice
# cgroup — `desktop-launched app` is the *wrong* read for a
# leaked subactor that just happens to inherit user-session
# cgroup membership from its now-dead parent).
from ._reap import _is_tractor_subactor
def _classify_walk(walk_roots: list[int]) -> None:
'''Walk + classify into the closure-shared bucket lists.'''
for r in walk_roots:
for (p, depth) in _walk_tree_with_depth(r):
if p.pid in seen:
continue
@ -355,6 +415,15 @@ def dump_proc_tree(
zombies.append(entry)
pid_to_bucket[p.pid] = 'zombies'
elif ppid == 1:
# `tractor._child` procs reparented to init are
# leaked subactors regardless of cgroup-slice —
# short-circuit to `orphans` before falling back
# to the systemd-slice categorization (which is
# only meaningful for NON-tractor procs).
if _is_tractor_subactor(p.pid):
orphans.append(entry)
pid_to_bucket[p.pid] = 'orphans'
else:
slice_kind: str | None = _which_cgroup_slice(p.pid)
if slice_kind == 'system':
system_slice.append(entry)
@ -370,6 +439,15 @@ def dump_proc_tree(
pid_to_bucket[p.pid] = 'live'
walk_order.append(entry)
_classify_walk(roots)
explicit_seen: set = set(seen)
stray_roots: list[int] = []
if include_strays:
stray_roots = _find_tractor_strays(seen)
if stray_roots:
_classify_walk(stray_roots)
total: int = (
len(live)
+ len(orphans)
@ -378,6 +456,16 @@ def dump_proc_tree(
+ len(zombies)
)
echo(f'# ptree: {total} procs across roots {roots}')
if stray_roots:
n_stray_proc: int = len(seen) - len(explicit_seen)
echo(
f'# + {n_stray_proc} `tractor._child` stray proc(s) '
f'NOT descendants of {roots} '
f'(likely cross-test ghosts; see bindspace dump for '
f'their UDS sock state):'
)
for sr in stray_roots:
echo(f'# stray-root: {sr}')
hdr: str = (
' ' + 'PID'.rjust(7)
@ -472,8 +560,9 @@ def dump_proc_tree(
)
_section(
'orphans', orphans,
'`ppid==1`, NOT in a `system.slice`/`user.slice` cgroup '
'(likely leaked / parent gone)',
'`ppid==1` + leaked: either NOT in a `system.slice`/'
'`user.slice` cgroup, OR a known `tractor._child` '
'proc (leaked subactor, regardless of cgroup-slice)',
bucket='orphans',
)
_section(
@ -928,6 +1017,17 @@ def _do_capture_snapshot(
'''
target_pid: int = pid if pid is not None else os.getpid()
# NOTE: print to `sys.__stderr__` (the ORIGINAL unredirected
# stderr) rather than `sys.stderr` so the snapshot-path message
# bypasses pytest's `--capture=sys` redirection. Under pytest,
# xfailed/passed tests have their captured streams SUPPRESSED
# entirely (and `--show-capture` only affects FAILED tests),
# so writing to `sys.stderr` would hide the diag info from the
# human running the suite. `__stderr__` is the pre-capture fd,
# always lands on the real terminal. Outside pytest (e.g. the
# xontrib CLI), `sys.__stderr__ is sys.stderr` so no difference.
import sys
try:
dump_dir: Path = dump_all(
target_pid,
@ -940,21 +1040,19 @@ def _do_capture_snapshot(
allow_sudo_prompt=False,
)
except Exception as e:
import sys
print(
f'[{timeout_kind}_w_trace] '
f'⚠️ dump_all() failed: {e!r} '
f'(label={label!r}, pid={target_pid})',
file=sys.stderr,
file=sys.__stderr__,
)
return None
import sys
print(
f'[{timeout_kind}_w_trace] '
f'⏰ timed out after {seconds}s (label={label!r}, '
f'pid={target_pid}); snapshot at: {dump_dir}',
file=sys.stderr,
file=sys.__stderr__,
)
return dump_dir
@ -986,10 +1084,21 @@ async def fail_after_w_trace(
snapshot parent dir. Defaults to
`$XDG_CACHE_HOME/tractor/hung-dumps/`.
On `trio.TooSlowError`:
1. Capture `dump_all()` (best-effort; failure is logged
to stderr but doesn't mask the original exception).
2. Re-raise so the test fails normally.
Snapshot is taken in EITHER of two cases:
1. `trio.fail_after` raises `TooSlowError` at scope-
exit (body returned cleanly but past the deadline).
2. The body raised a non-`TooSlowError` exception AFTER
our scope's cancel had been triggered — e.g. an
`open_nursery.__aexit__` wraps the timeout-induced
`Cancelled` into a `BaseExceptionGroup` and that
BEG escapes BEFORE `trio.fail_after`'s exit-check
can raise `TooSlowError`. Without this branch the
BEG would propagate untouched and no diag would be
captured.
The captured dump is best-effort (failure is logged to
stderr but doesn't mask the original exception). The
original exception always propagates.
Example
-------
@ -1006,10 +1115,34 @@ async def fail_after_w_trace(
# importable from a plain-python REPL.
import trio
captured: bool = False
try:
with trio.fail_after(seconds) as scope:
try:
with trio.fail_after(seconds):
yield
except BaseException:
# Body raised. If our `fail_after`'s scope had
# already cancelled (e.g. deadline hit and a
# nursery `__aexit__` wrapped the resulting
# `Cancelled` into a `BaseExceptionGroup`), the
# body's exc is downstream of OUR timeout —
# capture diag now since `trio.fail_after`'s
# `TooSlowError` re-raise won't fire when a
# different exc is in flight.
if scope.cancel_called:
_do_capture_snapshot(
label=label,
pid=pid,
out_dir=out_dir,
seconds=seconds,
timeout_kind='fail_after',
)
captured = True
raise
except trio.TooSlowError:
# Body finished without raising; `fail_after`'s exit-
# check fired `TooSlowError`.
if not captured:
_do_capture_snapshot(
label=label,
pid=pid,