Add stray-proc scan + refine `_testing.trace` capture

Details:
- `_find_tractor_strays()`: scan `/proc/*/cmdline` for
  `tractor._child` procs NOT in the walk's `seen` set — surfaces
  ghost subactor trees from prior test runs (cross-test launchpad
  contamination).
- `dump_proc_tree(include_strays=True)`: refactor classification
  into `_classify_walk()` closure, walk stray roots as additional
  trees, emit stray-root summary in header. Also: `tractor._child`
  procs reparented to init are now always classified as orphans
  regardless of cgroup-slice (leaked subactor ≠ desktop-launched
  app).
- `_do_capture_snapshot()`: use `sys.__stderr__` to bypass pytest
  `--capture=sys` redirection so snapshot paths always land on the
  real terminal.
- `fail_after_w_trace()`: capture diag snapshot on
  non-`TooSlowError` exceptions when the `fail_after` scope's
  cancel had already fired (e.g. nursery wraps `Cancelled` into a
  `BaseExceptionGroup` that escapes before `TooSlowError` can be
  raised).

(this patch was generated in some part by [`claude-code`][claude-code-gh])
[claude-code-gh]: https://github.com/anthropics/claude-code
subint_forkserver_backend
Gud Boi 2026-05-13 18:46:04 -04:00
parent 7509e313ff
commit 3a243a1fd4
1 changed files with 181 additions and 48 deletions

View File

@ -220,6 +220,47 @@ def _which_cgroup_slice(pid: int) -> str | None:
return None return None
def _find_tractor_strays(seen: set[int]) -> list[int]:
    '''
    Collect the pids of tractor subactor procs that the caller has
    NOT already visited.

    Every numeric entry under `/proc` is treated as a candidate
    pid; candidates already present in `seen` are skipped and the
    remainder are kept only when `_reap._is_tractor_subactor()`
    accepts them. The actual proctitle match (`tractor._child` /
    `tractor[<aid>]` via cmdline, with `/comm` as the zombie-safe
    fallback) is delegated entirely to that helper so detection
    stays in lock-step with the reaper's own definition.

    Intended for `dump_proc_tree(include_strays=True)`, which uses
    the result to surface ghost subactor trees left behind by
    PRIOR test runs, i.e. procs that are not descendants of the
    snapshot's root pid (typically the pytest worker) and which
    tend to cause cross-test launchpad contamination, e.g.
    orphaned `tractor._child` procs still squatting on UDS
    bindspace after a hung-then-killed pytest invocation.

    Parameters
    ----------
    seen:
        pids already covered by the caller's walk; never reported.

    Returns
    -------
    The matching pids in ascending order; an empty list when
    `/proc` is not a directory. What to do with them is left to
    the caller (typically: walk each one's subtree as an extra
    root and let the normal zombie/orphan/live classification
    handle it).

    '''
    # deferred to call-time to dodge a module-import cycle:
    # `_reap.py` is a pytest plugin that imports from this
    # module's siblings.
    from ._reap import _is_tractor_subactor

    procfs = Path('/proc')
    if not procfs.is_dir():
        return []

    # lazily map the numeric `/proc/<pid>` dir names to ints; all
    # non-pid entries (`self`, `meminfo`, ..) are dropped here.
    candidate_pids = (
        int(child.name)
        for child in procfs.iterdir()
        if child.name.isdigit()
    )
    # the `seen` test comes first so the (more expensive) marker
    # check only ever runs on pids the caller hasn't visited.
    return sorted(
        candidate
        for candidate in candidate_pids
        if candidate not in seen
        and _is_tractor_subactor(candidate)
    )
def _ppid_from_proc(pid: int) -> int | None: def _ppid_from_proc(pid: int) -> int | None:
''' '''
Read `ppid` from `/proc/<pid>/stat`. Returns None on race Read `ppid` from `/proc/<pid>/stat`. Returns None on race
@ -295,6 +336,7 @@ def dump_proc_tree(
roots: list[int], roots: list[int],
*, *,
flag_tree: bool = False, flag_tree: bool = False,
include_strays: bool = True,
) -> str: ) -> str:
''' '''
Severity-classified proc-tree rendering of `roots` and Severity-classified proc-tree rendering of `roots` and
@ -310,6 +352,14 @@ def dump_proc_tree(
`flag_tree=True` additionally prepends a flat walk-order `flag_tree=True` additionally prepends a flat walk-order
`## tree` section preserving parent-child shape. `## tree` section preserving parent-child shape.
`include_strays=True` (default) additionally scans
`/proc/*/cmdline` for `tractor._child` / `tractor[<aid>]`
procs that are NOT descendants of any provided root — these
are typically ghost subactor trees from PRIOR test runs
(cross-test launchpad contamination). Their subtrees are
walked and classified normally; the bucket counts then
include them. See `_find_tractor_strays()`.
''' '''
buf = StringIO() buf = StringIO()
@ -339,36 +389,64 @@ def dump_proc_tree(
gone: list = [] gone: list = []
pid_to_bucket: dict = {} pid_to_bucket: dict = {}
for r in roots: # lazy-imported, used to override cgroup-slice classification
for (p, depth) in _walk_tree_with_depth(r): # for `tractor._child` strays (they're orphans regardless of
if p.pid in seen: # whether they happen to be in the user.slice / system.slice
continue # cgroup — `desktop-launched app` is the *wrong* read for a
seen.add(p.pid) # leaked subactor that just happens to inherit user-session
try: # cgroup membership from its now-dead parent).
status: str = p.status() from ._reap import _is_tractor_subactor
ppid: int = p.ppid()
except psutil.NoSuchProcess: def _classify_walk(walk_roots: list[int]) -> None:
gone.append(p.pid) '''Walk + classify into the closure-shared bucket lists.'''
continue for r in walk_roots:
entry = (p, depth) for (p, depth) in _walk_tree_with_depth(r):
if status in defunct_statuses: if p.pid in seen:
zombies.append(entry) continue
pid_to_bucket[p.pid] = 'zombies' seen.add(p.pid)
elif ppid == 1: try:
slice_kind: str | None = _which_cgroup_slice(p.pid) status: str = p.status()
if slice_kind == 'system': ppid: int = p.ppid()
system_slice.append(entry) except psutil.NoSuchProcess:
pid_to_bucket[p.pid] = 'system-slice' gone.append(p.pid)
elif slice_kind == 'user': continue
user_slice.append(entry) entry = (p, depth)
pid_to_bucket[p.pid] = 'user-slice' if status in defunct_statuses:
zombies.append(entry)
pid_to_bucket[p.pid] = 'zombies'
elif ppid == 1:
# `tractor._child` procs reparented to init are
# leaked subactors regardless of cgroup-slice —
# short-circuit to `orphans` before falling back
# to the systemd-slice categorization (which is
# only meaningful for NON-tractor procs).
if _is_tractor_subactor(p.pid):
orphans.append(entry)
pid_to_bucket[p.pid] = 'orphans'
else:
slice_kind: str | None = _which_cgroup_slice(p.pid)
if slice_kind == 'system':
system_slice.append(entry)
pid_to_bucket[p.pid] = 'system-slice'
elif slice_kind == 'user':
user_slice.append(entry)
pid_to_bucket[p.pid] = 'user-slice'
else:
orphans.append(entry)
pid_to_bucket[p.pid] = 'orphans'
else: else:
orphans.append(entry) live.append(entry)
pid_to_bucket[p.pid] = 'orphans' pid_to_bucket[p.pid] = 'live'
else: walk_order.append(entry)
live.append(entry)
pid_to_bucket[p.pid] = 'live' _classify_walk(roots)
walk_order.append(entry) explicit_seen: set = set(seen)
stray_roots: list[int] = []
if include_strays:
stray_roots = _find_tractor_strays(seen)
if stray_roots:
_classify_walk(stray_roots)
total: int = ( total: int = (
len(live) len(live)
@ -378,6 +456,16 @@ def dump_proc_tree(
+ len(zombies) + len(zombies)
) )
echo(f'# ptree: {total} procs across roots {roots}') echo(f'# ptree: {total} procs across roots {roots}')
if stray_roots:
n_stray_proc: int = len(seen) - len(explicit_seen)
echo(
f'# + {n_stray_proc} `tractor._child` stray proc(s) '
f'NOT descendants of {roots} '
f'(likely cross-test ghosts; see bindspace dump for '
f'their UDS sock state):'
)
for sr in stray_roots:
echo(f'# stray-root: {sr}')
hdr: str = ( hdr: str = (
' ' + 'PID'.rjust(7) ' ' + 'PID'.rjust(7)
@ -472,8 +560,9 @@ def dump_proc_tree(
) )
_section( _section(
'orphans', orphans, 'orphans', orphans,
'`ppid==1`, NOT in a `system.slice`/`user.slice` cgroup ' '`ppid==1` + leaked: either NOT in a `system.slice`/'
'(likely leaked / parent gone)', '`user.slice` cgroup, OR a known `tractor._child` '
'proc (leaked subactor, regardless of cgroup-slice)',
bucket='orphans', bucket='orphans',
) )
_section( _section(
@ -928,6 +1017,17 @@ def _do_capture_snapshot(
''' '''
target_pid: int = pid if pid is not None else os.getpid() target_pid: int = pid if pid is not None else os.getpid()
# NOTE: print to `sys.__stderr__` (the ORIGINAL unredirected
# stderr) rather than `sys.stderr` so the snapshot-path message
# bypasses pytest's `--capture=sys` redirection. Under pytest
# xfailed/passed tests have their captured streams SUPPRESSED
# entirely (and `--show-capture` only affects FAILED tests),
# so writing to `sys.stderr` would hide the diag info from the
# human running the suite. `__stderr__` is the pre-capture fd,
# always lands on the real terminal. Outside pytest (e.g. the
# xontrib CLI), `sys.__stderr__ is sys.stderr` so no difference.
import sys
try: try:
dump_dir: Path = dump_all( dump_dir: Path = dump_all(
target_pid, target_pid,
@ -940,21 +1040,19 @@ def _do_capture_snapshot(
allow_sudo_prompt=False, allow_sudo_prompt=False,
) )
except Exception as e: except Exception as e:
import sys
print( print(
f'[{timeout_kind}_w_trace] ' f'[{timeout_kind}_w_trace] '
f'⚠️ dump_all() failed: {e!r} ' f'⚠️ dump_all() failed: {e!r} '
f'(label={label!r}, pid={target_pid})', f'(label={label!r}, pid={target_pid})',
file=sys.stderr, file=sys.__stderr__,
) )
return None return None
import sys
print( print(
f'[{timeout_kind}_w_trace] ' f'[{timeout_kind}_w_trace] '
f'⏰ timed out after {seconds}s (label={label!r}, ' f'⏰ timed out after {seconds}s (label={label!r}, '
f'pid={target_pid}); snapshot at: {dump_dir}', f'pid={target_pid}); snapshot at: {dump_dir}',
file=sys.stderr, file=sys.__stderr__,
) )
return dump_dir return dump_dir
@ -986,10 +1084,21 @@ async def fail_after_w_trace(
snapshot parent dir. Defaults to snapshot parent dir. Defaults to
`$XDG_CACHE_HOME/tractor/hung-dumps/`. `$XDG_CACHE_HOME/tractor/hung-dumps/`.
On `trio.TooSlowError`: Snapshot is taken in EITHER of two cases:
1. Capture `dump_all()` (best-effort; failure is logged 1. `trio.fail_after` raises `TooSlowError` at scope-
to stderr but doesn't mask the original exception). exit (body returned cleanly but past the deadline).
2. Re-raise so the test fails normally. 2. The body raised a non-`TooSlowError` exception AFTER
our scope's cancel had been triggered — e.g. an
`open_nursery.__aexit__` wraps the timeout-induced
`Cancelled` into a `BaseExceptionGroup` and that
BEG escapes BEFORE `trio.fail_after`'s exit-check
can raise `TooSlowError`. Without this branch the
BEG would propagate untouched and no diag would be
captured.
The captured dump is best-effort (failure is logged to
stderr but doesn't mask the original exception). The
original exception always propagates.
Example Example
------- -------
@ -1006,17 +1115,41 @@ async def fail_after_w_trace(
# importable from a plain-python REPL. # importable from a plain-python REPL.
import trio import trio
captured: bool = False
try: try:
with trio.fail_after(seconds): with trio.fail_after(seconds) as scope:
yield try:
yield
except BaseException:
# Body raised. If our `fail_after`'s scope had
# already cancelled (e.g. deadline hit and a
# nursery `__aexit__` wrapped the resulting
# `Cancelled` into a `BaseExceptionGroup`), the
# body's exc is downstream of OUR timeout —
# capture diag now since `trio.fail_after`'s
# `TooSlowError` re-raise won't fire when a
# different exc is in flight.
if scope.cancel_called:
_do_capture_snapshot(
label=label,
pid=pid,
out_dir=out_dir,
seconds=seconds,
timeout_kind='fail_after',
)
captured = True
raise
except trio.TooSlowError: except trio.TooSlowError:
_do_capture_snapshot( # Body finished without raising; `fail_after`'s exit-
label=label, # check fired `TooSlowError`.
pid=pid, if not captured:
out_dir=out_dir, _do_capture_snapshot(
seconds=seconds, label=label,
timeout_kind='fail_after', pid=pid,
) out_dir=out_dir,
seconds=seconds,
timeout_kind='fail_after',
)
raise raise