From 4e56ee11cbf034a0bb1e8b4c9975f61bc8f71bf6 Mon Sep 17 00:00:00 2001 From: goodboy Date: Tue, 9 Jun 2026 23:32:48 -0400 Subject: [PATCH] Sync `tractor_diag.xsh` move-out + `_reap` doc wording Complete the xontrib-side slimming from "Mv core impl `tractor_diag.xsh` to `_testing.trace`" (its xontrib hunk couldn't ride with the testing-harness segment since the xontrib lands in this one) and sync a couple `_reap.py` issue/doc-ref comment strings to their final form. (this commit msg was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code --- tractor/_testing/_reap.py | 15 +- xontrib/tractor_diag.xsh | 732 ++++++-------------------------------- 2 files changed, 116 insertions(+), 631 deletions(-) diff --git a/tractor/_testing/_reap.py b/tractor/_testing/_reap.py index 9c74a902..fa4df944 100644 --- a/tractor/_testing/_reap.py +++ b/tractor/_testing/_reap.py @@ -313,7 +313,8 @@ def find_runaway_subactors( `cpu_threshold` (default 95%) — the smoking-gun signature of a runaway tight-loop bug (e.g. a C-level `recvfrom` loop on a closed socket that missed EOF - detection; #452-class issue). + detection — see + `ai/conc-anal/trio_wakeup_socketpair_busy_loop_under_fork_issue.md`). `cpu_percent(interval=sample_interval)` is the canonical psutil API for a "what %CPU is this proc @@ -945,7 +946,7 @@ def track_orphaned_uds_per_test(): `wait_for_actor`/`find_actor` discovery probes can accidentally hit (FileExistsError on rebind, or epoll register on a half-closed peer-FIN'd fd → see - issue #452). Catching the leak the test that caused + issue #454). Catching the leak the test that caused it (vs. blanket session-end sweep) makes blame obvious + prevents cascade flakiness. @@ -1027,10 +1028,12 @@ def detect_runaway_subactors_per_test(): **Does NOT kill the runaway** — by design. The point of this fixture is to make tight-loop bugs (e.g. C-level `recvfrom` loop on a closed socket - that missed EOF detection — issue #452-class) loudly - visible AT the test that triggers, while keeping - the live pid available for hands-on diagnosis. The - session-end `_reap_orphaned_subactors` fixture will + that missed EOF detection — see + `ai/conc-anal/trio_wakeup_socketpair_busy_loop_under_fork_issue.md`) + loudly visible AT the test that triggers, while + keeping the live pid available for hands-on + diagnosis. The session-end + `_reap_orphaned_subactors` fixture will SIGINT-then-SIGKILL any survivors when the test session completes normally; if the user Ctrl-C's pytest mid-warning, the pid stays alive for as long diff --git a/xontrib/tractor_diag.xsh b/xontrib/tractor_diag.xsh index 38e0ab92..37f97a7c 100644 --- a/xontrib/tractor_diag.xsh +++ b/xontrib/tractor_diag.xsh @@ -6,7 +6,7 @@ prefix-completion treats them as a sub-cmd group — type `acli.` to see the full set. Provides: - - `acli.ptree ` psutil-backed proc tree, + - `acli.ptree ` psutil-backed proc tree, live + zombies split. - `acli.hung_dump [...]` kernel `wchan`/`stack` + `py-spy dump` (incl `--locals`) @@ -17,6 +17,10 @@ Provides: (e.g. `piker`, `tractor`); path -> use as-is. default: `$XDG_RUNTIME_DIR/tractor`. + - `acli.dump_all [--out-dir] full snapshot bundle — + [--label]` ptree + hung_dump + bindspace + written to a timestamped dir + for sharing / AI introspection. - `acli.reap [opts]` SC-polite zombie-subactor reaper + optional `/dev/shm/` + UDS sock-file sweeps. @@ -34,7 +38,12 @@ Or source directly: source ./xontrib/tractor_diag.xsh Pipe-to-paste idiom (xonsh): - hung-dump pytest |t /tmp/hung.log + acli.hung_dump pytest |t /tmp/hung.log + +The diagnostic core lives in `tractor._testing.trace` so it +can also be invoked from inside pytest tests (e.g. via +`fail_after_w_trace` / `afk_alarm_w_trace` capture-on-hang +helpers) — these aliases are just thin terminal wrappers. Requires `psutil` for full functionality (`ptree` and the `hung_dump` tree-walk). Falls back to `pgrep -P` recursion if @@ -50,25 +59,14 @@ from typing import ( ) -import os -import re -import subprocess as sp from pathlib import Path -try: - import psutil -except ImportError: - psutil = None - print( - '[tractor-diag] `psutil` missing — ' - 'acli.ptree disabled, acli.hung_dump uses pgrep fallback. ' - '`uv pip install psutil` for full functionality.' - ) - - -# matches tractor's UDS sock naming: `@.sock` -_UDS_SOCK_RE = re.compile( - r'^(?P.+)@(?P\d+)\.sock$' +from tractor._testing.trace import ( + dump_all as _dump_all, + dump_hung_state, + dump_proc_tree, + resolve_pids, + scan_bindspace, ) @aliases.unthreadable @@ -216,159 +214,7 @@ def watch( return 0 -# --- helpers -------------------------------------------------- - -def _resolve_pids(arg: str) -> list: - '''Resolve a numeric pid OR a `pgrep -f` pattern.''' - if arg.isdigit(): - return [int(arg)] - try: - out = sp.check_output( - ['pgrep', '-f', arg], - text=True, - ) - except sp.CalledProcessError: - return [] - return [int(p) for p in out.split() if p] - - -def _walk_tree_psutil(pid: int) -> list: - '''Flat list `[Process, *descendants]` via psutil.''' - try: - p = psutil.Process(pid) - except psutil.NoSuchProcess: - return [] - return [p] + p.children(recursive=True) - - -def _walk_tree_with_depth(pid: int): - ''' - Yield `(proc, depth)` pairs walking `pid`'s tree. `depth==0` - is the root; `depth==1` are direct children, etc. Used by - `ptree` to render parent/child relationships visually. - ''' - try: - root = psutil.Process(pid) - except psutil.NoSuchProcess: - return - yield root, 0 - stack: list = [(root, 0)] - seen: set = {pid} - while stack: - parent, d = stack.pop() - try: - kids = parent.children() - except psutil.NoSuchProcess: - continue - for k in kids: - if k.pid in seen: - continue - seen.add(k.pid) - yield k, d + 1 - stack.append((k, d + 1)) - - -def _which_cgroup_slice(pid: int) -> str|None: - ''' - Return which top-level systemd cgroup slice `pid` is - rooted in, or `None` if it's not in either: - - - `'system'`: under `/system.slice/...` — typically - `.service` units (long-lived daemons explicitly - enabled via `systemctl enable`, e.g. - `auto-cpufreq.service`, `dbus.service`, - `systemd-journald.service`). - - - `'user'`: under `/user.slice/user-.slice/...` - — typically `.scope` units that systemd auto-wraps - around desktop-launched apps + login-session - procs (e.g. `app-firefox-.scope`, - `session-.scope`). - - - `None`: NOT in either slice — pid 1 is NOT - managing this proc via cgroup. Combined with - `ppid==1`, this is the genuine "leaked / parent - died" orphan signal. - - Both slice categories are by-design `ppid==1` (pid 1 - is actively managing them) and should NOT be flagged - as concerning orphans, but distinguishing them is - useful: `system.slice` is "real services on this - box", `user.slice` is "stuff in your login session". - - Returns `None` on any read error (proc gone, perm - denied, non-Linux, etc.) — callers should treat that - as "unknown, classify as plain orphan". - - ''' - try: - with open(f'/proc/{pid}/cgroup') as f: - cg: str = f.read() - except ( - FileNotFoundError, - PermissionError, - ProcessLookupError, - OSError, - ): - return None - if '/system.slice/' in cg: - return 'system' - if '/user.slice/' in cg: - return 'user' - return None - - -def _walk_tree_pgrep(pid: int) -> list: - '''psutil-less fallback — recursive `pgrep -P`.''' - out = [pid] - try: - kids = sp.check_output( - ['pgrep', '-P', str(pid)], - text=True, - ).split() - except sp.CalledProcessError: - return out - for k in kids: - out.extend(_walk_tree_pgrep(int(k))) - return out - - -def _ensure_sudo_cached() -> bool: - ''' - Ensure `sudo` credentials are cached so subsequent - `sudo -n` calls succeed without prompting. - - Returns True if cached (or successfully refreshed), - False if user cancelled or sudo is unavailable. - - Tries `sudo -n true` first as a no-op probe; if that - fails, runs `sudo -v` which prompts interactively to - validate/refresh the credential timestamp. - ''' - # probe — already cached? - cached = sp.run( - ['sudo', '-n', 'true'], - capture_output=True, - ).returncode == 0 - if cached: - return True - - print( - '[tractor-diag] needs `sudo` for /proc//stack ' - 'and `py-spy dump`; caching creds via `sudo -v`...' - ) - try: - rc = sp.run(['sudo', '-v']).returncode - except KeyboardInterrupt: - print(' cancelled — proceeding without sudo') - return False - except FileNotFoundError: - print(' sudo not on PATH — proceeding without sudo') - return False - return rc == 0 - - -# --- ptree --------------------------------------------------- +# --- ptree ---------------------------------------------------- def _ptree( args: list[str], @@ -380,36 +226,8 @@ def _ptree( usage: acli.ptree [--tree|-t] [...] - classification (per-proc, not per-tree): - - - zombies: `status in (Z, X)` — defunct, parent - hasn't reaped (or kernel-marked dead). - - orphans: `ppid == 1` — original parent exited; - has been reparented to init. Includes - the *root* of an abandoned tree AND - any descendant that ended up reparented - to init mid-flight. - - live: real parent (`ppid > 1`), non-defunct. - - Trees of orphan roots are still walked — their - descendants show as `live` if they themselves still - have a real (non-init) parent (the orphan root), but - the orphan root itself appears in `orphans`. - - Cross-bucket parent annotation (always emitted): - when a row's parent (by ppid) lives in a *different* - severity bucket, the row is suffixed with - `[parent: (in ``)]` so the visual - `└─` marker still resolves to a findable parent - even when bucketing scatters parent and child into - separate sections. - - `--tree` / `-t` flag (opt-in): - additionally emit a flat walk-order `## tree` - section at the top — a contiguous parent-child - tree shape with no severity-grouping. Same procs, - no annotations needed because each parent appears - directly above its children. + See `tractor._testing.trace.dump_proc_tree()` for the + bucket semantics + classification details. To watch this live with flicker-free repaint (alt-screen, per-line EL, SIGWINCH-aware): @@ -430,224 +248,19 @@ def _ptree( if not pos_args: print('usage: acli.ptree [--tree|-t] [...]') return 1 - if psutil is None: - print('ptree requires psutil; install via `uv pip install psutil`') - return 1 roots: list = [] for a in pos_args: - roots.extend(_resolve_pids(a)) + roots.extend(resolve_pids(a)) roots = sorted(set(roots)) if not roots: print(f'(no procs match: {pos_args})') return 1 - # statuses considered "defunct" — STATUS_ZOMBIE is the - # common case (`Z`); STATUS_DEAD (`X`) is rarer but kernel- - # reported and equally not-coming-back. - defunct_statuses: set = { - psutil.STATUS_ZOMBIE, - getattr(psutil, 'STATUS_DEAD', 'dead'), - } - - seen: set = set() - walk_order: list = [] # [(proc, depth)] preserved walk order - live: list = [] # [(proc, depth)] - orphans: list = [] - # `ppid==1` AND rooted in `/system.slice/` cgroup — - # real systemd-managed services (e.g. `auto-cpufreq`, - # `NetworkManager`). - system_slice: list = [] - # `ppid==1` AND rooted in `/user.slice/.../*.scope` — - # desktop-launched apps wrapped by systemd-user in - # transient `.scope` units (e.g. Firefox, browsers, - # editors started from a launcher). - user_slice: list = [] - zombies: list = [] - gone: list = [] - - # parent-bucket lookup populated post-classification so - # `_row()` can annotate cross-bucket parent refs. - pid_to_bucket: dict = {} - - for r in roots: - for (p, depth) in _walk_tree_with_depth(r): - if p.pid in seen: - continue - seen.add(p.pid) - try: - status: str = p.status() - ppid: int = p.ppid() - except psutil.NoSuchProcess: - gone.append(p.pid) - continue - entry = (p, depth) - # severity order: - # zombie > orphan > system-slice > user-slice > live - # `ppid==1` splits into: - # - `system-slice` (rooted in `/system.slice/` — - # real services, by-design `ppid==1`) - # - `user-slice` (rooted in - # `/user.slice/.../*.scope` — desktop apps - # wrapped by systemd-user, by-design `ppid==1`) - # - `orphans` (everything else with `ppid==1` — - # genuinely concerning). - if status in defunct_statuses: - zombies.append(entry) - pid_to_bucket[p.pid] = 'zombies' - elif ppid == 1: - slice_kind: str|None = _which_cgroup_slice(p.pid) - if slice_kind == 'system': - system_slice.append(entry) - pid_to_bucket[p.pid] = 'system-slice' - elif slice_kind == 'user': - user_slice.append(entry) - pid_to_bucket[p.pid] = 'user-slice' - else: - orphans.append(entry) - pid_to_bucket[p.pid] = 'orphans' - else: - live.append(entry) - pid_to_bucket[p.pid] = 'live' - walk_order.append(entry) - - total: int = ( - len(live) - + len(orphans) - + len(system_slice) - + len(user_slice) - + len(zombies) - ) - print(f'# ptree: {total} procs across roots {roots}') - - hdr = ' ' + 'PID'.rjust(7) + ' ' + 'PPID'.rjust(7) + ' ' - hdr += 'STATUS'.ljust(10) + ' CMD' - - def _row(entry, bucket: str|None = None): - ''' - Render `(proc, depth)` as an aligned row. Tree depth is - rendered as a `└─` marker on the CMD column so PID/PPID/ - STATUS stay column-aligned. - - When `bucket` is given AND the row's parent lives in a - *different* bucket, append a `[parent: (in ``)]` - suffix so the `└─` marker can be resolved across the - severity-section split. - ''' - p, depth = entry - tree_pfx = (' ' * depth) + ('└─ ' if depth > 0 else '') - - # cross-bucket parent annotation; safe to compute up - # front because `p.ppid()` is cheap and rarely - # raises (parent pid is read from `/proc//stat`, - # cached by psutil). - parent_anno: str = '' - if ( - bucket is not None - and depth > 0 - ): - try: - parent_pid: int = p.ppid() - except psutil.NoSuchProcess: - parent_pid = 0 - if parent_pid and parent_pid != 1: - parent_bucket: str|None = pid_to_bucket.get(parent_pid) - if ( - parent_bucket is not None - and parent_bucket != bucket - ): - parent_anno = ( - f' [parent: {parent_pid} ' - f'(in `{parent_bucket}`)]' - ) - - # NOTE: `psutil.ZombieProcess` is a *subclass* of - # `psutil.NoSuchProcess`, but the proc is NOT gone — - # it's a zombie whose `/proc//cmdline` is empty/ - # unreadable. Catch it FIRST so we still render a - # row (using fields that DO work on zombies: pid, - # ppid, status, name). - try: - cmd = ' '.join(p.cmdline())[:140] or '[' + p.name() + ']' - r = ' ' + str(p.pid).rjust(7) - r += ' ' + str(p.ppid()).rjust(7) - r += ' ' + p.status().ljust(10) - r += ' ' + tree_pfx + cmd + parent_anno - return r - except psutil.ZombieProcess: - try: - ppid_str = str(p.ppid()) - name = p.name() - except psutil.NoSuchProcess: - ppid_str, name = '?', '?' - r = ' ' + str(p.pid).rjust(7) - r += ' ' + ppid_str.rjust(7) - r += ' ' + 'zombie'.ljust(10) - r += ' ' + tree_pfx + '[' + name + ' ]' + parent_anno - return r - except psutil.NoSuchProcess: - return ' ' + str(p.pid).rjust(7) + ' (gone mid-walk)' - - def _section( - title: str, - procs: list, - hint: str = '', - bucket: str|None = None, - ): - print(f'\n## {title} ({len(procs)})' + (f' — {hint}' if hint else '')) - if not procs: - print(' (none)') - return - print(hdr) - for p in procs: - print(_row(p, bucket=bucket)) - - # `--tree` opt-in: emit a flat walk-order section first - # so the parent-child tree shape is contiguous (no - # severity-grouping). No `bucket` arg → no cross-bucket - # annotation, since each parent appears directly above - # its children here. - if flag_tree: - _section( - 'tree', walk_order, - 'flat walk-order, parent-child preserved', - ) - - # severity-ordered: most concerning first. Each section - # passes its own `bucket` name so `_row()` can annotate - # rows whose parents live in a different section. - _section( - 'zombies', zombies, - 'status `Z`/`X`, parent has not reaped', - bucket='zombies', - ) - _section( - 'orphans', orphans, - '`ppid==1`, NOT in a `system.slice`/`user.slice` cgroup ' - '(likely leaked / parent gone)', - bucket='orphans', - ) - _section( - 'system-slice', system_slice, - '`ppid==1`, rooted under `/system.slice/` ' - '(real systemd-managed service — daemon, login ' - 'session manager, etc; not a leak)', - bucket='system-slice', - ) - _section( - 'user-slice', user_slice, - '`ppid==1`, rooted under `/user.slice/.../*.scope` ' - '(desktop-launched app wrapped by systemd-user — ' - 'browser, editor, etc; not a leak)', - bucket='user-slice', - ) - _section('live', live, bucket='live') - - if gone: - print(f'\n## gone-during-walk ({len(gone)}): {gone}') + print(dump_proc_tree(roots, flag_tree=flag_tree), end='') -# --- hung-dump ------------------------------------------------ +# --- hung-dump ----------------------------------------------- def _hung_dump(args): ''' @@ -657,248 +270,116 @@ def _hung_dump(args): usage: acli.hung_dump [...] note: `/proc//stack` and `py-spy dump` typically - require CAP_SYS_PTRACE — invoked via `sudo -n`. run - `sudo true` first to cache creds. + require CAP_SYS_PTRACE — invoked via `sudo -n`. If sudo + isn't cached this alias prompts (via `sudo -v`); for the + non-interactive equivalent see + `tractor._testing.trace.dump_hung_state(allow_sudo_prompt=False)`. + ''' if not args: print('usage: acli.hung_dump [...]') return 1 - # cache sudo creds upfront so per-pid `sudo -n` calls - # for `cat /proc//stack` and `py-spy dump` don't - # each prompt (or silently fail). - have_sudo: bool = _ensure_sudo_cached() - roots: list = [] for a in args: - roots.extend(_resolve_pids(a)) + roots.extend(resolve_pids(a)) roots = sorted(set(roots)) if not roots: print(f'(no procs match: {args})') return 1 - pids: list = [] - seen: set = set() - for r in roots: - if psutil is not None: - walk = [p.pid for p in _walk_tree_psutil(r)] - else: - walk = _walk_tree_pgrep(r) - for pid in walk: - if pid not in seen: - seen.add(pid) - pids.append(pid) - - print(f'# tree: {pids}') - print('\n## ps forest') - $[ps -o pid,ppid,pgid,stat,cmd -p @(','.join(map(str, pids)))] - - for pid in pids: - print(f'\n## pid {pid}') - - for f in ('wchan', 'stack'): - path = Path(f'/proc/{pid}/{f}') - try: - txt = path.read_text().rstrip() - print(f'-- /proc/{pid}/{f} --\n{txt}') - except PermissionError: - if not have_sudo: - print( - f'-- /proc/{pid}/{f}: ' - 'PermissionError (no sudo) --' - ) - continue - try: - txt = sp.check_output( - ['sudo', '-n', 'cat', str(path)], - text=True, - stderr=sp.DEVNULL, - ).rstrip() - print(f'-- /proc/{pid}/{f} (sudo) --\n{txt}') - except sp.CalledProcessError: - print( - f'-- /proc/{pid}/{f}: ' - 'sudo cred expired? rerun --' - ) - except FileNotFoundError: - print(f'-- /proc/{pid}/{f}: proc gone --') - - print(f'-- py-spy {pid} --') - if not have_sudo: - print(' (skipped — no sudo)') - continue - try: - $[sudo -n py-spy dump --pid @(pid) --locals] - except Exception as e: - print(f' (py-spy failed: {e})') + print( + dump_hung_state(roots, allow_sudo_prompt=True), + end='', + ) -# --- bindspace-scan ------------------------------------------- +# --- bindspace-scan ------------------------------------------ def _bindspace_scan(args): ''' - Scan a tractor UDS bindspace dir for orphan sock files - (those whose embedded `` no longer corresponds to - a live process). + Scan a tractor UDS bindspace dir for orphan sock files. usage: acli.bindspace_scan [|] - - no arg -> `$XDG_RUNTIME_DIR/tractor` - (or `/run/user//tractor`) - - bare `` -> `$XDG_RUNTIME_DIR/`, - for projects like `piker` that bind - their own sibling sub-dir alongside - tractor's default - - path (abs or - containing `/`) -> use as-is + See `tractor._testing.trace.scan_bindspace()` for full arg + semantics + output-bucket details. ''' - runtime: str = os.environ.get( - 'XDG_RUNTIME_DIR', - f'/run/user/{os.getuid()}', - ) - if args: - arg: str = args[0] - if ( - arg.startswith('/') - or - '/' in arg - ): - bs_dir = Path(arg) - else: - # bare name -> `$XDG_RUNTIME_DIR/` so - # callers can say `acli.bindspace_scan piker` - bs_dir = Path(runtime) / arg - else: - bs_dir = Path(runtime) / 'tractor' + arg: str | None = args[0] if args else None + print(scan_bindspace(arg), end='') - if not bs_dir.exists(): - print(f'(no bindspace at {bs_dir})') + +# --- dump-all (snapshot bundle) ------------------------------ + +def _dump_all_alias(args): + ''' + Capture a full diag snapshot bundle for a hung proc-tree + into a timestamped directory for offline / AI inspection. + + usage: acli.dump_all + [--label