"""
`xontrib_tractor_diag`: pytest/tractor diagnostic aliases.

All aliases live under the `acli.` namespace so xonsh's
prefix-completion treats them as a sub-cmd group — type
`acli.` to see the full set.

Provides:

  - `acli.ptree [--tree|-t] <pid|pgrep-pattern> [...]`
      psutil-backed proc tree, split into severity buckets
      (zombies / orphans / system-slice / user-slice / live).

  - `acli.hung_dump <pid|pgrep-pattern> [...]`
      kernel `wchan`/`stack` + `py-spy dump` (incl
      `--locals`) for each pid in tree.

  - `acli.bindspace_scan [<name>|<path>]`
      find orphaned tractor UDS sock files (no live owner
      pid). bare name -> `$XDG_RUNTIME_DIR/<name>` (e.g.
      `piker`, `tractor`); path -> use as-is. default:
      `$XDG_RUNTIME_DIR/tractor`.

  - `acli.reap [opts]`
      SC-polite zombie-subactor reaper + optional
      `/dev/shm/` + UDS sock-file sweeps. alias for
      `scripts/tractor-reap`.

Loading from repo root:

    xontrib load -p ./xontrib tractor_diag

Or source directly:

    source ./xontrib/tractor_diag.xsh

Pipe-to-paste idiom (xonsh):

    acli.hung_dump pytest |t /tmp/hung.log

Requires `psutil` for full functionality (`acli.ptree` and
the `acli.hung_dump` tree-walk). Falls back to `pgrep -P`
recursion if missing.

"""
import os
import re
import shlex
import subprocess as sp
from pathlib import Path

try:
    import psutil
except ImportError:
    psutil = None
    print(
        '[tractor-diag] `psutil` missing — '
        'acli.ptree disabled, acli.hung_dump uses pgrep fallback. '
        '`uv pip install psutil` for full functionality.'
    )

# matches tractor's UDS sock naming: `<name>@<pid>.sock`
#
# NOTE: the group names are load-bearing — `_bindspace_scan()`
# reads `m['name']` and `m['pid']` off each match.
_UDS_SOCK_RE = re.compile(
    r'^(?P<name>.+)@(?P<pid>\d+)\.sock$'
)


# --- helpers --------------------------------------------------

def _resolve_pids(arg: str) -> list[int]:
    '''
    Resolve a numeric pid OR a `pgrep -f` pattern to a list of
    pids; an unmatched pattern yields `[]`.

    '''
    if arg.isdigit():
        return [int(arg)]

    try:
        out = sp.check_output(
            # `--` so a pattern starting with `-` is never
            # parsed as a `pgrep` option.
            ['pgrep', '-f', '--', arg],
            text=True,
        )
    except sp.CalledProcessError:
        # `pgrep` exits non-zero when nothing matched.
        return []

    return [int(p) for p in out.split()]


def _walk_tree_psutil(pid: int) -> list:
    '''
    Flat list `[Process, *descendants]` via psutil; `[]` when
    the root vanished before (or while) being walked.

    '''
    try:
        p = psutil.Process(pid)
        # inside the `try`: the proc may exit between lookup
        # and the descendant scan.
        return [p] + p.children(recursive=True)
    except psutil.NoSuchProcess:
        return []


def _walk_tree_with_depth(pid: int):
    '''
    Yield `(proc, depth)` pairs walking `pid`'s tree in
    pre-order.

    `depth==0` is the root; `depth==1` are direct children,
    etc. Every proc is yielded immediately before its own
    sub-tree so a consumer rendering rows in yield-order (see
    `_ptree()`) gets each parent directly above its children.

    Yields nothing when the root pid does not exist.

    '''
    try:
        root = psutil.Process(pid)
    except psutil.NoSuchProcess:
        return

    stack: list = [(root, 0)]
    seen: set = set()
    while stack:
        proc, depth = stack.pop()
        if proc.pid in seen:
            continue
        seen.add(proc.pid)
        yield proc, depth

        try:
            kids = proc.children()
        except psutil.NoSuchProcess:
            continue

        # push reversed so the first child is popped (and thus
        # yielded) first, keeping sibling order stable.
        for k in reversed(kids):
            if k.pid not in seen:
                stack.append((k, depth + 1))


def _which_cgroup_slice(pid: int) -> str|None:
    '''
    Return which top-level systemd cgroup slice `pid` is
    rooted in, or `None` if it's not in either:

    - `'system'`: under `/system.slice/...` — typically
      `.service` units (long-lived daemons explicitly enabled
      via `systemctl enable`, e.g. `auto-cpufreq.service`,
      `dbus.service`, `systemd-journald.service`).

    - `'user'`: under `/user.slice/user-<uid>.slice/...` —
      typically `.scope` units that systemd auto-wraps around
      desktop-launched apps + login-session procs (e.g.
      `app-firefox-<id>.scope`, `session-<n>.scope`).

    - `None`: NOT in either slice — pid 1 is NOT managing
      this proc via cgroup. Combined with `ppid==1`, this is
      the genuine "leaked / parent died" orphan signal.

    Both slice categories are by-design `ppid==1` (pid 1 is
    actively managing them) and should NOT be flagged as
    concerning orphans, but distinguishing them is useful:
    `system.slice` is "real services on this box",
    `user.slice` is "stuff in your login session".

    Returns `None` on any read error (proc gone, perm denied,
    non-Linux, etc.) — callers should treat that as "unknown,
    classify as plain orphan".

    '''
    try:
        with open(f'/proc/{pid}/cgroup') as f:
            cg: str = f.read()
    except OSError:
        # covers `FileNotFoundError`, `PermissionError` and
        # `ProcessLookupError` (all `OSError` subclasses).
        return None

    # `system` is checked first, so it wins if both substrings
    # are present.
    if '/system.slice/' in cg:
        return 'system'
    if '/user.slice/' in cg:
        return 'user'
    return None


def _walk_tree_pgrep(pid: int) -> list[int]:
    '''
    psutil-less fallback — recursive `pgrep -P`.

    Returns `[pid, *descendant_pids]`; note the root is
    included unconditionally (no liveness check).

    '''
    out: list[int] = [pid]
    try:
        kids = sp.check_output(
            ['pgrep', '-P', str(pid)],
            text=True,
        ).split()
    except sp.CalledProcessError:
        # non-zero exit -> no children.
        return out

    for k in kids:
        out.extend(_walk_tree_pgrep(int(k)))
    return out


def _ensure_sudo_cached() -> bool:
    '''
    Ensure `sudo` credentials are cached so subsequent
    `sudo -n` calls succeed without prompting.

    Returns True if cached (or successfully refreshed), False
    if user cancelled or sudo is unavailable.

    Tries `sudo -n true` first as a no-op probe; if that
    fails, runs `sudo -v` which prompts interactively to
    validate/refresh the credential timestamp.

    '''
    try:
        # probe — already cached?
        cached: bool = sp.run(
            ['sudo', '-n', 'true'],
            capture_output=True,
        ).returncode == 0
        if cached:
            return True

        print(
            '[tractor-diag] needs `sudo` for /proc/<pid>/stack '
            'and `py-spy dump`; caching creds via `sudo -v`...'
        )
        rc = sp.run(['sudo', '-v']).returncode

    except KeyboardInterrupt:
        print(' cancelled — proceeding without sudo')
        return False

    except FileNotFoundError:
        # raised by either the probe or `sudo -v` when there
        # is no `sudo` binary at all.
        print(' sudo not on PATH — proceeding without sudo')
        return False

    return rc == 0


# --- ptree ---------------------------------------------------

def _ptree(args):
    '''
    psutil-backed proc tree; per-proc classification into
    severity-ordered buckets so leaked / defunct procs don't
    hide in the noise of normal `live` rows.

    usage: acli.ptree [--tree|-t] <pid|pgrep-pattern> [...]

    classification (per-proc, not per-tree):

    - zombies: `status in (Z, X)` — defunct, parent hasn't
      reaped (or kernel-marked dead).
    - orphans: `ppid == 1` and NOT in a systemd
      `system.slice`/`user.slice` cgroup — original parent
      exited; has been reparented to init. Includes the
      *root* of an abandoned tree AND any descendant that
      ended up reparented to init mid-flight.
    - system-slice / user-slice: `ppid == 1` but rooted in
      the respective systemd slice (see
      `_which_cgroup_slice()`); by-design, not a leak.
    - live: real parent (`ppid > 1`), non-defunct.

    Trees of orphan roots are still walked — their
    descendants show as `live` if they themselves still have
    a real (non-init) parent (the orphan root), but the
    orphan root itself appears in `orphans`.

    Cross-bucket parent annotation (always emitted): when a
    row's parent (by ppid) lives in a *different* severity
    bucket, the row is suffixed with
    `[parent: <ppid> (in `<bucket>`)]` so the visual `└─`
    marker still resolves to a findable parent even when
    bucketing scatters parent and child into separate
    sections.

    `--tree` / `-t` flag (opt-in): additionally emit a flat
    walk-order `## tree` section at the top — a contiguous
    parent-child tree shape with no severity-grouping. Same
    procs, no annotations needed because each parent appears
    directly above its children.

    Returns 1 on usage error / missing psutil / no matching
    procs, otherwise `None`.

    '''
    flag_tree: bool = False
    pos_args: list = []
    for a in args:
        if a in ('--tree', '-t'):
            flag_tree = True
        else:
            pos_args.append(a)

    if not pos_args:
        print('usage: acli.ptree [--tree|-t] <pid|pgrep-pattern> [...]')
        return 1

    if psutil is None:
        print('ptree requires psutil; install via `uv pip install psutil`')
        return 1

    roots: list = sorted({
        pid
        for a in pos_args
        for pid in _resolve_pids(a)
    })
    if not roots:
        print(f'(no procs match: {pos_args})')
        return 1

    # statuses considered "defunct" — STATUS_ZOMBIE is the
    # common case (`Z`); STATUS_DEAD (`X`) is rarer but kernel-
    # reported and equally not-coming-back.
    defunct_statuses: set = {
        psutil.STATUS_ZOMBIE,
        getattr(psutil, 'STATUS_DEAD', 'dead'),
    }

    seen: set = set()
    walk_order: list = []  # [(proc, depth)] preserved walk order
    live: list = []  # [(proc, depth)]
    orphans: list = []
    # `ppid==1` AND rooted in `/system.slice/` cgroup —
    # real systemd-managed services (e.g. `auto-cpufreq`,
    # `NetworkManager`).
    system_slice: list = []
    # `ppid==1` AND rooted in `/user.slice/.../*.scope` —
    # desktop-launched apps wrapped by systemd-user in
    # transient `.scope` units (e.g. Firefox, browsers,
    # editors started from a launcher).
    user_slice: list = []
    zombies: list = []
    gone: list = []
    # pid -> bucket-name lookup, filled in as each proc is
    # classified below; `_row()` only reads it once the whole
    # walk is done, to annotate cross-bucket parent refs.
    pid_to_bucket: dict = {}

    for r in roots:
        for (p, depth) in _walk_tree_with_depth(r):
            # roots may overlap (one root inside another's
            # tree); first sighting wins.
            if p.pid in seen:
                continue
            seen.add(p.pid)

            try:
                status: str = p.status()
                ppid: int = p.ppid()
            except psutil.NoSuchProcess:
                gone.append(p.pid)
                continue

            entry = (p, depth)
            # severity order:
            #   zombie > orphan > system-slice > user-slice > live
            # `ppid==1` splits into:
            # - `system-slice` (rooted in `/system.slice/` —
            #   real services, by-design `ppid==1`)
            # - `user-slice` (rooted in
            #   `/user.slice/.../*.scope` — desktop apps
            #   wrapped by systemd-user, by-design `ppid==1`)
            # - `orphans` (everything else with `ppid==1` —
            #   genuinely concerning).
            if status in defunct_statuses:
                zombies.append(entry)
                pid_to_bucket[p.pid] = 'zombies'

            elif ppid == 1:
                slice_kind: str|None = _which_cgroup_slice(p.pid)
                if slice_kind == 'system':
                    system_slice.append(entry)
                    pid_to_bucket[p.pid] = 'system-slice'
                elif slice_kind == 'user':
                    user_slice.append(entry)
                    pid_to_bucket[p.pid] = 'user-slice'
                else:
                    orphans.append(entry)
                    pid_to_bucket[p.pid] = 'orphans'

            else:
                live.append(entry)
                pid_to_bucket[p.pid] = 'live'

            walk_order.append(entry)

    # NOTE: `gone` pids are intentionally not counted.
    total: int = (
        len(live)
        + len(orphans)
        + len(system_slice)
        + len(user_slice)
        + len(zombies)
    )
    print(f'# ptree: {total} procs across roots {roots}')

    hdr = ' ' + 'PID'.rjust(7) + ' ' + 'PPID'.rjust(7) + ' '
    hdr += 'STATUS'.ljust(10) + ' CMD'

    def _row(entry, bucket: str|None = None):
        '''
        Render `(proc, depth)` as an aligned row. Tree depth
        is rendered as a `└─` marker on the CMD column so
        PID/PPID/STATUS stay column-aligned.

        When `bucket` is given AND the row's parent lives in
        a *different* bucket, append a
        `[parent: <ppid> (in `<bucket>`)]` suffix so the `└─`
        marker can be resolved across the severity-section
        split.

        '''
        p, depth = entry
        tree_pfx = (' ' * depth) + ('└─ ' if depth > 0 else '')

        # cross-bucket parent annotation, computed up front so
        # every render path below can append it.
        parent_anno: str = ''
        if (
            bucket is not None
            and depth > 0
        ):
            try:
                parent_pid: int = p.ppid()
            except psutil.NoSuchProcess:
                parent_pid = 0

            # init (pid 1) is never part of the walked tree so
            # there is nothing to point at.
            if parent_pid and parent_pid != 1:
                parent_bucket: str|None = pid_to_bucket.get(parent_pid)
                if (
                    parent_bucket is not None
                    and parent_bucket != bucket
                ):
                    parent_anno = (
                        f' [parent: {parent_pid} '
                        f'(in `{parent_bucket}`)]'
                    )

        # NOTE: `psutil.ZombieProcess` is a *subclass* of
        # `psutil.NoSuchProcess`, but the proc is NOT gone —
        # it's a zombie whose `/proc/<pid>/cmdline` is empty/
        # unreadable. Catch it FIRST so we still render a
        # row (using fields that DO work on zombies: pid,
        # ppid, status, name).
        try:
            cmd = ' '.join(p.cmdline())[:140] or '[' + p.name() + ']'
            r = ' ' + str(p.pid).rjust(7)
            r += ' ' + str(p.ppid()).rjust(7)
            r += ' ' + p.status().ljust(10)
            r += ' ' + tree_pfx + cmd + parent_anno
            return r

        except psutil.ZombieProcess:
            try:
                ppid_str = str(p.ppid())
                name = p.name()
            except psutil.NoSuchProcess:
                ppid_str, name = '?', '?'
            r = ' ' + str(p.pid).rjust(7)
            r += ' ' + ppid_str.rjust(7)
            r += ' ' + 'zombie'.ljust(10)
            r += ' ' + tree_pfx + '[' + name + ' ]' + parent_anno
            return r

        except psutil.NoSuchProcess:
            return ' ' + str(p.pid).rjust(7) + ' (gone mid-walk)'

    def _section(
        title: str,
        procs: list,
        hint: str = '',
        bucket: str|None = None,
    ):
        '''
        Print one `## <title> (<count>)` section: header line,
        then either `(none)` or the column header plus one
        `_row()` per entry.

        '''
        print(f'\n## {title} ({len(procs)})' + (f' — {hint}' if hint else ''))
        if not procs:
            print(' (none)')
            return
        print(hdr)
        for p in procs:
            print(_row(p, bucket=bucket))

    # `--tree` opt-in: emit a flat walk-order section first
    # so the parent-child tree shape is contiguous (no
    # severity-grouping). No `bucket` arg → no cross-bucket
    # annotation, since each parent appears directly above
    # its children here.
    if flag_tree:
        _section(
            'tree',
            walk_order,
            'flat walk-order, parent-child preserved',
        )

    # severity-ordered: most concerning first. Each section
    # passes its own `bucket` name so `_row()` can annotate
    # rows whose parents live in a different section.
    _section(
        'zombies',
        zombies,
        'status `Z`/`X`, parent has not reaped',
        bucket='zombies',
    )
    _section(
        'orphans',
        orphans,
        '`ppid==1`, NOT in a `system.slice`/`user.slice` cgroup '
        '(likely leaked / parent gone)',
        bucket='orphans',
    )
    _section(
        'system-slice',
        system_slice,
        '`ppid==1`, rooted under `/system.slice/` '
        '(real systemd-managed service — daemon, login '
        'session manager, etc; not a leak)',
        bucket='system-slice',
    )
    _section(
        'user-slice',
        user_slice,
        '`ppid==1`, rooted under `/user.slice/.../*.scope` '
        '(desktop-launched app wrapped by systemd-user — '
        'browser, editor, etc; not a leak)',
        bucket='user-slice',
    )
    _section('live', live, bucket='live')

    if gone:
        print(f'\n## gone-during-walk ({len(gone)}): {gone}')


# --- hung-dump ------------------------------------------------

def _hung_dump(args):
    '''
    kernel + python state for a hung pytest/tractor tree.
    walks all descendants of each `<pid|pgrep-pattern>` arg.

    usage: acli.hung_dump <pid|pgrep-pattern> [...]

    note: `/proc/<pid>/stack` and `py-spy dump` typically
    require CAP_SYS_PTRACE — invoked via `sudo -n`. creds are
    cached up front via `_ensure_sudo_cached()` (which may
    prompt once); without them the sudo-only parts are
    skipped.

    Returns 1 on usage error / no matching procs, otherwise
    `None`.

    '''
    if not args:
        print('usage: acli.hung_dump <pid|pgrep-pattern> [...]')
        return 1

    # cache sudo creds upfront so per-pid `sudo -n` calls
    # for `cat /proc/<pid>/stack` and `py-spy dump` don't
    # each prompt (or silently fail).
    have_sudo: bool = _ensure_sudo_cached()

    roots: list = sorted({
        pid
        for a in args
        for pid in _resolve_pids(a)
    })
    if not roots:
        print(f'(no procs match: {args})')
        return 1

    # flatten every root's tree into one de-duplicated,
    # order-preserving pid list.
    pids: list = []
    seen: set = set()
    for r in roots:
        if psutil is not None:
            walk = [p.pid for p in _walk_tree_psutil(r)]
        else:
            walk = _walk_tree_pgrep(r)

        for pid in walk:
            if pid not in seen:
                seen.add(pid)
                pids.append(pid)

    print(f'# tree: {pids}')
    print('\n## ps forest')
    $[ps -o pid,ppid,pgid,stat,cmd -p @(','.join(map(str, pids)))]

    for pid in pids:
        print(f'\n## pid {pid}')
        for f in ('wchan', 'stack'):
            path = Path(f'/proc/{pid}/{f}')
            try:
                txt = path.read_text().rstrip()
                print(f'-- /proc/{pid}/{f} --\n{txt}')

            except PermissionError:
                if not have_sudo:
                    print(
                        f'-- /proc/{pid}/{f}: '
                        'PermissionError (no sudo) --'
                    )
                    continue

                # retry the same read as root.
                try:
                    txt = sp.check_output(
                        ['sudo', '-n', 'cat', str(path)],
                        text=True,
                        stderr=sp.DEVNULL,
                    ).rstrip()
                    print(f'-- /proc/{pid}/{f} (sudo) --\n{txt}')
                except sp.CalledProcessError:
                    print(
                        f'-- /proc/{pid}/{f}: '
                        'sudo cred expired? rerun --'
                    )

            except (
                FileNotFoundError,
                # the read itself can fail with ESRCH when the
                # proc exits after the file was opened.
                ProcessLookupError,
            ):
                print(f'-- /proc/{pid}/{f}: proc gone --')

        print(f'-- py-spy {pid} --')
        if not have_sudo:
            print(' (skipped — no sudo)')
            continue

        try:
            $[sudo -n py-spy dump --pid @(pid) --locals]
        except Exception as e:
            # best-effort: one failed dump must not abort the
            # remaining pids.
            print(f' (py-spy failed: {e})')


# --- bindspace-scan -------------------------------------------

def _bindspace_scan(args):
    '''
    Scan a tractor UDS bindspace dir for orphan sock files
    (those whose embedded `<pid>` no longer corresponds to a
    live process).

    usage: acli.bindspace_scan [<name>|<path>]

    - no arg -> `$XDG_RUNTIME_DIR/tractor` (or
      `/run/user/<uid>/tractor`)
    - bare `<name>` -> `$XDG_RUNTIME_DIR/<name>`, for projects
      like `piker` that bind their own sibling sub-dir
      alongside tractor's default
    - path (abs or containing `/`) -> use as-is

    Only reports; nothing is unlinked. Returns 1 when the
    bindspace dir does not exist, otherwise `None`.

    '''
    runtime: str = os.environ.get(
        'XDG_RUNTIME_DIR',
        f'/run/user/{os.getuid()}',
    )
    if args:
        arg: str = args[0]
        # any `/` (leading or embedded) marks a path.
        if '/' in arg:
            bs_dir = Path(arg)
        else:
            # bare name -> `$XDG_RUNTIME_DIR/<name>` so
            # callers can say `acli.bindspace_scan piker`
            bs_dir = Path(runtime) / arg
    else:
        bs_dir = Path(runtime) / 'tractor'

    if not bs_dir.exists():
        print(f'(no bindspace at {bs_dir})')
        return 1

    socks = sorted(bs_dir.glob('*.sock'))
    print(f'## bindspace {bs_dir} ({len(socks)} sock file(s))')

    live: list = []  # [(path, pid, name)]
    orphans: list = []  # [(path, pid, name)]
    bogus: list = []  # paths not matching `<name>@<pid>.sock`
    for s in socks:
        m = _UDS_SOCK_RE.match(s.name)
        if not m:
            bogus.append(s)
            continue

        pid = int(m['pid'])
        name = m['name']
        try:
            # signal 0: existence/permission probe only,
            # nothing is delivered.
            os.kill(pid, 0)
            live.append((s, pid, name))
        except ProcessLookupError:
            orphans.append((s, pid, name))
        except PermissionError:
            # exists but owned by another user
            live.append((s, pid, name))

    print(f'\n## live ({len(live)})')
    if not live:
        print(' (none)')
    for s, pid, name in live:
        row = ' ' + str(pid).rjust(7)
        row += ' ' + name.ljust(32)
        row += ' ' + s.name
        print(row)

    print(f'\n## orphaned ({len(orphans)})')
    if not orphans:
        print(' (none)')
    for s, pid, name in orphans:
        row = ' ' + str(pid).rjust(7)
        row += ' ' + name.ljust(32)
        row += ' ' + s.name + ' (no live proc)'
        print(row)

    if bogus:
        print(
            f'\n## non-tractor ({len(bogus)}) '
            f'— filename lacks `@<pid>` suffix, '
            f'cannot determine liveness intrinsically'
        )
        for s in bogus:
            print(f' {s.name}')

        # show a copy-pastable `ss` cmd per sock so the
        # caller can resolve listener-PID externally
        # (e.g. for piker's `chart.sock` / `pikerd.sock`
        # style flat names). `ss -lpx 'src = <path>'`
        # prints `users:(("<cmd>",pid=<pid>,fd=<fd>))` for
        # the listening side; empty output -> nobody's
        # listening -> safe to unlink.
        print(
            '\nto check liveness manually '
            '(needs `iproute2`/`ss`):'
        )
        for s in bogus:
            print(f" ss -lpx 'src = {s}'")

    if orphans:
        # shell-quote each path so the printed cmd stays
        # copy-pastable even for names with spaces / shell
        # metachars (plain names are emitted unchanged).
        unlink_cmd = ' '.join(
            shlex.quote(str(o[0]))
            for o in orphans
        )
        print(f'\nto unlink orphans:\n rm {unlink_cmd}')


# --- acli.reap ------------------------------------------------

def _tractor_reap(args):
    '''
    SC-polite zombie-subactor reaper + optional `/dev/shm/`
    orphan-segment sweep + optional UDS sock-file sweep.

    usage: acli.reap [-h] [--parent PID] [--grace SEC]
                     [--dry-run]
                     [--shm | --shm-only]
                     [--uds | --uds-only]

    phases (run in order when enabled):

    1. process reap — finds tractor subactor procs left
       alive after a `pytest`/app run that failed to fully
       cancel its tree. Default = orphan-mode (PPid==1
       init-reparented procs whose cwd matches repo root AND
       cmdline contains `python`). With `--parent`, scopes
       to descendants of a specific live PID. SIGINT first,
       then SIGKILL after `--grace` (default 3.0s).

    2. shm sweep (`--shm`/`--shm-only`) — unlinks
       `/dev/shm/` entries owned by the current uid that no
       live process has open. Needed because `tractor`
       disables `mp.resource_tracker`.

    3. UDS sweep (`--uds`/`--uds-only`) — unlinks
       `${XDG_RUNTIME_DIR}/tractor/<name>@<pid>.sock` files
       whose binder pid is dead (or the `1616` registry
       sentinel). See issue #452.

    Mirrors `scripts/tractor-reap` (use `-n`/`--dry-run`
    first to see what would be touched).

    '''
    # NOTE: the docstring above doubles as the `-h` help text
    # (passed as `description=` below), so edits to it are
    # user-visible.
    import argparse

    parser = argparse.ArgumentParser(
        prog='acli.reap',
        description=_tractor_reap.__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        '--parent', '-p',
        type=int,
        default=None,
        help='descendant-mode: reap procs with PPid==<PID>',
    )
    parser.add_argument(
        '--grace', '-g',
        type=float,
        default=3.0,
        help='SIGINT grace window in seconds (default 3.0)',
    )
    parser.add_argument(
        '--dry-run', '-n',
        action='store_true',
        help='list matched pids/paths but do not signal/unlink',
    )
    parser.add_argument(
        '--shm',
        action='store_true',
        help='also unlink orphaned /dev/shm segments',
    )
    parser.add_argument(
        '--shm-only',
        action='store_true',
        help='skip process reap; only do the shm sweep',
    )
    parser.add_argument(
        '--uds',
        action='store_true',
        help='also unlink orphaned UDS sock-files',
    )
    parser.add_argument(
        '--uds-only',
        action='store_true',
        help='skip process reap + shm; only do the UDS sweep',
    )
    try:
        ns = parser.parse_args(args)
    except SystemExit as se:
        # `argparse` raises SystemExit on `-h`/bad-args; let
        # xonsh treat it as a normal alias return code.
        return int(se.code) if se.code is not None else 0

    skip_proc_reap: bool = (
        ns.shm_only
        or ns.uds_only
    )

    # `tractor` is assumed to be importable in the xonsh env
    # this xontrib was sourced into (a venv with the package
    # installed). The standalone `scripts/tractor-reap` does
    # `git rev-parse --show-toplevel` + `sys.path.insert` for
    # cold-shell usability — that overhead is unnecessary
    # here since we're already inside the project's venv.
    from tractor._testing._reap import (
        find_descendants,
        find_orphans,
        find_orphaned_shm,
        find_orphaned_uds,
        reap,
        reap_shm,
        reap_uds,
    )

    # sticky failure flag: any phase leaving survivors/errors
    # flips the alias' return code to 1.
    rc: int = 0

    # phase 1: process reap (skipped under `--*-only`)
    if not skip_proc_reap:
        if ns.parent is not None:
            pids: list = find_descendants(ns.parent)
            mode: str = f'descendants of PPid={ns.parent}'
        else:
            pids = find_orphans()
            mode = (
                'orphans (PPid==1, intrinsic '
                'cmdline/comm match — `tractor[…]` or '
                '`tractor._child`)'
            )

        if not pids:
            print(f'[acli.reap] no {mode} to reap')
        elif ns.dry_run:
            print(
                f'[acli.reap] dry-run — {mode}:\n {pids}'
            )
        else:
            _, survivors = reap(pids, grace=ns.grace)
            if survivors:
                rc = 1

    # phase 2: shm sweep (opt-in)
    if ns.shm or ns.shm_only:
        leaked: list = find_orphaned_shm()
        if not leaked:
            print(
                '[acli.reap] no orphaned /dev/shm '
                'segments to sweep'
            )
        elif ns.dry_run:
            print(
                f'[acli.reap] dry-run — {len(leaked)} '
                f'orphaned shm segment(s):\n {leaked}'
            )
        else:
            _, errors = reap_shm(leaked)
            if errors:
                rc = 1

    # phase 3: UDS sweep (opt-in)
    if ns.uds or ns.uds_only:
        leaked_uds: list = find_orphaned_uds()
        if not leaked_uds:
            print(
                '[acli.reap] no orphaned UDS sock-files '
                'to sweep'
            )
        elif ns.dry_run:
            print(
                f'[acli.reap] dry-run — {len(leaked_uds)} '
                f'orphaned UDS sock-file(s):\n {leaked_uds}'
            )
        else:
            _, errors = reap_uds(leaked_uds)
            if errors:
                rc = 1

    return rc


# --- registration ---------------------------------------------

# all aliases under the `acli.` namespace so xonsh's prefix-
# completion makes them feel like a sub-cmd group: type
# `acli.` and the full set is suggested. no parent
# `acli` cmd exists — the dot is purely a naming convention.
_TCLI_ALIASES: dict = {
    'acli.ptree': _ptree,
    'acli.hung_dump': _hung_dump,
    'acli.bindspace_scan': _bindspace_scan,
    'acli.reap': _tractor_reap,
}
for _name, _fn in _TCLI_ALIASES.items():
    aliases[_name] = _fn


# xontrib protocol hooks (for `xontrib load tractor_diag`).
# also harmless when sourced directly.
def _load_xontrib_(xsh, **_):
    '''
    xontrib load hook.

    Nothing to do here: the `acli.*` aliases are installed by
    the module-level registration loop when this file is
    executed. Always returns an empty dict.

    '''
    nothing_exported: dict = {}
    return nothing_exported


def _unload_xontrib_(xsh, **_):
    '''
    xontrib unload hook: drop every alias listed in
    `_TCLI_ALIASES` from the xonsh `aliases` table, ignoring
    any that are already absent. Always returns an empty
    dict.

    '''
    for alias_name in _TCLI_ALIASES:
        # `pop()` with a default so a missing alias is a no-op.
        aliases.pop(alias_name, None)

    nothing_exported: dict = {}
    return nothing_exported