2026-05-06 18:07:24 +00:00
|
|
|
"""
|
|
|
|
|
`xontrib_tractor_diag`: pytest/tractor diagnostic aliases.
|
|
|
|
|
|
2026-05-07 22:07:34 +00:00
|
|
|
All aliases live under the `acli.` namespace so xonsh's
|
|
|
|
|
prefix-completion treats them as a sub-cmd group — type
|
|
|
|
|
`acli.<TAB>` to see the full set.
|
|
|
|
|
|
2026-05-06 18:07:24 +00:00
|
|
|
Provides:
|
2026-05-07 22:07:34 +00:00
|
|
|
- `acli.pytree <pid|pgrep-pat>` psutil-backed proc tree,
|
|
|
|
|
live + zombies split.
|
|
|
|
|
- `acli.hung_dump <pid|pat> [...]` kernel `wchan`/`stack` +
|
|
|
|
|
`py-spy dump` (incl `--locals`)
|
|
|
|
|
for each pid in tree.
|
|
|
|
|
- `acli.bindspace_scan [<dir>]` find orphaned tractor UDS
|
|
|
|
|
sock files (no live owner pid).
|
|
|
|
|
default: `$XDG_RUNTIME_DIR/tractor`.
|
|
|
|
|
- `acli.reap [opts]` SC-polite zombie-subactor
|
|
|
|
|
reaper + optional `/dev/shm/`
|
|
|
|
|
+ UDS sock-file sweeps.
|
|
|
|
|
alias for `scripts/tractor-reap`.
|
2026-05-06 18:07:24 +00:00
|
|
|
|
|
|
|
|
Loading from repo root:
|
|
|
|
|
xontrib load -p ./xontrib tractor_diag
|
|
|
|
|
|
|
|
|
|
Or source directly:
|
|
|
|
|
source ./xontrib/tractor_diag.xsh
|
|
|
|
|
|
|
|
|
|
Pipe-to-paste idiom (xonsh):
|
|
|
|
|
acli.hung_dump pytest |t /tmp/hung.log
|
|
|
|
|
|
|
|
|
|
Requires `psutil` for full functionality (`acli.pytree` and the
|
|
|
|
|
`acli.hung_dump` tree-walk). Falls back to `pgrep -P` recursion
|
|
|
|
|
if missing.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import os
|
|
|
|
|
import re
|
|
|
|
|
import subprocess as sp
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
import psutil
|
|
|
|
|
except ImportError:
|
|
|
|
|
psutil = None
|
|
|
|
|
print(
|
|
|
|
|
'[tractor-diag] `psutil` missing — '
|
2026-05-07 22:07:34 +00:00
|
|
|
'acli.pytree disabled, acli.hung_dump uses pgrep fallback. '
|
2026-05-06 18:07:24 +00:00
|
|
|
'`uv pip install psutil` for full functionality.'
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# tractor names its UDS sock files `<actor_name>@<pid>.sock`. the
# `name` group is greedy, so only the trailing `@<digits>.sock`
# is captured as `pid`.
_UDS_SOCK_RE = re.compile(r'^(?P<name>.+)@(?P<pid>\d+)\.sock$')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- helpers --------------------------------------------------
|
|
|
|
|
|
|
|
|
|
def _resolve_pids(arg: str) -> list:
    '''
    Resolve a numeric pid OR a `pgrep -f` pattern to a pid list.

    An all-decimal `arg` is returned as a single-element list
    without checking that such a process exists. Anything else
    is handed to `pgrep -f`; an empty list is returned when
    nothing matches, when `pgrep` rejects the pattern, or when
    `pgrep` is not installed.
    '''
    # `isdecimal()` (not `isdigit()`) so every accepted string is
    # also accepted by `int()` — e.g. a superscript digit passes
    # `isdigit()` but makes `int()` raise.
    if arg.isdecimal():
        return [int(arg)]
    try:
        out = sp.check_output(
            # `--` stops option parsing so a pattern starting
            # with `-` is still treated as the pattern.
            ['pgrep', '-f', '--', arg],
            text=True,
        )
    except (sp.CalledProcessError, FileNotFoundError):
        # non-zero exit (no match / bad pattern) or no `pgrep`
        # binary at all: nothing resolved either way.
        return []
    return [int(p) for p in out.split()]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _walk_tree_psutil(pid: int) -> list:
    '''
    Flat list `[Process, *descendants]` for `pid` via psutil.

    Returns an empty list when `pid` does not exist, including
    the case where it exits between the lookup and the
    descendant walk.
    '''
    try:
        p = psutil.Process(pid)
        # `children()` can itself raise `NoSuchProcess` (see the
        # handling in `_walk_tree_with_depth`), so it has to sit
        # inside the same `try` as the lookup.
        return [p] + p.children(recursive=True)
    except psutil.NoSuchProcess:
        return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _walk_tree_with_depth(pid: int):
    '''
    Yield `(proc, depth)` pairs walking `pid`'s tree in pre-order.

    `depth==0` is the root; `depth==1` are direct children, etc.
    Every proc is yielded before any of its descendants and its
    whole subtree is emitted before the next sibling, so a
    consumer rendering rows in yield order (as `pytree` does)
    gets each child listed under its own parent.

    Yields nothing when `pid` does not exist. A proc that
    vanishes mid-walk is still yielded, but its children are
    not expanded.
    '''
    try:
        root = psutil.Process(pid)
    except psutil.NoSuchProcess:
        return

    stack: list = [(root, 0)]
    seen: set = {pid}
    while stack:
        proc, depth = stack.pop()
        # emit on pop (not on discovery) so the subtree of
        # `proc` directly follows it in the output.
        yield proc, depth
        try:
            kids = proc.children()
        except psutil.NoSuchProcess:
            continue

        fresh: list = []
        for k in kids:
            if k.pid in seen:
                continue
            seen.add(k.pid)
            fresh.append(k)

        # push reversed so the first child is popped first and
        # siblings keep the order `children()` reported.
        stack.extend((k, depth + 1) for k in reversed(fresh))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _walk_tree_pgrep(pid: int) -> list:
    '''
    psutil-less fallback — walk `pid`'s tree via `pgrep -P`.

    Returns `[pid, *descendant_pids]` in pre-order (each pid is
    followed by its own subtree). `pid` itself is always
    included, without checking that it exists. A pid with no
    children, or a missing `pgrep` binary, simply contributes
    no descendants.
    '''
    out: list = []
    # explicit stack instead of recursion: same visiting order,
    # no recursion-depth limit on very deep trees.
    stack: list = [pid]
    while stack:
        cur = stack.pop()
        out.append(cur)
        try:
            kids = sp.check_output(
                ['pgrep', '-P', str(cur)],
                text=True,
            ).split()
        except (sp.CalledProcessError, FileNotFoundError):
            # non-zero exit == no children; FileNotFoundError ==
            # no `pgrep` on PATH. neither should abort the dump.
            continue
        # reversed so the first-listed child is expanded first.
        stack.extend(int(k) for k in reversed(kids))
    return out
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _ensure_sudo_cached() -> bool:
    '''
    Ensure `sudo` credentials are cached so subsequent
    `sudo -n` calls succeed without prompting.

    Returns True if cached (or successfully refreshed),
    False if user cancelled or sudo is unavailable.

    Tries `sudo -n true` first as a no-op probe; if that
    fails, runs `sudo -v` which prompts interactively to
    validate/refresh the credential timestamp.
    '''
    # probe — already cached? guarded separately because a
    # missing `sudo` binary raises here, before `sudo -v` is
    # ever attempted.
    try:
        cached: bool = sp.run(
            ['sudo', '-n', 'true'],
            capture_output=True,
        ).returncode == 0
    except FileNotFoundError:
        print(' sudo not on PATH — proceeding without sudo')
        return False
    if cached:
        return True

    print(
        '[tractor-diag] needs `sudo` for /proc/<pid>/stack '
        'and `py-spy dump`; caching creds via `sudo -v`...'
    )
    try:
        # no output capture: the password prompt must reach the
        # user's terminal.
        rc = sp.run(['sudo', '-v']).returncode
    except KeyboardInterrupt:
        print(' cancelled — proceeding without sudo')
        return False
    except FileNotFoundError:
        print(' sudo not on PATH — proceeding without sudo')
        return False
    return rc == 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- pytree ---------------------------------------------------
|
|
|
|
|
|
|
|
|
|
def _pytree(args):
    '''
    psutil-backed proc tree, split into severity-ordered
    sections so leaked / defunct procs stand out from the
    ordinary `live` rows.

    usage: acli.pytree [--tree|-t] <pid|pgrep-pattern> [...]

    Each proc (not each tree) lands in exactly one bucket,
    checked in this order:

    - zombies: status is zombie/dead (`Z`/`X`) — defunct and
      not yet reaped by its parent.
    - orphans: `ppid == 1` — reparented to init because the
      original parent exited. Covers both the root of an
      abandoned tree and any descendant reparented mid-flight.
    - live: everything else (non-defunct, `ppid != 1`).

    The tree under an orphan root is still walked; its
    descendants are `live` as long as their own parent is not
    init.

    Cross-bucket parent annotation (always on): inside the
    bucketed sections, a non-root row whose parent was filed
    in a different bucket gets a
    `[parent: <pid> (in `<bucket>`)]` suffix so its `└─`
    marker can still be traced to a parent row.

    `--tree` / `-t` (opt-in): additionally print a `## tree`
    section first, listing the same procs in walk order with
    no bucketing and no annotations.

    Returns 1 on usage error, missing psutil or no matching
    proc; otherwise falls off the end (None).
    '''
    tree_flags: tuple = ('--tree', '-t')
    flag_tree: bool = any(a in tree_flags for a in args)
    pos_args: list = [a for a in args if a not in tree_flags]

    if not pos_args:
        print('usage: acli.pytree [--tree|-t] <pid|pgrep-pattern> [...]')
        return 1
    if psutil is None:
        print('pytree requires psutil; install via `uv pip install psutil`')
        return 1

    roots: list = sorted({
        pid
        for a in pos_args
        for pid in _resolve_pids(a)
    })
    if not roots:
        print(f'(no procs match: {pos_args})')
        return 1

    # "defunct" statuses: zombie is the usual one; dead is rare
    # but kernel-reported and just as final. older psutil may
    # lack the constant, hence the literal fallback.
    defunct_statuses: set = {
        psutil.STATUS_ZOMBIE,
        getattr(psutil, 'STATUS_DEAD', 'dead'),
    }

    # bucket name -> [(proc, depth)], most severe first.
    buckets: dict = {'zombies': [], 'orphans': [], 'live': []}
    walk_order: list = []  # [(proc, depth)] in walk order
    gone: list = []  # pids that vanished before classification
    # pid -> bucket name; read by `_row()` for parent lookups.
    pid_to_bucket: dict = {}
    seen: set = set()

    for root_pid in roots:
        for (p, depth) in _walk_tree_with_depth(root_pid):
            if p.pid in seen:
                continue
            seen.add(p.pid)
            try:
                status: str = p.status()
                ppid: int = p.ppid()
            except psutil.NoSuchProcess:
                gone.append(p.pid)
                continue

            # severity order: zombie > orphan > live.
            if status in defunct_statuses:
                which = 'zombies'
            elif ppid == 1:
                which = 'orphans'
            else:
                which = 'live'

            entry = (p, depth)
            buckets[which].append(entry)
            pid_to_bucket[p.pid] = which
            walk_order.append(entry)

    total: int = sum(len(rows) for rows in buckets.values())
    print(f'# pytree: {total} procs across roots {roots}')

    hdr = ' ' + ' '.join((
        'PID'.rjust(7),
        'PPID'.rjust(7),
        'STATUS'.ljust(10),
        'CMD',
    ))

    def _row(entry, bucket: str|None = None):
        '''
        Format one `(proc, depth)` entry as an aligned row.

        Depth shows up only as a `└─` marker inside the CMD
        column, keeping PID/PPID/STATUS aligned. With `bucket`
        set, a non-root row whose parent sits in a different
        bucket gets the `[parent: <pid> (in `<b>`)]` suffix.
        '''
        p, depth = entry
        tree_pfx = (' ' * depth) + ('└─ ' if depth > 0 else '')

        parent_anno: str = ''
        if bucket is not None and depth > 0:
            try:
                parent_pid: int = p.ppid()
            except psutil.NoSuchProcess:
                parent_pid = 0
            # pid 0 (lookup failed) and pid 1 (init) are never
            # annotated.
            parent_bucket: str|None = (
                pid_to_bucket.get(parent_pid)
                if parent_pid not in (0, 1)
                else None
            )
            if parent_bucket not in (None, bucket):
                parent_anno = (
                    f' [parent: {parent_pid} (in `{parent_bucket}`)]'
                )

        # NOTE: `psutil.ZombieProcess` subclasses
        # `psutil.NoSuchProcess`, yet a zombie is still there —
        # only its cmdline is unreadable. it is therefore
        # handled first so a row is rendered from the fields
        # that still work.
        try:
            cmd = ' '.join(p.cmdline())[:140] or '[' + p.name() + ']'
            cols = (
                str(p.pid).rjust(7),
                str(p.ppid()).rjust(7),
                p.status().ljust(10),
                tree_pfx + cmd + parent_anno,
            )
            return ' ' + ' '.join(cols)
        except psutil.ZombieProcess:
            try:
                ppid_str = str(p.ppid())
                name = p.name()
            except psutil.NoSuchProcess:
                ppid_str, name = '?', '?'
            cols = (
                str(p.pid).rjust(7),
                ppid_str.rjust(7),
                'zombie'.ljust(10),
                tree_pfx + '[' + name + ' <defunct>]' + parent_anno,
            )
            return ' ' + ' '.join(cols)
        except psutil.NoSuchProcess:
            return ' ' + str(p.pid).rjust(7) + ' (gone mid-walk)'

    def _section(
        title: str,
        procs: list,
        hint: str = '',
        bucket: str|None = None,
    ):
        '''Print one titled section: header line, then rows.'''
        suffix = f' — {hint}' if hint else ''
        print(f'\n## {title} ({len(procs)})' + suffix)
        if not procs:
            print(' (none)')
            return
        print(hdr)
        for entry in procs:
            print(_row(entry, bucket=bucket))

    # opt-in walk-order listing; no `bucket` is passed, so no
    # cross-bucket annotation is added here.
    if flag_tree:
        _section(
            'tree', walk_order,
            'flat walk-order, parent-child preserved',
        )

    # bucketed sections, most concerning first; each passes its
    # own name so `_row()` can flag parents filed elsewhere.
    _section(
        'zombies', buckets['zombies'],
        'status `Z`/`X`, parent has not reaped',
        bucket='zombies',
    )
    _section(
        'orphans', buckets['orphans'],
        '`ppid==1`, reparented to init (leaked / parent gone)',
        bucket='orphans',
    )
    _section('live', buckets['live'], bucket='live')

    if gone:
        print(f'\n## gone-during-walk ({len(gone)}): {gone}')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- hung-dump ------------------------------------------------
|
|
|
|
|
|
|
|
|
|
def _hung_dump(args):
    '''
    kernel + python state for a hung pytest/tractor tree.
    walks all descendants of each `<pid|pgrep-pat>` arg.

    usage: acli.hung_dump <pid|pgrep-pattern> [...]

    per pid, prints `/proc/<pid>/wchan`, `/proc/<pid>/stack`
    and a `py-spy dump --locals`.

    note: `/proc/<pid>/stack` and `py-spy dump` typically
    require CAP_SYS_PTRACE — invoked via `sudo -n`. creds are
    cached up front via `_ensure_sudo_cached()`; when that
    fails, permission-denied reads are reported and `py-spy`
    is skipped.

    returns 1 on usage error or when no proc matches.
    '''
    if not args:
        print('usage: acli.hung_dump <pid|pgrep-pattern> [...]')
        return 1

    # cache sudo creds upfront so per-pid `sudo -n` calls
    # for `cat /proc/<pid>/stack` and `py-spy dump` don't
    # each prompt (or silently fail).
    have_sudo: bool = _ensure_sudo_cached()

    roots: list = []
    for a in args:
        roots.extend(_resolve_pids(a))
    roots = sorted(set(roots))
    if not roots:
        print(f'(no procs match: {args})')
        return 1

    # flatten every root's tree into one de-duplicated pid list,
    # preserving first-seen order.
    pids: list = []
    seen: set = set()
    for r in roots:
        if psutil is not None:
            walk = [p.pid for p in _walk_tree_psutil(r)]
        else:
            walk = _walk_tree_pgrep(r)
        for pid in walk:
            if pid not in seen:
                seen.add(pid)
                pids.append(pid)

    print(f'# tree: {pids}')
    print('\n## ps forest')
    $[ps -o pid,ppid,pgid,stat,cmd -p @(','.join(map(str, pids)))]

    for pid in pids:
        print(f'\n## pid {pid}')

        for f in ('wchan', 'stack'):
            path = Path(f'/proc/{pid}/{f}')
            try:
                txt = path.read_text().rstrip()
                print(f'-- /proc/{pid}/{f} --\n{txt}')
            except PermissionError:
                if not have_sudo:
                    print(
                        f'-- /proc/{pid}/{f}: '
                        'PermissionError (no sudo) --'
                    )
                    continue
                # unprivileged read refused: retry through the
                # cached sudo creds.
                try:
                    txt = sp.check_output(
                        ['sudo', '-n', 'cat', str(path)],
                        text=True,
                        stderr=sp.DEVNULL,
                    ).rstrip()
                    print(f'-- /proc/{pid}/{f} (sudo) --\n{txt}')
                except sp.CalledProcessError:
                    print(
                        f'-- /proc/{pid}/{f}: '
                        'sudo cred expired? rerun --'
                    )
            except (FileNotFoundError, ProcessLookupError):
                # the entry is missing at open time, or the read
                # itself reports that the proc is gone.
                print(f'-- /proc/{pid}/{f}: proc gone --')

        print(f'-- py-spy {pid} --')
        if not have_sudo:
            print(' (skipped — no sudo)')
            continue
        try:
            $[sudo -n py-spy dump --pid @(pid) --locals]
        except Exception as e:
            print(f' (py-spy failed: {e})')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- bindspace-scan -------------------------------------------
|
|
|
|
|
|
|
|
|
|
def _bindspace_scan(args):
    '''
    Scan a tractor UDS bindspace dir for orphan sock files
    (those whose embedded `<pid>` no longer corresponds to
    a live process).

    usage: acli.bindspace_scan [<dir>]
    default: `$XDG_RUNTIME_DIR/tractor`
             (or `/run/user/<uid>/tractor`)

    Only reports; nothing is unlinked. When orphans are found a
    ready-to-paste, shell-quoted `rm` command is printed.

    Returns 1 when the dir does not exist, None otherwise.
    '''
    import shlex

    if args:
        bs_dir = Path(args[0])
    else:
        runtime = os.environ.get(
            'XDG_RUNTIME_DIR',
            f'/run/user/{os.getuid()}',
        )
        bs_dir = Path(runtime) / 'tractor'

    if not bs_dir.exists():
        print(f'(no bindspace at {bs_dir})')
        return 1

    socks = sorted(bs_dir.glob('*.sock'))
    print(f'## bindspace {bs_dir} ({len(socks)} sock file(s))')

    live: list = []  # [(path, pid, actor_name)]
    orphans: list = []  # same shape as `live`
    bogus: list = []  # paths not matching `<name>@<pid>.sock`

    for s in socks:
        m = _UDS_SOCK_RE.match(s.name)
        if not m:
            bogus.append(s)
            continue
        pid = int(m['pid'])
        name = m['name']
        try:
            # signal 0: existence/permission check only, nothing
            # is delivered.
            os.kill(pid, 0)
        except ProcessLookupError:
            orphans.append((s, pid, name))
        except PermissionError:
            # exists but owned by another user
            live.append((s, pid, name))
        else:
            live.append((s, pid, name))

    def _report(title: str, rows: list, suffix: str = '') -> None:
        '''Print one `## <title> (<n>)` section of sock rows.'''
        print(f'\n## {title} ({len(rows)})')
        if not rows:
            print(' (none)')
        for s, pid, name in rows:
            row = ' ' + str(pid).rjust(7)
            row += ' ' + name.ljust(32)
            row += ' ' + s.name + suffix
            print(row)

    _report('live', live)
    _report('orphaned', orphans, ' (no live proc)')

    if bogus:
        print(f'\n## unparseable ({len(bogus)})')
        for s in bogus:
            print(f' {s.name}')

    if orphans:
        # quote each path: the actor-name part of the filename is
        # arbitrary text and the line is meant to be pasted into
        # a shell.
        unlink_cmd = ' '.join(
            shlex.quote(str(o[0])) for o in orphans
        )
        print(f'\nto unlink orphans:\n rm {unlink_cmd}')
|
|
|
|
|
|
|
|
|
|
|
2026-05-07 22:07:34 +00:00
|
|
|
# --- acli.reap ------------------------------------------------
|
|
|
|
|
|
|
|
|
|
def _tractor_reap(args):
    '''
    SC-polite zombie-subactor reaper + optional `/dev/shm/`
    orphan-segment sweep + optional UDS sock-file sweep.

    usage: acli.reap [-h] [--parent PID] [--grace SEC]
                     [--dry-run] [--shm | --shm-only]
                     [--uds | --uds-only]

    phases (run in order when enabled):

    1. process reap — finds tractor subactor procs left
       alive after a `pytest`/app run that failed to fully
       cancel its tree. Default = orphan-mode (PPid==1
       init-reparented procs whose cwd matches repo root
       AND cmdline contains `python`). With `--parent`,
       scopes to descendants of a specific live PID.
       SIGINT first, then SIGKILL after `--grace` (default
       3.0s).
    2. shm sweep (`--shm`/`--shm-only`) — unlinks
       `/dev/shm/<file>` entries owned by the current uid
       that no live process has open. Needed because
       `tractor` disables `mp.resource_tracker`.
    3. UDS sweep (`--uds`/`--uds-only`) — unlinks
       `${XDG_RUNTIME_DIR}/tractor/<name>@<pid>.sock`
       files whose binder pid is dead (or the `1616`
       registry sentinel). See issue #452.

    Mirrors `scripts/tractor-reap` (use `-n`/`--dry-run`
    first to see what would be touched).

    '''
    import argparse
    import sys

    # NOTE: the docstring above doubles as the `-h` description.
    parser = argparse.ArgumentParser(
        prog='acli.reap',
        description=_tractor_reap.__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        '--parent', '-p',
        type=int,
        default=None,
        help='descendant-mode: reap procs with PPid==<pid>',
    )
    parser.add_argument(
        '--grace', '-g',
        type=float,
        default=3.0,
        help='SIGINT grace window in seconds (default 3.0)',
    )
    # the remaining options are plain on/off switches.
    for opt_strings, help_txt in (
        (
            ('--dry-run', '-n'),
            'list matched pids/paths but do not signal/unlink',
        ),
        (
            ('--shm',),
            'also unlink orphaned /dev/shm segments',
        ),
        (
            ('--shm-only',),
            'skip process reap; only do the shm sweep',
        ),
        (
            ('--uds',),
            'also unlink orphaned UDS sock-files',
        ),
        (
            ('--uds-only',),
            'skip process reap + shm; only do the UDS sweep',
        ),
    ):
        parser.add_argument(
            *opt_strings,
            action='store_true',
            help=help_txt,
        )

    try:
        ns = parser.parse_args(args)
    except SystemExit as se:
        # `argparse` exits on `-h` / bad args; hand the code back
        # as an ordinary alias return value instead.
        return 0 if se.code is None else int(se.code)

    skip_proc_reap: bool = ns.shm_only or ns.uds_only

    # repo root: ask git first, else fall back to two levels up
    # from this file. mirrors `scripts/tractor-reap._repo_root()`.
    try:
        repo: Path = Path(
            sp.check_output(
                ['git', 'rev-parse', '--show-toplevel'],
                stderr=sp.DEVNULL,
                text=True,
            ).strip()
        )
    except (sp.CalledProcessError, FileNotFoundError):
        repo = Path(__file__).resolve().parent.parent

    # the reap helpers are imported lazily: the package may not
    # have been importable when the xontrib was loaded (e.g.
    # sourced before the venv was activated).
    repo_entry: str = str(repo)
    if repo_entry not in sys.path:
        sys.path.insert(0, repo_entry)
    from tractor._testing._reap import (
        find_descendants,
        find_orphans,
        find_orphaned_shm,
        find_orphaned_uds,
        reap,
        reap_shm,
        reap_uds,
    )

    def _sweep(leaked: list, none_msg: str, noun: str, unlink) -> bool:
        '''
        Run one sweep phase over `leaked`; True only when
        `unlink` was called and reported errors.
        '''
        if not leaked:
            print(none_msg)
            return False
        if ns.dry_run:
            print(
                f'[acli.reap] dry-run — {len(leaked)} '
                f'orphaned {noun}:\n {leaked}'
            )
            return False
        _, errors = unlink(leaked)
        return bool(errors)

    rc: int = 0

    # phase 1: process reap (skipped under `--*-only`)
    if not skip_proc_reap:
        if ns.parent is None:
            pids: list = find_orphans(repo)
            mode: str = f'orphans (PPid=1, cwd={repo})'
        else:
            pids = find_descendants(ns.parent)
            mode = f'descendants of PPid={ns.parent}'

        if not pids:
            print(f'[acli.reap] no {mode} to reap')
        elif ns.dry_run:
            print(
                f'[acli.reap] dry-run — {mode}:\n {pids}'
            )
        else:
            _, survivors = reap(pids, grace=ns.grace)
            if survivors:
                rc = 1

    # phase 2: shm sweep (opt-in)
    if ns.shm or ns.shm_only:
        if _sweep(
            find_orphaned_shm(),
            '[acli.reap] no orphaned /dev/shm '
            'segments to sweep',
            'shm segment(s)',
            reap_shm,
        ):
            rc = 1

    # phase 3: UDS sweep (opt-in)
    if ns.uds or ns.uds_only:
        if _sweep(
            find_orphaned_uds(),
            '[acli.reap] no orphaned UDS sock-files '
            'to sweep',
            'UDS sock-file(s)',
            reap_uds,
        ):
            rc = 1

    return rc
|
|
|
|
|
|
|
|
|
|
|
2026-05-06 18:07:24 +00:00
|
|
|
# --- registration ---------------------------------------------
|
|
|
|
|
|
2026-05-07 22:07:34 +00:00
|
|
|
# every alias is registered under an `acli.` prefix. there is
# no parent `acli` cmd — the dot is only a naming convention,
# chosen so xonsh's prefix-completion lists the whole set when
# you type `acli.<TAB>`, much like a sub-cmd group.
_TCLI_ALIASES: dict = {
    'acli.pytree': _pytree,
    'acli.hung_dump': _hung_dump,
    'acli.bindspace_scan': _bindspace_scan,
    'acli.reap': _tractor_reap,
}

# install into the session's `aliases` mapping; this runs both
# on `xontrib load` and when the file is sourced directly.
for _name, _fn in _TCLI_ALIASES.items():
    aliases[_name] = _fn
|
2026-05-06 18:07:24 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
# xontrib protocol hooks, used by `xontrib load tractor_diag`;
# defining them is harmless when the file is sourced directly.
def _load_xontrib_(xsh, **_):
    '''
    Load hook: nothing to do here since the aliases are
    registered at module level; returns an empty dict.
    '''
    return dict()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _unload_xontrib_(xsh, **_):
    '''
    Unload hook: drop every alias listed in `_TCLI_ALIASES`
    from the session (already-missing ones are ignored) and
    return an empty dict.
    '''
    for alias_name in _TCLI_ALIASES.keys():
        aliases.pop(alias_name, None)
    return {}
|