diff --git a/xontrib/tractor_diag.xsh b/xontrib/tractor_diag.xsh index 9230144b..d1231c59 100644 --- a/xontrib/tractor_diag.xsh +++ b/xontrib/tractor_diag.xsh @@ -1,15 +1,23 @@ """ `xontrib_tractor_diag`: pytest/tractor diagnostic aliases. +All aliases live under the `acli.` namespace so xonsh's +prefix-completion treats them as a sub-cmd group — type +`acli.` to see the full set. + Provides: - - `pytree ` psutil-backed proc tree, - live + zombies split. - - `hung-dump [...]` kernel `wchan`/`stack` + - `py-spy dump` (incl `--locals`) - for each pid in tree. - - `bindspace-scan []` find orphaned tractor UDS - sock files (no live owner pid). - default: `$XDG_RUNTIME_DIR/tractor`. + - `acli.pytree ` psutil-backed proc tree, + live + zombies split. + - `acli.hung_dump [...]` kernel `wchan`/`stack` + + `py-spy dump` (incl `--locals`) + for each pid in tree. + - `acli.bindspace_scan []` find orphaned tractor UDS + sock files (no live owner pid). + default: `$XDG_RUNTIME_DIR/tractor`. + - `acli.reap [opts]` SC-polite zombie-subactor + reaper + optional `/dev/shm/` + + UDS sock-file sweeps. + alias for `scripts/tractor-reap`. Loading from repo root: xontrib load -p ./xontrib tractor_diag @@ -36,7 +44,7 @@ except ImportError: psutil = None print( '[tractor-diag] `psutil` missing — ' - 'pytree disabled, hung-dump uses pgrep fallback. ' + 'acli.pytree disabled, acli.hung_dump uses pgrep fallback. ' '`uv pip install psutil` for full functionality.' ) @@ -157,7 +165,7 @@ def _pytree(args): severity-ordered buckets so leaked / defunct procs don't hide in the noise of normal `live` rows. - usage: pytree [--tree|-t] [...] + usage: acli.pytree [--tree|-t] [...] 
classification (per-proc, not per-tree): @@ -199,7 +207,7 @@ def _pytree(args): pos_args.append(a) if not pos_args: - print('usage: pytree [--tree|-t] [...]') + print('usage: acli.pytree [--tree|-t] [...]') return 1 if psutil is None: print('pytree requires psutil; install via `uv pip install psutil`') @@ -378,14 +386,14 @@ def _hung_dump(args): kernel + python state for a hung pytest/tractor tree. walks all descendants of each `` arg. - usage: hung-dump [...] + usage: acli.hung_dump [...] note: `/proc//stack` and `py-spy dump` typically require CAP_SYS_PTRACE — invoked via `sudo -n`. run `sudo true` first to cache creds. ''' if not args: - print('usage: hung-dump [...]') + print('usage: acli.hung_dump [...]') return 1 # cache sudo creds upfront so per-pid `sudo -n` calls @@ -465,7 +473,7 @@ def _bindspace_scan(args): (those whose embedded `` no longer corresponds to a live process). - usage: bindspace-scan [] + usage: acli.bindspace_scan [] default: `$XDG_RUNTIME_DIR/tractor` (or `/run/user//tractor`) ''' @@ -533,11 +541,203 @@ def _bindspace_scan(args): print(f'\nto unlink orphans:\n rm {unlink_cmd}') +# --- acli.reap ------------------------------------------------ + +def _tractor_reap(args): + ''' + SC-polite zombie-subactor reaper + optional `/dev/shm/` + orphan-segment sweep + optional UDS sock-file sweep. + + usage: acli.reap [-h] [--parent PID] [--grace SEC] + [--dry-run] [--shm | --shm-only] + [--uds | --uds-only] + + phases (run in order when enabled): + + 1. process reap — finds tractor subactor procs left + alive after a `pytest`/app run that failed to fully + cancel its tree. Default = orphan-mode (PPid==1 + init-reparented procs whose cwd matches repo root + AND cmdline contains `python`). With `--parent`, + scopes to descendants of a specific live PID. + SIGINT first, then SIGKILL after `--grace` (default + 3.0s). + 2. shm sweep (`--shm`/`--shm-only`) — unlinks + `/dev/shm/` entries owned by the current uid + that no live process has open. 
Needed because + `tractor` disables `mp.resource_tracker`. + 3. UDS sweep (`--uds`/`--uds-only`) — unlinks + `${XDG_RUNTIME_DIR}/tractor/@.sock` + files whose binder pid is dead (or the `1616` + registry sentinel). See issue #452. + + Mirrors `scripts/tractor-reap` (use `-n`/`--dry-run` + first to see what would be touched). + + ''' + import argparse + + parser = argparse.ArgumentParser( + prog='acli.reap', + description=_tractor_reap.__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + '--parent', '-p', + type=int, + default=None, + help='descendant-mode: reap procs with PPid==', + ) + parser.add_argument( + '--grace', '-g', + type=float, + default=3.0, + help='SIGINT grace window in seconds (default 3.0)', + ) + parser.add_argument( + '--dry-run', '-n', + action='store_true', + help='list matched pids/paths but do not signal/unlink', + ) + parser.add_argument( + '--shm', + action='store_true', + help='also unlink orphaned /dev/shm segments', + ) + parser.add_argument( + '--shm-only', + action='store_true', + help='skip process reap; only do the shm sweep', + ) + parser.add_argument( + '--uds', + action='store_true', + help='also unlink orphaned UDS sock-files', + ) + parser.add_argument( + '--uds-only', + action='store_true', + help='skip process reap + shm; only do the UDS sweep', + ) + + try: + ns = parser.parse_args(args) + except SystemExit as se: + # `argparse` raises SystemExit on `-h`/bad-args; let + # xonsh treat it as a normal alias return code. + return int(se.code) if se.code is not None else 0 + + skip_proc_reap: bool = ( + ns.shm_only + or + ns.uds_only + ) + + # repo-root resolution: `git rev-parse --show-toplevel` + # first, falling back to the xontrib file's parent of + # parent. mirrors `scripts/tractor-reap._repo_root()`. 
+ try: + repo_str: str = sp.check_output( + ['git', 'rev-parse', '--show-toplevel'], + stderr=sp.DEVNULL, + text=True, + ).strip() + repo: Path = Path(repo_str) + except (sp.CalledProcessError, FileNotFoundError): + repo: Path = Path(__file__).resolve().parent.parent + + # lazy-import the reap helpers since the package may not + # have been on `sys.path` at xontrib-load time (e.g. the + # contrib was sourced before activating the venv). + import sys + if str(repo) not in sys.path: + sys.path.insert(0, str(repo)) + from tractor._testing._reap import ( + find_descendants, + find_orphans, + find_orphaned_shm, + find_orphaned_uds, + reap, + reap_shm, + reap_uds, + ) + + rc: int = 0 + + # phase 1: process reap (skipped under `--*-only`) + if not skip_proc_reap: + if ns.parent is not None: + pids: list = find_descendants(ns.parent) + mode: str = f'descendants of PPid={ns.parent}' + else: + pids = find_orphans(repo) + mode = f'orphans (PPid=1, cwd={repo})' + + if not pids: + print(f'[acli.reap] no {mode} to reap') + elif ns.dry_run: + print( + f'[acli.reap] dry-run — {mode}:\n {pids}' + ) + else: + _, survivors = reap(pids, grace=ns.grace) + if survivors: + rc = 1 + + # phase 2: shm sweep (opt-in) + if ns.shm or ns.shm_only: + leaked: list = find_orphaned_shm() + if not leaked: + print( + '[acli.reap] no orphaned /dev/shm ' + 'segments to sweep' + ) + elif ns.dry_run: + print( + f'[acli.reap] dry-run — {len(leaked)} ' + f'orphaned shm segment(s):\n {leaked}' + ) + else: + _, errors = reap_shm(leaked) + if errors: + rc = 1 + + # phase 3: UDS sweep (opt-in) + if ns.uds or ns.uds_only: + leaked_uds: list = find_orphaned_uds() + if not leaked_uds: + print( + '[acli.reap] no orphaned UDS sock-files ' + 'to sweep' + ) + elif ns.dry_run: + print( + f'[acli.reap] dry-run — {len(leaked_uds)} ' + f'orphaned UDS sock-file(s):\n {leaked_uds}' + ) + else: + _, errors = reap_uds(leaked_uds) + if errors: + rc = 1 + + return rc + + # --- registration 
# --- registration ---------------------------------------------

# every alias is registered under the `acli.` prefix so that
# xonsh's prefix-completion presents them like a sub-cmd
# group: typing `acli.` suggests the whole set. there is no
# parent `acli` cmd — the dot is only a naming convention.
_TCLI_ALIASES: dict = {
    'acli.pytree': _pytree,
    'acli.hung_dump': _hung_dump,
    'acli.bindspace_scan': _bindspace_scan,
    'acli.reap': _tractor_reap,
}

# bulk-register in one call rather than a per-item loop.
aliases.update(_TCLI_ALIASES)


# xontrib protocol hooks (for `xontrib load tractor_diag`).

def _unload_xontrib_(xsh, **_):
    '''
    xontrib unload hook: drop every alias named in
    `_TCLI_ALIASES` (ignoring any that are already gone)
    and hand back an empty dict.

    '''
    for alias_name in _TCLI_ALIASES:
        aliases.pop(alias_name, None)
    return {}