From 522b57570b5e15c5e5e16a7d9f2ac5400d5bc9ba Mon Sep 17 00:00:00 2001 From: goodboy Date: Fri, 8 May 2026 00:51:05 -0400 Subject: [PATCH] Add `_is_tractor_subactor()`, cgroup-aware `ptree` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rework reap/diag tooling to identify tractor sub-actors via intrinsic proc signals — cmdline/comm markers from `setproctitle` — instead of env-var or cwd matching. Deats, - new `_is_tractor_subactor()` checks cmdline for `tractor[` / `tractor._child` markers, falls back to `/proc//comm` for zombie-resilient detection (kernel preserves `comm` past exit until reap) - `_read_comm()` reads kernel per-task name set by `setproctitle()` — the zombie-safe ID signal - `_read_status_state()` reads single-letter proc state from `/proc//status` (`Z` = zombie) - `find_orphans()` drops `repo_root` requirement, uses `_is_tractor_subactor()` for intrinsic sub-actor ID instead of cwd coincidence-matching - new `find_zombies()` with optional `parent_pid` filter for zombie-state sub-actors Also, - rename `pytree` -> `ptree` throughout xontrib - add `_which_cgroup_slice()` — reads `/proc//cgroup` to distinguish `system.slice` services vs `user.slice` desktop apps from genuinely leaked orphans - `_ptree` classifies `ppid==1` procs into `system-slice`, `user-slice`, and `orphans` buckets with per-section output - `_tractor_reap` drops `git rev-parse` / `sys.path` hack — assumes tractor importable from active venv (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code --- tractor/_testing/_reap.py | 167 ++++++++++++++++++++++++++++++++++---- xontrib/tractor_diag.xsh | 162 +++++++++++++++++++++++++++--------- 2 files changed, 276 insertions(+), 53 deletions(-) diff --git a/tractor/_testing/_reap.py b/tractor/_testing/_reap.py index 797c337d..31e09dfa 100644 --- a/tractor/_testing/_reap.py +++ b/tractor/_testing/_reap.py @@ -188,6 +188,86 @@ def _read_cmdline(pid: int) -> str: return '' +def _read_comm(pid: int) -> str: + ''' + Read `/proc//comm` — the kernel's per-task name + (truncated to ~15 bytes on Linux). Set by + `setproctitle.setproctitle()` so this is one of the + most reliable identifiers for tractor sub-actors — + notably, **survives zombie state** (kernel preserves + `comm` even after exit, until reaped) where + `cmdline`/`environ` may not. + + ''' + try: + with open(f'/proc/{pid}/comm') as f: + return f.read().rstrip('\n') + except ( + FileNotFoundError, + PermissionError, + ProcessLookupError, + ): + return '' + + +# Intrinsic markers that identify a tractor sub-actor +# regardless of cwd / venv path / launch context. Used by +# `_is_tractor_subactor()` below. +# +# - cmdline `tractor[`: matches the `setproctitle`-set form +# (`tractor[]`) — set in +# `_actor_child_main` for ALL backends, mutates argv via +# libc so visible in `/proc//cmdline`. +# - cmdline `tractor._child`: matches the legacy +# `python -m tractor._child --uid (...)` form. Catches +# procs that died before `_actor_child_main` got to call +# `setproctitle()` — argv from exec is still kernel- +# visible at that point. +# - comm `tractor[`: same proctitle-set form, but visible +# via `/proc//comm` (kernel-truncated to ~15 bytes, +# `tractor[doggy:`). Critical for ZOMBIES — kernel +# preserves `comm` past task-exit until parent reaps, +# while `cmdline` for zombies often reads as empty. +_TRACTOR_PROC_CMDLINE_MARKERS: tuple[str, ...] = ( + 'tractor._child', + 'tractor[', +) +_TRACTOR_PROC_COMM_MARKER: str = 'tractor[' + + +def _is_tractor_subactor(pid: int) -> bool: + ''' + Detect whether `pid` is a tractor sub-actor process + using **intrinsic** signals — cmdline → comm — in + priority order. + + No filesystem-state coupling (cwd / venv path) and no + env-var dependency: `setproctitle`-mutated argv (set + in `_actor_child_main`) covers all live + most-zombie + cases; legacy `python -m tractor._child` cmdline + catches anything that died before `setproctitle` ran; + kernel `comm` covers zombies that survived past + `_actor_child_main` long enough to setproctitle. + + ''' + # 1. cmdline match — catches both `setproctitle`-set + # `tractor[]` (live) AND legacy `python -m + # tractor._child` (any) form. + cmdline: str = _read_cmdline(pid) + if any(m in cmdline for m in _TRACTOR_PROC_CMDLINE_MARKERS): + return True + + # 2. Zombie-resilient fallback: kernel-preserved `comm` + # (set by setproctitle). Critical for zombies whose + # `cmdline` reads as empty post-exit but whose + # `comm` survives to `wait()` time. + comm: str = _read_comm(pid) + if _TRACTOR_PROC_COMM_MARKER in comm: + return True + + return False + + def _iter_live_pids() -> list[int]: ''' Enumerate currently-alive pids from `/proc`. Returns @@ -291,34 +371,89 @@ def find_runaway_subactors( return runaways +def _read_status_state(pid: int) -> str | None: + ''' + Return the single-letter task state from + `/proc//status` (`R`/`S`/`D`/`Z`/`T`/`X`/`I`) or + `None` if unreadable. `Z` = zombie. + + ''' + try: + with open(f'/proc/{pid}/status') as f: + for line in f: + if line.startswith('State:'): + # `State:\tZ (zombie)` -> 'Z' + parts = line.split() + if len(parts) >= 2: + return parts[1] + except ( + FileNotFoundError, + PermissionError, + ProcessLookupError, + ): + return None + return None + + def find_orphans( - repo_root: pathlib.Path, + repo_root: pathlib.Path|None = None, ) -> list[int]: ''' - PIDs that are: + PIDs that are reparented to init (`PPid == 1`) AND + are tractor sub-actors per `_is_tractor_subactor()`'s + intrinsic checks (env-var → cmdline → comm). - - reparented to init (`PPid == 1`), - - have `cwd == `, - - and have a `python` in their cmdline. - - This is the "pytest-died-mid-session" case where the - subactor forks got reparented. The cwd filter is the - critical bit that keeps us from sweeping up unrelated - init-children on the box. + The `repo_root` arg is kept for back-compat with + callers that previously passed it (the old impl used + it to filter by cwd) but is no longer required — + tractor sub-actor identity is intrinsic to the proc, + not its launch context. ''' - repo: str = str(repo_root) + # `repo_root` kept in signature for back-compat; today + # the intrinsic env/cmdline/comm signals identify a + # tractor sub-actor without coincidence-of-cwd + # matching. Suppressed-arg stays a no-op so existing + # callers don't have to change. + _ = repo_root # noqa hits: list[int] = [] for pid in _iter_live_pids(): if _read_status_ppid(pid) != 1: continue - cwd: str | None = _read_cwd(pid) - if cwd != repo: + if _is_tractor_subactor(pid): + hits.append(pid) + return hits + + +def find_zombies( + parent_pid: int|None = None, +) -> list[int]: + ''' + PIDs in zombie state (`/proc//status: State: Z`) + that are tractor sub-actors per + `_is_tractor_subactor()`. + + When `parent_pid` is given, restricts to descendants + of that pid (typical for pytest session-end fixture + use). When `None`, scans all zombies on the box. + + Detection for zombies relies primarily on + `/proc//comm` (kernel-preserved past zombie + state, set by `setproctitle`) since + `cmdline`/`environ` are usually empty post-exit. + + ''' + hits: list[int] = [] + for pid in _iter_live_pids(): + if _read_status_state(pid) != 'Z': continue - cmd: str = _read_cmdline(pid) - if 'python' not in cmd: + if ( + parent_pid is not None + and _read_status_ppid(pid) != parent_pid + ): continue - hits.append(pid) + if _is_tractor_subactor(pid): + hits.append(pid) return hits diff --git a/xontrib/tractor_diag.xsh b/xontrib/tractor_diag.xsh index d1231c59..5316a9bb 100644 --- a/xontrib/tractor_diag.xsh +++ b/xontrib/tractor_diag.xsh @@ -6,7 +6,7 @@ prefix-completion treats them as a sub-cmd group — type `acli.` to see the full set. Provides: - - `acli.pytree ` psutil-backed proc tree, + - `acli.ptree ` psutil-backed proc tree, live + zombies split. - `acli.hung_dump [...]` kernel `wchan`/`stack` + `py-spy dump` (incl `--locals`) @@ -28,7 +28,7 @@ Or source directly: Pipe-to-paste idiom (xonsh): hung-dump pytest |t /tmp/hung.log -Requires `psutil` for full functionality (`pytree` and the +Requires `psutil` for full functionality (`ptree` and the `hung-dump` tree-walk). Falls back to `pgrep -P` recursion if missing. """ @@ -44,7 +44,7 @@ except ImportError: psutil = None print( '[tractor-diag] `psutil` missing — ' - 'acli.pytree disabled, acli.hung_dump uses pgrep fallback. ' + 'acli.ptree disabled, acli.hung_dump uses pgrep fallback. ' '`uv pip install psutil` for full functionality.' ) @@ -84,7 +84,7 @@ def _walk_tree_with_depth(pid: int): ''' Yield `(proc, depth)` pairs walking `pid`'s tree. `depth==0` is the root; `depth==1` are direct children, etc. Used by - `pytree` to render parent/child relationships visually. + `ptree` to render parent/child relationships visually. ''' try: root = psutil.Process(pid) @@ -107,6 +107,56 @@ def _walk_tree_with_depth(pid: int): stack.append((k, d + 1)) +def _which_cgroup_slice(pid: int) -> str|None: + ''' + Return which top-level systemd cgroup slice `pid` is + rooted in, or `None` if it's not in either: + + - `'system'`: under `/system.slice/...` — typically + `.service` units (long-lived daemons explicitly + enabled via `systemctl enable`, e.g. + `auto-cpufreq.service`, `dbus.service`, + `systemd-journald.service`). + + - `'user'`: under `/user.slice/user-.slice/...` + — typically `.scope` units that systemd auto-wraps + around desktop-launched apps + login-session + procs (e.g. `app-firefox-.scope`, + `session-.scope`). + + - `None`: NOT in either slice — pid 1 is NOT + managing this proc via cgroup. Combined with + `ppid==1`, this is the genuine "leaked / parent + died" orphan signal. + + Both slice categories are by-design `ppid==1` (pid 1 + is actively managing them) and should NOT be flagged + as concerning orphans, but distinguishing them is + useful: `system.slice` is "real services on this + box", `user.slice` is "stuff in your login session". + + Returns `None` on any read error (proc gone, perm + denied, non-Linux, etc.) — callers should treat that + as "unknown, classify as plain orphan". + + ''' + try: + with open(f'/proc/{pid}/cgroup') as f: + cg: str = f.read() + except ( + FileNotFoundError, + PermissionError, + ProcessLookupError, + OSError, + ): + return None + if '/system.slice/' in cg: + return 'system' + if '/user.slice/' in cg: + return 'user' + return None + + def _walk_tree_pgrep(pid: int) -> list: '''psutil-less fallback — recursive `pgrep -P`.''' out = [pid] @@ -157,15 +207,15 @@ def _ensure_sudo_cached() -> bool: return rc == 0 -# --- pytree --------------------------------------------------- +# --- ptree --------------------------------------------------- -def _pytree(args): +def _ptree(args): ''' psutil-backed proc tree; per-proc classification into severity-ordered buckets so leaked / defunct procs don't hide in the noise of normal `live` rows. - usage: acli.pytree [--tree|-t] [...] + usage: acli.ptree [--tree|-t] [...] classification (per-proc, not per-tree): @@ -207,10 +257,10 @@ def _pytree(args): pos_args.append(a) if not pos_args: - print('usage: acli.pytree [--tree|-t] [...]') + print('usage: acli.ptree [--tree|-t] [...]') return 1 if psutil is None: - print('pytree requires psutil; install via `uv pip install psutil`') + print('ptree requires psutil; install via `uv pip install psutil`') return 1 roots: list = [] @@ -233,6 +283,15 @@ def _pytree(args): walk_order: list = [] # [(proc, depth)] preserved walk order live: list = [] # [(proc, depth)] orphans: list = [] + # `ppid==1` AND rooted in `/system.slice/` cgroup — + # real systemd-managed services (e.g. `auto-cpufreq`, + # `NetworkManager`). + system_slice: list = [] + # `ppid==1` AND rooted in `/user.slice/.../*.scope` — + # desktop-launched apps wrapped by systemd-user in + # transient `.scope` units (e.g. Firefox, browsers, + # editors started from a launcher). + user_slice: list = [] zombies: list = [] gone: list = [] @@ -252,20 +311,43 @@ def _pytree(args): gone.append(p.pid) continue entry = (p, depth) - # severity order: zombie > orphan > live. + # severity order: + # zombie > orphan > system-slice > user-slice > live + # `ppid==1` splits into: + # - `system-slice` (rooted in `/system.slice/` — + # real services, by-design `ppid==1`) + # - `user-slice` (rooted in + # `/user.slice/.../*.scope` — desktop apps + # wrapped by systemd-user, by-design `ppid==1`) + # - `orphans` (everything else with `ppid==1` — + # genuinely concerning). if status in defunct_statuses: zombies.append(entry) pid_to_bucket[p.pid] = 'zombies' elif ppid == 1: - orphans.append(entry) - pid_to_bucket[p.pid] = 'orphans' + slice_kind: str|None = _which_cgroup_slice(p.pid) + if slice_kind == 'system': + system_slice.append(entry) + pid_to_bucket[p.pid] = 'system-slice' + elif slice_kind == 'user': + user_slice.append(entry) + pid_to_bucket[p.pid] = 'user-slice' + else: + orphans.append(entry) + pid_to_bucket[p.pid] = 'orphans' else: live.append(entry) pid_to_bucket[p.pid] = 'live' walk_order.append(entry) - total: int = len(live) + len(orphans) + len(zombies) - print(f'# pytree: {total} procs across roots {roots}') + total: int = ( + len(live) + + len(orphans) + + len(system_slice) + + len(user_slice) + + len(zombies) + ) + print(f'# ptree: {total} procs across roots {roots}') hdr = ' ' + 'PID'.rjust(7) + ' ' + 'PPID'.rjust(7) + ' ' hdr += 'STATUS'.ljust(10) + ' CMD' @@ -370,9 +452,24 @@ def _pytree(args): ) _section( 'orphans', orphans, - '`ppid==1`, reparented to init (leaked / parent gone)', + '`ppid==1`, NOT in a `system.slice`/`user.slice` cgroup ' + '(likely leaked / parent gone)', bucket='orphans', ) + _section( + 'system-slice', system_slice, + '`ppid==1`, rooted under `/system.slice/` ' + '(real systemd-managed service — daemon, login ' + 'session manager, etc; not a leak)', + bucket='system-slice', + ) + _section( + 'user-slice', user_slice, + '`ppid==1`, rooted under `/user.slice/.../*.scope` ' + '(desktop-launched app wrapped by systemd-user — ' + 'browser, editor, etc; not a leak)', + bucket='user-slice', + ) _section('live', live, bucket='live') if gone: @@ -633,25 +730,12 @@ def _tractor_reap(args): ns.uds_only ) - # repo-root resolution: `git rev-parse --show-toplevel` - # first, falling back to the xontrib file's parent of - # parent. mirrors `scripts/tractor-reap._repo_root()`. - try: - repo_str: str = sp.check_output( - ['git', 'rev-parse', '--show-toplevel'], - stderr=sp.DEVNULL, - text=True, - ).strip() - repo: Path = Path(repo_str) - except (sp.CalledProcessError, FileNotFoundError): - repo: Path = Path(__file__).resolve().parent.parent - - # lazy-import the reap helpers since the package may not - # have been on `sys.path` at xontrib-load time (e.g. the - # contrib was sourced before activating the venv). - import sys - if str(repo) not in sys.path: - sys.path.insert(0, str(repo)) + # `tractor` is assumed to be importable in the xonsh env + # this xontrib was sourced into (a venv with the package + # installed). The standalone `scripts/tractor-reap` does + # `git rev-parse --show-toplevel` + `sys.path.insert` for + # cold-shell usability — that overhead is unnecessary + # here since we're already inside the project's venv. from tractor._testing._reap import ( find_descendants, find_orphans, @@ -670,8 +754,12 @@ def _tractor_reap(args): pids: list = find_descendants(ns.parent) mode: str = f'descendants of PPid={ns.parent}' else: - pids = find_orphans(repo) - mode = f'orphans (PPid=1, cwd={repo})' + pids = find_orphans() + mode = ( + 'orphans (PPid==1, intrinsic ' + 'cmdline/comm match — `tractor[…]` or ' + '`tractor._child`)' + ) if not pids: print(f'[acli.reap] no {mode} to reap') @@ -730,7 +818,7 @@ def _tractor_reap(args): # `acli.` and the full set is suggested. no parent # `acli` cmd exists — the dot is purely a naming convention. _TCLI_ALIASES: dict = { - 'acli.pytree': _pytree, + 'acli.ptree': _ptree, 'acli.hung_dump': _hung_dump, 'acli.bindspace_scan': _bindspace_scan, 'acli.reap': _tractor_reap,