Add `_is_tractor_subactor()`, cgroup-aware `ptree`

Rework reap/diag tooling to identify tractor sub-actors via
intrinsic proc signals — cmdline/comm markers from `setproctitle` —
instead of env-var or cwd matching.

Deats,
- new `_is_tractor_subactor()` checks cmdline for `tractor[` /
  `tractor._child` markers, falls back to `/proc/<pid>/comm` for
  zombie-resilient detection (kernel preserves `comm` past exit
  until reap)
- `_read_comm()` reads kernel per-task name set by `setproctitle()`
  — the zombie-safe ID signal
- `_read_status_state()` reads single-letter proc state from
  `/proc/<pid>/status` (`Z` = zombie)
- `find_orphans()` drops `repo_root` requirement, uses
  `_is_tractor_subactor()` for intrinsic sub-actor ID instead of
  cwd coincidence-matching
- new `find_zombies()` with optional `parent_pid` filter for
  zombie-state sub-actors

Also,
- rename `pytree` -> `ptree` throughout xontrib
- add `_which_cgroup_slice()` — reads `/proc/<pid>/cgroup` to
  distinguish `system.slice` services vs `user.slice` desktop apps
  from genuinely leaked orphans
- `_ptree` classifies `ppid==1` procs into `system-slice`,
  `user-slice`, and `orphans` buckets with per-section output
- `_tractor_reap` drops `git rev-parse` / `sys.path` hack — assumes
  tractor importable from active venv

(this patch was generated in some part by [`claude-code`][claude-code-gh])
[claude-code-gh]: https://github.com/anthropics/claude-code
subint_forkserver_backend
Gud Boi 2026-05-08 00:51:05 -04:00
parent d60245777e
commit 522b57570b
2 changed files with 276 additions and 53 deletions

View File

@ -188,6 +188,86 @@ def _read_cmdline(pid: int) -> str:
return '' return ''
def _read_comm(pid: int) -> str:
'''
Read `/proc/<pid>/comm` the kernel's per-task name
(truncated to ~15 bytes on Linux). Set by
`setproctitle.setproctitle()` so this is one of the
most reliable identifiers for tractor sub-actors
notably, **survives zombie state** (kernel preserves
`comm` even after exit, until reaped) where
`cmdline`/`environ` may not.
'''
try:
with open(f'/proc/{pid}/comm') as f:
return f.read().rstrip('\n')
except (
FileNotFoundError,
PermissionError,
ProcessLookupError,
):
return ''
# Intrinsic markers that identify a tractor sub-actor
# regardless of cwd / venv path / launch context. Used by
# `_is_tractor_subactor()` below.
#
# - cmdline `tractor[`: matches the `setproctitle`-set form
# (`tractor[<aid.reprol()>]`) — set in
# `_actor_child_main` for ALL backends, mutates argv via
# libc so visible in `/proc/<pid>/cmdline`.
# - cmdline `tractor._child`: matches the legacy
# `python -m tractor._child --uid (...)` form. Catches
# procs that died before `_actor_child_main` got to call
# `setproctitle()` — argv from exec is still kernel-
# visible at that point.
# - comm `tractor[`: same proctitle-set form, but visible
# via `/proc/<pid>/comm` (kernel-truncated to ~15 bytes,
# `tractor[doggy:`). Critical for ZOMBIES — kernel
# preserves `comm` past task-exit until parent reaps,
# while `cmdline` for zombies often reads as empty.
_TRACTOR_PROC_CMDLINE_MARKERS: tuple[str, ...] = (
'tractor._child',
'tractor[',
)
_TRACTOR_PROC_COMM_MARKER: str = 'tractor['
def _is_tractor_subactor(pid: int) -> bool:
'''
Detect whether `pid` is a tractor sub-actor process
using **intrinsic** signals cmdline comm in
priority order.
No filesystem-state coupling (cwd / venv path) and no
env-var dependency: `setproctitle`-mutated argv (set
in `_actor_child_main`) covers all live + most-zombie
cases; legacy `python -m tractor._child` cmdline
catches anything that died before `setproctitle` ran;
kernel `comm` covers zombies that survived past
`_actor_child_main` long enough to setproctitle.
'''
# 1. cmdline match — catches both `setproctitle`-set
# `tractor[<aid>]` (live) AND legacy `python -m
# tractor._child` (any) form.
cmdline: str = _read_cmdline(pid)
if any(m in cmdline for m in _TRACTOR_PROC_CMDLINE_MARKERS):
return True
# 2. Zombie-resilient fallback: kernel-preserved `comm`
# (set by setproctitle). Critical for zombies whose
# `cmdline` reads as empty post-exit but whose
# `comm` survives to `wait()` time.
comm: str = _read_comm(pid)
if _TRACTOR_PROC_COMM_MARKER in comm:
return True
return False
def _iter_live_pids() -> list[int]: def _iter_live_pids() -> list[int]:
''' '''
Enumerate currently-alive pids from `/proc`. Returns Enumerate currently-alive pids from `/proc`. Returns
@ -291,33 +371,88 @@ def find_runaway_subactors(
return runaways return runaways
def _read_status_state(pid: int) -> str | None:
'''
Return the single-letter task state from
`/proc/<pid>/status` (`R`/`S`/`D`/`Z`/`T`/`X`/`I`) or
`None` if unreadable. `Z` = zombie.
'''
try:
with open(f'/proc/{pid}/status') as f:
for line in f:
if line.startswith('State:'):
# `State:\tZ (zombie)` -> 'Z'
parts = line.split()
if len(parts) >= 2:
return parts[1]
except (
FileNotFoundError,
PermissionError,
ProcessLookupError,
):
return None
return None
def find_orphans( def find_orphans(
repo_root: pathlib.Path, repo_root: pathlib.Path|None = None,
) -> list[int]: ) -> list[int]:
''' '''
PIDs that are: PIDs that are reparented to init (`PPid == 1`) AND
are tractor sub-actors per `_is_tractor_subactor()`'s
intrinsic checks (env-var cmdline comm).
- reparented to init (`PPid == 1`), The `repo_root` arg is kept for back-compat with
- have `cwd == <repo_root>`, callers that previously passed it (the old impl used
- and have a `python` in their cmdline. it to filter by cwd) but is no longer required
tractor sub-actor identity is intrinsic to the proc,
This is the "pytest-died-mid-session" case where the not its launch context.
subactor forks got reparented. The cwd filter is the
critical bit that keeps us from sweeping up unrelated
init-children on the box.
''' '''
repo: str = str(repo_root) # `repo_root` kept in signature for back-compat; today
# the intrinsic env/cmdline/comm signals identify a
# tractor sub-actor without coincidence-of-cwd
# matching. Suppressed-arg stays a no-op so existing
# callers don't have to change.
_ = repo_root # noqa
hits: list[int] = [] hits: list[int] = []
for pid in _iter_live_pids(): for pid in _iter_live_pids():
if _read_status_ppid(pid) != 1: if _read_status_ppid(pid) != 1:
continue continue
cwd: str | None = _read_cwd(pid) if _is_tractor_subactor(pid):
if cwd != repo: hits.append(pid)
return hits
def find_zombies(
parent_pid: int|None = None,
) -> list[int]:
'''
PIDs in zombie state (`/proc/<pid>/status: State: Z`)
that are tractor sub-actors per
`_is_tractor_subactor()`.
When `parent_pid` is given, restricts to descendants
of that pid (typical for pytest session-end fixture
use). When `None`, scans all zombies on the box.
Detection for zombies relies primarily on
`/proc/<pid>/comm` (kernel-preserved past zombie
state, set by `setproctitle`) since
`cmdline`/`environ` are usually empty post-exit.
'''
hits: list[int] = []
for pid in _iter_live_pids():
if _read_status_state(pid) != 'Z':
continue continue
cmd: str = _read_cmdline(pid) if (
if 'python' not in cmd: parent_pid is not None
and _read_status_ppid(pid) != parent_pid
):
continue continue
if _is_tractor_subactor(pid):
hits.append(pid) hits.append(pid)
return hits return hits

View File

@ -6,7 +6,7 @@ prefix-completion treats them as a sub-cmd group — type
`acli.<TAB>` to see the full set. `acli.<TAB>` to see the full set.
Provides: Provides:
- `acli.pytree <pid|pgrep-pat>` psutil-backed proc tree, - `acli.ptree <pid|pgrep-pat>` psutil-backed proc tree,
live + zombies split. live + zombies split.
- `acli.hung_dump <pid|pat> [...]` kernel `wchan`/`stack` + - `acli.hung_dump <pid|pat> [...]` kernel `wchan`/`stack` +
`py-spy dump` (incl `--locals`) `py-spy dump` (incl `--locals`)
@ -28,7 +28,7 @@ Or source directly:
Pipe-to-paste idiom (xonsh): Pipe-to-paste idiom (xonsh):
hung-dump pytest |t /tmp/hung.log hung-dump pytest |t /tmp/hung.log
Requires `psutil` for full functionality (`pytree` and the Requires `psutil` for full functionality (`ptree` and the
`hung-dump` tree-walk). Falls back to `pgrep -P` recursion `hung-dump` tree-walk). Falls back to `pgrep -P` recursion
if missing. if missing.
""" """
@ -44,7 +44,7 @@ except ImportError:
psutil = None psutil = None
print( print(
'[tractor-diag] `psutil` missing — ' '[tractor-diag] `psutil` missing — '
'acli.pytree disabled, acli.hung_dump uses pgrep fallback. ' 'acli.ptree disabled, acli.hung_dump uses pgrep fallback. '
'`uv pip install psutil` for full functionality.' '`uv pip install psutil` for full functionality.'
) )
@ -84,7 +84,7 @@ def _walk_tree_with_depth(pid: int):
''' '''
Yield `(proc, depth)` pairs walking `pid`'s tree. `depth==0` Yield `(proc, depth)` pairs walking `pid`'s tree. `depth==0`
is the root; `depth==1` are direct children, etc. Used by is the root; `depth==1` are direct children, etc. Used by
`pytree` to render parent/child relationships visually. `ptree` to render parent/child relationships visually.
''' '''
try: try:
root = psutil.Process(pid) root = psutil.Process(pid)
@ -107,6 +107,56 @@ def _walk_tree_with_depth(pid: int):
stack.append((k, d + 1)) stack.append((k, d + 1))
def _which_cgroup_slice(pid: int) -> str|None:
'''
Return which top-level systemd cgroup slice `pid` is
rooted in, or `None` if it's not in either:
- `'system'`: under `/system.slice/...` — typically
`.service` units (long-lived daemons explicitly
enabled via `systemctl enable`, e.g.
`auto-cpufreq.service`, `dbus.service`,
`systemd-journald.service`).
- `'user'`: under `/user.slice/user-<uid>.slice/...`
— typically `.scope` units that systemd auto-wraps
around desktop-launched apps + login-session
procs (e.g. `app-firefox-<id>.scope`,
`session-<id>.scope`).
- `None`: NOT in either slice — pid 1 is NOT
managing this proc via cgroup. Combined with
`ppid==1`, this is the genuine "leaked / parent
died" orphan signal.
Both slice categories are by-design `ppid==1` (pid 1
is actively managing them) and should NOT be flagged
as concerning orphans, but distinguishing them is
useful: `system.slice` is "real services on this
box", `user.slice` is "stuff in your login session".
Returns `None` on any read error (proc gone, perm
denied, non-Linux, etc.) — callers should treat that
as "unknown, classify as plain orphan".
'''
try:
with open(f'/proc/{pid}/cgroup') as f:
cg: str = f.read()
except (
FileNotFoundError,
PermissionError,
ProcessLookupError,
OSError,
):
return None
if '/system.slice/' in cg:
return 'system'
if '/user.slice/' in cg:
return 'user'
return None
def _walk_tree_pgrep(pid: int) -> list: def _walk_tree_pgrep(pid: int) -> list:
'''psutil-less fallback — recursive `pgrep -P`.''' '''psutil-less fallback — recursive `pgrep -P`.'''
out = [pid] out = [pid]
@ -157,15 +207,15 @@ def _ensure_sudo_cached() -> bool:
return rc == 0 return rc == 0
# --- pytree --------------------------------------------------- # --- ptree ---------------------------------------------------
def _pytree(args): def _ptree(args):
''' '''
psutil-backed proc tree; per-proc classification into psutil-backed proc tree; per-proc classification into
severity-ordered buckets so leaked / defunct procs severity-ordered buckets so leaked / defunct procs
don't hide in the noise of normal `live` rows. don't hide in the noise of normal `live` rows.
usage: acli.pytree [--tree|-t] <pid|pgrep-pattern> [...] usage: acli.ptree [--tree|-t] <pid|pgrep-pattern> [...]
classification (per-proc, not per-tree): classification (per-proc, not per-tree):
@ -207,10 +257,10 @@ def _pytree(args):
pos_args.append(a) pos_args.append(a)
if not pos_args: if not pos_args:
print('usage: acli.pytree [--tree|-t] <pid|pgrep-pattern> [...]') print('usage: acli.ptree [--tree|-t] <pid|pgrep-pattern> [...]')
return 1 return 1
if psutil is None: if psutil is None:
print('pytree requires psutil; install via `uv pip install psutil`') print('ptree requires psutil; install via `uv pip install psutil`')
return 1 return 1
roots: list = [] roots: list = []
@ -233,6 +283,15 @@ def _pytree(args):
walk_order: list = [] # [(proc, depth)] preserved walk order walk_order: list = [] # [(proc, depth)] preserved walk order
live: list = [] # [(proc, depth)] live: list = [] # [(proc, depth)]
orphans: list = [] orphans: list = []
# `ppid==1` AND rooted in `/system.slice/` cgroup —
# real systemd-managed services (e.g. `auto-cpufreq`,
# `NetworkManager`).
system_slice: list = []
# `ppid==1` AND rooted in `/user.slice/.../*.scope` —
# desktop-launched apps wrapped by systemd-user in
# transient `.scope` units (e.g. Firefox, browsers,
# editors started from a launcher).
user_slice: list = []
zombies: list = [] zombies: list = []
gone: list = [] gone: list = []
@ -252,11 +311,28 @@ def _pytree(args):
gone.append(p.pid) gone.append(p.pid)
continue continue
entry = (p, depth) entry = (p, depth)
# severity order: zombie > orphan > live. # severity order:
# zombie > orphan > system-slice > user-slice > live
# `ppid==1` splits into:
# - `system-slice` (rooted in `/system.slice/` —
# real services, by-design `ppid==1`)
# - `user-slice` (rooted in
# `/user.slice/.../*.scope` — desktop apps
# wrapped by systemd-user, by-design `ppid==1`)
# - `orphans` (everything else with `ppid==1` —
# genuinely concerning).
if status in defunct_statuses: if status in defunct_statuses:
zombies.append(entry) zombies.append(entry)
pid_to_bucket[p.pid] = 'zombies' pid_to_bucket[p.pid] = 'zombies'
elif ppid == 1: elif ppid == 1:
slice_kind: str|None = _which_cgroup_slice(p.pid)
if slice_kind == 'system':
system_slice.append(entry)
pid_to_bucket[p.pid] = 'system-slice'
elif slice_kind == 'user':
user_slice.append(entry)
pid_to_bucket[p.pid] = 'user-slice'
else:
orphans.append(entry) orphans.append(entry)
pid_to_bucket[p.pid] = 'orphans' pid_to_bucket[p.pid] = 'orphans'
else: else:
@ -264,8 +340,14 @@ def _pytree(args):
pid_to_bucket[p.pid] = 'live' pid_to_bucket[p.pid] = 'live'
walk_order.append(entry) walk_order.append(entry)
total: int = len(live) + len(orphans) + len(zombies) total: int = (
print(f'# pytree: {total} procs across roots {roots}') len(live)
+ len(orphans)
+ len(system_slice)
+ len(user_slice)
+ len(zombies)
)
print(f'# ptree: {total} procs across roots {roots}')
hdr = ' ' + 'PID'.rjust(7) + ' ' + 'PPID'.rjust(7) + ' ' hdr = ' ' + 'PID'.rjust(7) + ' ' + 'PPID'.rjust(7) + ' '
hdr += 'STATUS'.ljust(10) + ' CMD' hdr += 'STATUS'.ljust(10) + ' CMD'
@ -370,9 +452,24 @@ def _pytree(args):
) )
_section( _section(
'orphans', orphans, 'orphans', orphans,
'`ppid==1`, reparented to init (leaked / parent gone)', '`ppid==1`, NOT in a `system.slice`/`user.slice` cgroup '
'(likely leaked / parent gone)',
bucket='orphans', bucket='orphans',
) )
_section(
'system-slice', system_slice,
'`ppid==1`, rooted under `/system.slice/` '
'(real systemd-managed service — daemon, login '
'session manager, etc; not a leak)',
bucket='system-slice',
)
_section(
'user-slice', user_slice,
'`ppid==1`, rooted under `/user.slice/.../*.scope` '
'(desktop-launched app wrapped by systemd-user — '
'browser, editor, etc; not a leak)',
bucket='user-slice',
)
_section('live', live, bucket='live') _section('live', live, bucket='live')
if gone: if gone:
@ -633,25 +730,12 @@ def _tractor_reap(args):
ns.uds_only ns.uds_only
) )
# repo-root resolution: `git rev-parse --show-toplevel` # `tractor` is assumed to be importable in the xonsh env
# first, falling back to the xontrib file's parent of # this xontrib was sourced into (a venv with the package
# parent. mirrors `scripts/tractor-reap._repo_root()`. # installed). The standalone `scripts/tractor-reap` does
try: # `git rev-parse --show-toplevel` + `sys.path.insert` for
repo_str: str = sp.check_output( # cold-shell usability — that overhead is unnecessary
['git', 'rev-parse', '--show-toplevel'], # here since we're already inside the project's venv.
stderr=sp.DEVNULL,
text=True,
).strip()
repo: Path = Path(repo_str)
except (sp.CalledProcessError, FileNotFoundError):
repo: Path = Path(__file__).resolve().parent.parent
# lazy-import the reap helpers since the package may not
# have been on `sys.path` at xontrib-load time (e.g. the
# contrib was sourced before activating the venv).
import sys
if str(repo) not in sys.path:
sys.path.insert(0, str(repo))
from tractor._testing._reap import ( from tractor._testing._reap import (
find_descendants, find_descendants,
find_orphans, find_orphans,
@ -670,8 +754,12 @@ def _tractor_reap(args):
pids: list = find_descendants(ns.parent) pids: list = find_descendants(ns.parent)
mode: str = f'descendants of PPid={ns.parent}' mode: str = f'descendants of PPid={ns.parent}'
else: else:
pids = find_orphans(repo) pids = find_orphans()
mode = f'orphans (PPid=1, cwd={repo})' mode = (
'orphans (PPid==1, intrinsic '
'cmdline/comm match — `tractor[…]` or '
'`tractor._child`)'
)
if not pids: if not pids:
print(f'[acli.reap] no {mode} to reap') print(f'[acli.reap] no {mode} to reap')
@ -730,7 +818,7 @@ def _tractor_reap(args):
# `acli.<TAB>` and the full set is suggested. no parent # `acli.<TAB>` and the full set is suggested. no parent
# `acli` cmd exists — the dot is purely a naming convention. # `acli` cmd exists — the dot is purely a naming convention.
_TCLI_ALIASES: dict = { _TCLI_ALIASES: dict = {
'acli.pytree': _pytree, 'acli.ptree': _ptree,
'acli.hung_dump': _hung_dump, 'acli.hung_dump': _hung_dump,
'acli.bindspace_scan': _bindspace_scan, 'acli.bindspace_scan': _bindspace_scan,
'acli.reap': _tractor_reap, 'acli.reap': _tractor_reap,