Add `_is_tractor_subactor()`, cgroup-aware `ptree`
Rework reap/diag tooling to identify tractor sub-actors via intrinsic proc signals — cmdline/comm markers from `setproctitle` — instead of env-var or cwd matching. Deats, - new `_is_tractor_subactor()` checks cmdline for `tractor[` / `tractor._child` markers, falls back to `/proc/<pid>/comm` for zombie-resilient detection (kernel preserves `comm` past exit until reap) - `_read_comm()` reads kernel per-task name set by `setproctitle()` — the zombie-safe ID signal - `_read_status_state()` reads single-letter proc state from `/proc/<pid>/status` (`Z` = zombie) - `find_orphans()` drops `repo_root` requirement, uses `_is_tractor_subactor()` for intrinsic sub-actor ID instead of cwd coincidence-matching - new `find_zombies()` with optional `parent_pid` filter for zombie-state sub-actors Also, - rename `pytree` -> `ptree` throughout xontrib - add `_which_cgroup_slice()` — reads `/proc/<pid>/cgroup` to distinguish `system.slice` services vs `user.slice` desktop apps from genuinely leaked orphans - `_ptree` classifies `ppid==1` procs into `system-slice`, `user-slice`, and `orphans` buckets with per-section output - `_tractor_reap` drops `git rev-parse` / `sys.path` hack — assumes tractor importable from active venv (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-codesubint_forkserver_backend
parent
d60245777e
commit
522b57570b
|
|
@ -188,6 +188,86 @@ def _read_cmdline(pid: int) -> str:
|
|||
return ''
|
||||
|
||||
|
||||
def _read_comm(pid: int) -> str:
|
||||
'''
|
||||
Read `/proc/<pid>/comm` — the kernel's per-task name
|
||||
(truncated to ~15 bytes on Linux). Set by
|
||||
`setproctitle.setproctitle()` so this is one of the
|
||||
most reliable identifiers for tractor sub-actors —
|
||||
notably, **survives zombie state** (kernel preserves
|
||||
`comm` even after exit, until reaped) where
|
||||
`cmdline`/`environ` may not.
|
||||
|
||||
'''
|
||||
try:
|
||||
with open(f'/proc/{pid}/comm') as f:
|
||||
return f.read().rstrip('\n')
|
||||
except (
|
||||
FileNotFoundError,
|
||||
PermissionError,
|
||||
ProcessLookupError,
|
||||
):
|
||||
return ''
|
||||
|
||||
|
||||
# Intrinsic markers that identify a tractor sub-actor
|
||||
# regardless of cwd / venv path / launch context. Used by
|
||||
# `_is_tractor_subactor()` below.
|
||||
#
|
||||
# - cmdline `tractor[`: matches the `setproctitle`-set form
|
||||
# (`tractor[<aid.reprol()>]`) — set in
|
||||
# `_actor_child_main` for ALL backends, mutates argv via
|
||||
# libc so visible in `/proc/<pid>/cmdline`.
|
||||
# - cmdline `tractor._child`: matches the legacy
|
||||
# `python -m tractor._child --uid (...)` form. Catches
|
||||
# procs that died before `_actor_child_main` got to call
|
||||
# `setproctitle()` — argv from exec is still kernel-
|
||||
# visible at that point.
|
||||
# - comm `tractor[`: same proctitle-set form, but visible
|
||||
# via `/proc/<pid>/comm` (kernel-truncated to ~15 bytes,
|
||||
# `tractor[doggy:`). Critical for ZOMBIES — kernel
|
||||
# preserves `comm` past task-exit until parent reaps,
|
||||
# while `cmdline` for zombies often reads as empty.
|
||||
_TRACTOR_PROC_CMDLINE_MARKERS: tuple[str, ...] = (
|
||||
'tractor._child',
|
||||
'tractor[',
|
||||
)
|
||||
_TRACTOR_PROC_COMM_MARKER: str = 'tractor['
|
||||
|
||||
|
||||
def _is_tractor_subactor(pid: int) -> bool:
|
||||
'''
|
||||
Detect whether `pid` is a tractor sub-actor process
|
||||
using **intrinsic** signals — cmdline → comm — in
|
||||
priority order.
|
||||
|
||||
No filesystem-state coupling (cwd / venv path) and no
|
||||
env-var dependency: `setproctitle`-mutated argv (set
|
||||
in `_actor_child_main`) covers all live + most-zombie
|
||||
cases; legacy `python -m tractor._child` cmdline
|
||||
catches anything that died before `setproctitle` ran;
|
||||
kernel `comm` covers zombies that survived past
|
||||
`_actor_child_main` long enough to setproctitle.
|
||||
|
||||
'''
|
||||
# 1. cmdline match — catches both `setproctitle`-set
|
||||
# `tractor[<aid>]` (live) AND legacy `python -m
|
||||
# tractor._child` (any) form.
|
||||
cmdline: str = _read_cmdline(pid)
|
||||
if any(m in cmdline for m in _TRACTOR_PROC_CMDLINE_MARKERS):
|
||||
return True
|
||||
|
||||
# 2. Zombie-resilient fallback: kernel-preserved `comm`
|
||||
# (set by setproctitle). Critical for zombies whose
|
||||
# `cmdline` reads as empty post-exit but whose
|
||||
# `comm` survives to `wait()` time.
|
||||
comm: str = _read_comm(pid)
|
||||
if _TRACTOR_PROC_COMM_MARKER in comm:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def _iter_live_pids() -> list[int]:
|
||||
'''
|
||||
Enumerate currently-alive pids from `/proc`. Returns
|
||||
|
|
@ -291,33 +371,88 @@ def find_runaway_subactors(
|
|||
return runaways
|
||||
|
||||
|
||||
def _read_status_state(pid: int) -> str | None:
|
||||
'''
|
||||
Return the single-letter task state from
|
||||
`/proc/<pid>/status` (`R`/`S`/`D`/`Z`/`T`/`X`/`I`) or
|
||||
`None` if unreadable. `Z` = zombie.
|
||||
|
||||
'''
|
||||
try:
|
||||
with open(f'/proc/{pid}/status') as f:
|
||||
for line in f:
|
||||
if line.startswith('State:'):
|
||||
# `State:\tZ (zombie)` -> 'Z'
|
||||
parts = line.split()
|
||||
if len(parts) >= 2:
|
||||
return parts[1]
|
||||
except (
|
||||
FileNotFoundError,
|
||||
PermissionError,
|
||||
ProcessLookupError,
|
||||
):
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
def find_orphans(
|
||||
repo_root: pathlib.Path,
|
||||
repo_root: pathlib.Path|None = None,
|
||||
) -> list[int]:
|
||||
'''
|
||||
PIDs that are:
|
||||
PIDs that are reparented to init (`PPid == 1`) AND
|
||||
are tractor sub-actors per `_is_tractor_subactor()`'s
|
||||
intrinsic checks (env-var → cmdline → comm).
|
||||
|
||||
- reparented to init (`PPid == 1`),
|
||||
- have `cwd == <repo_root>`,
|
||||
- and have a `python` in their cmdline.
|
||||
|
||||
This is the "pytest-died-mid-session" case where the
|
||||
subactor forks got reparented. The cwd filter is the
|
||||
critical bit that keeps us from sweeping up unrelated
|
||||
init-children on the box.
|
||||
The `repo_root` arg is kept for back-compat with
|
||||
callers that previously passed it (the old impl used
|
||||
it to filter by cwd) but is no longer required —
|
||||
tractor sub-actor identity is intrinsic to the proc,
|
||||
not its launch context.
|
||||
|
||||
'''
|
||||
repo: str = str(repo_root)
|
||||
# `repo_root` kept in signature for back-compat; today
|
||||
# the intrinsic env/cmdline/comm signals identify a
|
||||
# tractor sub-actor without coincidence-of-cwd
|
||||
# matching. Suppressed-arg stays a no-op so existing
|
||||
# callers don't have to change.
|
||||
_ = repo_root # noqa
|
||||
hits: list[int] = []
|
||||
for pid in _iter_live_pids():
|
||||
if _read_status_ppid(pid) != 1:
|
||||
continue
|
||||
cwd: str | None = _read_cwd(pid)
|
||||
if cwd != repo:
|
||||
if _is_tractor_subactor(pid):
|
||||
hits.append(pid)
|
||||
return hits
|
||||
|
||||
|
||||
def find_zombies(
|
||||
parent_pid: int|None = None,
|
||||
) -> list[int]:
|
||||
'''
|
||||
PIDs in zombie state (`/proc/<pid>/status: State: Z`)
|
||||
that are tractor sub-actors per
|
||||
`_is_tractor_subactor()`.
|
||||
|
||||
When `parent_pid` is given, restricts to descendants
|
||||
of that pid (typical for pytest session-end fixture
|
||||
use). When `None`, scans all zombies on the box.
|
||||
|
||||
Detection for zombies relies primarily on
|
||||
`/proc/<pid>/comm` (kernel-preserved past zombie
|
||||
state, set by `setproctitle`) since
|
||||
`cmdline`/`environ` are usually empty post-exit.
|
||||
|
||||
'''
|
||||
hits: list[int] = []
|
||||
for pid in _iter_live_pids():
|
||||
if _read_status_state(pid) != 'Z':
|
||||
continue
|
||||
cmd: str = _read_cmdline(pid)
|
||||
if 'python' not in cmd:
|
||||
if (
|
||||
parent_pid is not None
|
||||
and _read_status_ppid(pid) != parent_pid
|
||||
):
|
||||
continue
|
||||
if _is_tractor_subactor(pid):
|
||||
hits.append(pid)
|
||||
return hits
|
||||
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ prefix-completion treats them as a sub-cmd group — type
|
|||
`acli.<TAB>` to see the full set.
|
||||
|
||||
Provides:
|
||||
- `acli.pytree <pid|pgrep-pat>` psutil-backed proc tree,
|
||||
- `acli.ptree <pid|pgrep-pat>` psutil-backed proc tree,
|
||||
live + zombies split.
|
||||
- `acli.hung_dump <pid|pat> [...]` kernel `wchan`/`stack` +
|
||||
`py-spy dump` (incl `--locals`)
|
||||
|
|
@ -28,7 +28,7 @@ Or source directly:
|
|||
Pipe-to-paste idiom (xonsh):
|
||||
hung-dump pytest |t /tmp/hung.log
|
||||
|
||||
Requires `psutil` for full functionality (`pytree` and the
|
||||
Requires `psutil` for full functionality (`ptree` and the
|
||||
`hung-dump` tree-walk). Falls back to `pgrep -P` recursion
|
||||
if missing.
|
||||
"""
|
||||
|
|
@ -44,7 +44,7 @@ except ImportError:
|
|||
psutil = None
|
||||
print(
|
||||
'[tractor-diag] `psutil` missing — '
|
||||
'acli.pytree disabled, acli.hung_dump uses pgrep fallback. '
|
||||
'acli.ptree disabled, acli.hung_dump uses pgrep fallback. '
|
||||
'`uv pip install psutil` for full functionality.'
|
||||
)
|
||||
|
||||
|
|
@ -84,7 +84,7 @@ def _walk_tree_with_depth(pid: int):
|
|||
'''
|
||||
Yield `(proc, depth)` pairs walking `pid`'s tree. `depth==0`
|
||||
is the root; `depth==1` are direct children, etc. Used by
|
||||
`pytree` to render parent/child relationships visually.
|
||||
`ptree` to render parent/child relationships visually.
|
||||
'''
|
||||
try:
|
||||
root = psutil.Process(pid)
|
||||
|
|
@ -107,6 +107,56 @@ def _walk_tree_with_depth(pid: int):
|
|||
stack.append((k, d + 1))
|
||||
|
||||
|
||||
def _which_cgroup_slice(pid: int) -> str|None:
|
||||
'''
|
||||
Return which top-level systemd cgroup slice `pid` is
|
||||
rooted in, or `None` if it's not in either:
|
||||
|
||||
- `'system'`: under `/system.slice/...` — typically
|
||||
`.service` units (long-lived daemons explicitly
|
||||
enabled via `systemctl enable`, e.g.
|
||||
`auto-cpufreq.service`, `dbus.service`,
|
||||
`systemd-journald.service`).
|
||||
|
||||
- `'user'`: under `/user.slice/user-<uid>.slice/...`
|
||||
— typically `.scope` units that systemd auto-wraps
|
||||
around desktop-launched apps + login-session
|
||||
procs (e.g. `app-firefox-<id>.scope`,
|
||||
`session-<id>.scope`).
|
||||
|
||||
- `None`: NOT in either slice — pid 1 is NOT
|
||||
managing this proc via cgroup. Combined with
|
||||
`ppid==1`, this is the genuine "leaked / parent
|
||||
died" orphan signal.
|
||||
|
||||
Both slice categories are by-design `ppid==1` (pid 1
|
||||
is actively managing them) and should NOT be flagged
|
||||
as concerning orphans, but distinguishing them is
|
||||
useful: `system.slice` is "real services on this
|
||||
box", `user.slice` is "stuff in your login session".
|
||||
|
||||
Returns `None` on any read error (proc gone, perm
|
||||
denied, non-Linux, etc.) — callers should treat that
|
||||
as "unknown, classify as plain orphan".
|
||||
|
||||
'''
|
||||
try:
|
||||
with open(f'/proc/{pid}/cgroup') as f:
|
||||
cg: str = f.read()
|
||||
except (
|
||||
FileNotFoundError,
|
||||
PermissionError,
|
||||
ProcessLookupError,
|
||||
OSError,
|
||||
):
|
||||
return None
|
||||
if '/system.slice/' in cg:
|
||||
return 'system'
|
||||
if '/user.slice/' in cg:
|
||||
return 'user'
|
||||
return None
|
||||
|
||||
|
||||
def _walk_tree_pgrep(pid: int) -> list:
|
||||
'''psutil-less fallback — recursive `pgrep -P`.'''
|
||||
out = [pid]
|
||||
|
|
@ -157,15 +207,15 @@ def _ensure_sudo_cached() -> bool:
|
|||
return rc == 0
|
||||
|
||||
|
||||
# --- pytree ---------------------------------------------------
|
||||
# --- ptree ---------------------------------------------------
|
||||
|
||||
def _pytree(args):
|
||||
def _ptree(args):
|
||||
'''
|
||||
psutil-backed proc tree; per-proc classification into
|
||||
severity-ordered buckets so leaked / defunct procs
|
||||
don't hide in the noise of normal `live` rows.
|
||||
|
||||
usage: acli.pytree [--tree|-t] <pid|pgrep-pattern> [...]
|
||||
usage: acli.ptree [--tree|-t] <pid|pgrep-pattern> [...]
|
||||
|
||||
classification (per-proc, not per-tree):
|
||||
|
||||
|
|
@ -207,10 +257,10 @@ def _pytree(args):
|
|||
pos_args.append(a)
|
||||
|
||||
if not pos_args:
|
||||
print('usage: acli.pytree [--tree|-t] <pid|pgrep-pattern> [...]')
|
||||
print('usage: acli.ptree [--tree|-t] <pid|pgrep-pattern> [...]')
|
||||
return 1
|
||||
if psutil is None:
|
||||
print('pytree requires psutil; install via `uv pip install psutil`')
|
||||
print('ptree requires psutil; install via `uv pip install psutil`')
|
||||
return 1
|
||||
|
||||
roots: list = []
|
||||
|
|
@ -233,6 +283,15 @@ def _pytree(args):
|
|||
walk_order: list = [] # [(proc, depth)] preserved walk order
|
||||
live: list = [] # [(proc, depth)]
|
||||
orphans: list = []
|
||||
# `ppid==1` AND rooted in `/system.slice/` cgroup —
|
||||
# real systemd-managed services (e.g. `auto-cpufreq`,
|
||||
# `NetworkManager`).
|
||||
system_slice: list = []
|
||||
# `ppid==1` AND rooted in `/user.slice/.../*.scope` —
|
||||
# desktop-launched apps wrapped by systemd-user in
|
||||
# transient `.scope` units (e.g. Firefox, browsers,
|
||||
# editors started from a launcher).
|
||||
user_slice: list = []
|
||||
zombies: list = []
|
||||
gone: list = []
|
||||
|
||||
|
|
@ -252,11 +311,28 @@ def _pytree(args):
|
|||
gone.append(p.pid)
|
||||
continue
|
||||
entry = (p, depth)
|
||||
# severity order: zombie > orphan > live.
|
||||
# severity order:
|
||||
# zombie > orphan > system-slice > user-slice > live
|
||||
# `ppid==1` splits into:
|
||||
# - `system-slice` (rooted in `/system.slice/` —
|
||||
# real services, by-design `ppid==1`)
|
||||
# - `user-slice` (rooted in
|
||||
# `/user.slice/.../*.scope` — desktop apps
|
||||
# wrapped by systemd-user, by-design `ppid==1`)
|
||||
# - `orphans` (everything else with `ppid==1` —
|
||||
# genuinely concerning).
|
||||
if status in defunct_statuses:
|
||||
zombies.append(entry)
|
||||
pid_to_bucket[p.pid] = 'zombies'
|
||||
elif ppid == 1:
|
||||
slice_kind: str|None = _which_cgroup_slice(p.pid)
|
||||
if slice_kind == 'system':
|
||||
system_slice.append(entry)
|
||||
pid_to_bucket[p.pid] = 'system-slice'
|
||||
elif slice_kind == 'user':
|
||||
user_slice.append(entry)
|
||||
pid_to_bucket[p.pid] = 'user-slice'
|
||||
else:
|
||||
orphans.append(entry)
|
||||
pid_to_bucket[p.pid] = 'orphans'
|
||||
else:
|
||||
|
|
@ -264,8 +340,14 @@ def _pytree(args):
|
|||
pid_to_bucket[p.pid] = 'live'
|
||||
walk_order.append(entry)
|
||||
|
||||
total: int = len(live) + len(orphans) + len(zombies)
|
||||
print(f'# pytree: {total} procs across roots {roots}')
|
||||
total: int = (
|
||||
len(live)
|
||||
+ len(orphans)
|
||||
+ len(system_slice)
|
||||
+ len(user_slice)
|
||||
+ len(zombies)
|
||||
)
|
||||
print(f'# ptree: {total} procs across roots {roots}')
|
||||
|
||||
hdr = ' ' + 'PID'.rjust(7) + ' ' + 'PPID'.rjust(7) + ' '
|
||||
hdr += 'STATUS'.ljust(10) + ' CMD'
|
||||
|
|
@ -370,9 +452,24 @@ def _pytree(args):
|
|||
)
|
||||
_section(
|
||||
'orphans', orphans,
|
||||
'`ppid==1`, reparented to init (leaked / parent gone)',
|
||||
'`ppid==1`, NOT in a `system.slice`/`user.slice` cgroup '
|
||||
'(likely leaked / parent gone)',
|
||||
bucket='orphans',
|
||||
)
|
||||
_section(
|
||||
'system-slice', system_slice,
|
||||
'`ppid==1`, rooted under `/system.slice/` '
|
||||
'(real systemd-managed service — daemon, login '
|
||||
'session manager, etc; not a leak)',
|
||||
bucket='system-slice',
|
||||
)
|
||||
_section(
|
||||
'user-slice', user_slice,
|
||||
'`ppid==1`, rooted under `/user.slice/.../*.scope` '
|
||||
'(desktop-launched app wrapped by systemd-user — '
|
||||
'browser, editor, etc; not a leak)',
|
||||
bucket='user-slice',
|
||||
)
|
||||
_section('live', live, bucket='live')
|
||||
|
||||
if gone:
|
||||
|
|
@ -633,25 +730,12 @@ def _tractor_reap(args):
|
|||
ns.uds_only
|
||||
)
|
||||
|
||||
# repo-root resolution: `git rev-parse --show-toplevel`
|
||||
# first, falling back to the xontrib file's parent of
|
||||
# parent. mirrors `scripts/tractor-reap._repo_root()`.
|
||||
try:
|
||||
repo_str: str = sp.check_output(
|
||||
['git', 'rev-parse', '--show-toplevel'],
|
||||
stderr=sp.DEVNULL,
|
||||
text=True,
|
||||
).strip()
|
||||
repo: Path = Path(repo_str)
|
||||
except (sp.CalledProcessError, FileNotFoundError):
|
||||
repo: Path = Path(__file__).resolve().parent.parent
|
||||
|
||||
# lazy-import the reap helpers since the package may not
|
||||
# have been on `sys.path` at xontrib-load time (e.g. the
|
||||
# contrib was sourced before activating the venv).
|
||||
import sys
|
||||
if str(repo) not in sys.path:
|
||||
sys.path.insert(0, str(repo))
|
||||
# `tractor` is assumed to be importable in the xonsh env
|
||||
# this xontrib was sourced into (a venv with the package
|
||||
# installed). The standalone `scripts/tractor-reap` does
|
||||
# `git rev-parse --show-toplevel` + `sys.path.insert` for
|
||||
# cold-shell usability — that overhead is unnecessary
|
||||
# here since we're already inside the project's venv.
|
||||
from tractor._testing._reap import (
|
||||
find_descendants,
|
||||
find_orphans,
|
||||
|
|
@ -670,8 +754,12 @@ def _tractor_reap(args):
|
|||
pids: list = find_descendants(ns.parent)
|
||||
mode: str = f'descendants of PPid={ns.parent}'
|
||||
else:
|
||||
pids = find_orphans(repo)
|
||||
mode = f'orphans (PPid=1, cwd={repo})'
|
||||
pids = find_orphans()
|
||||
mode = (
|
||||
'orphans (PPid==1, intrinsic '
|
||||
'cmdline/comm match — `tractor[…]` or '
|
||||
'`tractor._child`)'
|
||||
)
|
||||
|
||||
if not pids:
|
||||
print(f'[acli.reap] no {mode} to reap')
|
||||
|
|
@ -730,7 +818,7 @@ def _tractor_reap(args):
|
|||
# `acli.<TAB>` and the full set is suggested. no parent
|
||||
# `acli` cmd exists — the dot is purely a naming convention.
|
||||
_TCLI_ALIASES: dict = {
|
||||
'acli.pytree': _pytree,
|
||||
'acli.ptree': _ptree,
|
||||
'acli.hung_dump': _hung_dump,
|
||||
'acli.bindspace_scan': _bindspace_scan,
|
||||
'acli.reap': _tractor_reap,
|
||||
|
|
|
|||
Loading…
Reference in New Issue