From 6d76b60404591aca2af195520f03b18a5a55a9cb Mon Sep 17 00:00:00 2001 From: goodboy Date: Sun, 26 Apr 2026 18:04:40 -0400 Subject: [PATCH] Add `tractor-reap` CLI + document auto-reap New `scripts/tractor-reap` CLI wraps the `_testing._reap` mod for manual zombie-subactor cleanup after crashed pytest sessions. Two modes: - orphan-mode (default): finds PPid==1 procs with cwd matching repo root + `python` in cmdline. - descendant-mode (`--parent `): scoped sweep under a still-live supervisor. SC-polite: SIGINT with bounded grace window (default 3s) before escalating to SIGKILL. Exit code signals whether escalation was needed (useful for CI health-checks). Also, document both the auto-reap fixture and the CLI in `/run-tests` SKILL.md (section 10). (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code --- .claude/skills/run-tests/SKILL.md | 64 +++++++++++++++ scripts/tractor-reap | 124 ++++++++++++++++++++++++++++++ 2 files changed, 188 insertions(+) create mode 100755 scripts/tractor-reap diff --git a/.claude/skills/run-tests/SKILL.md b/.claude/skills/run-tests/SKILL.md index 1c047332..8a579f57 100644 --- a/.claude/skills/run-tests/SKILL.md +++ b/.claude/skills/run-tests/SKILL.md @@ -521,3 +521,67 @@ filling log volume. Full post-mortem in `ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md`. Lesson codified here so future-me grep-finds the workaround before digging. + +## 10. Reaping zombie subactors (`tractor-reap`) + +**Symptom:** after a `pytest` run crashes, times out, +or is `Ctrl+C`'d, subactor forks (esp. under +`subint_forkserver`) can be reparented to `init` +(PPid==1) and linger. They hold onto ports, inherit +pytest's capture-pipe fds, and flakify later +sessions. + +**Two layers of defense:** + +### a) Session-scoped auto-fixture (always on) + +`tractor/_testing/pytest.py::_reap_orphaned_subactors` +runs at pytest session teardown. 
It walks `/proc` for +direct descendants of the pytest pid, SIGINTs them, +waits up to 3s, then SIGKILLs survivors. SC-polite: +gives the subactor runtime a chance to run its trio +cancel shield + IPC teardown before escalation. + +This is *autouse* and session-scoped — you don't need +to do anything. It just runs. + +### b) `scripts/tractor-reap` CLI (manual reap) + +For the **pytest-died-mid-session** case (Ctrl+C, OOM +kill, hung process you had to `kill -9`), the fixture +never ran. Reach for the CLI: + +```sh +# default: orphans (PPid==1, cwd==repo, cmd contains python) +scripts/tractor-reap + +# descendant-mode: from a still-live supervisor +scripts/tractor-reap --parent <pid> + +# see what would be reaped, don't signal +scripts/tractor-reap -n + +# tune the SIGINT → SIGKILL grace window +scripts/tractor-reap --grace 5 +``` + +Exit code: `0` if everyone exited on SIGINT, `1` if +SIGKILL had to escalate — so you can chain it in CI +health-checks (`scripts/tractor-reap || <alert-cmd>`). + +**What it matches** (orphan-mode): +- `PPid == 1` (reparented to init → definitely + orphaned, not just a currently-running child) +- `cwd == <repo-root>` (keeps the sweep scoped; won't + touch unrelated init-children elsewhere) +- `python` in cmdline + +**What it does not do:** kill anything whose PPid is +still a live tractor parent. If the parent is alive +it's not an orphan; use `--parent <pid>` if you need +to force-reap under a still-live supervisor. + +**When NOT to run it:** while a pytest session is +active in another terminal. It's safe (won't touch +that session's live children in orphan-mode) but can +race if the target session is mid-teardown. diff --git a/scripts/tractor-reap b/scripts/tractor-reap new file mode 100755 index 00000000..09220887 --- /dev/null +++ b/scripts/tractor-reap @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +# tractor: structured concurrent "actors". +# Copyright 2018-eternity Tyler Goodlet. 
#
# SPDX-License-Identifier: AGPL-3.0-or-later
'''
`tractor-reap` — SC-polite zombie-subactor reaper.

Finds `tractor` subactor processes left alive after a
`pytest` (or any tractor-app) run that failed to fully
cancel its actor tree, then sends SIGINT with a bounded
grace window before escalating to SIGKILL.

Detection modes (auto-selected):

  --parent <pid> : descendant-mode — kill procs whose
                   PPid == <pid>. Use when a parent
                   is still alive and you want to
                   scope the sweep precisely (e.g.
                   CI wrapper calling in from outside
                   pytest).

  (default)      : orphan-mode — kill procs with
                   PPid==1 (init-reparented) whose
                   cwd matches the repo root AND
                   whose cmdline contains `python`.
                   The cwd filter is what prevents
                   sweeping unrelated init-children.

Usage:

  # after a pytest run crashed/was Ctrl+C'd
  scripts/tractor-reap

  # from inside a still-live supervisor
  scripts/tractor-reap --parent 12345

  # dry-run: list what would be reaped, don't signal
  scripts/tractor-reap -n

'''
import argparse
import pathlib
import subprocess
import sys


def _repo_root() -> pathlib.Path:
    '''
    Return the repo root used for imports and orphan scoping.

    Asks `git rev-parse --show-toplevel` first. NOTE: `git` is
    run in the caller's current working directory, so the
    answer is the toplevel of whichever checkout the command
    was launched from.

    Falls back to the grandparent dir of this script (i.e.
    `<repo>/scripts/tractor-reap` -> `<repo>`) when `git`
    exits non-zero or cannot be executed at all.

    '''
    try:
        out: str = subprocess.check_output(
            ['git', 'rev-parse', '--show-toplevel'],
            stderr=subprocess.DEVNULL,
            text=True,
        ).strip()
    except (
        subprocess.CalledProcessError,  # not inside a work tree
        OSError,  # `git` missing, not executable, etc.
    ):
        return pathlib.Path(__file__).resolve().parent.parent

    return pathlib.Path(out)


def main() -> int:
    '''
    CLI entry point: parse args, select the detection mode,
    then reap (or, with `--dry-run`, just list) matching pids.

    Returns the process exit code: `0` when nothing matched,
    on a dry-run, or when `reap()` reports no survivors; `1`
    when `reap()` reports survivors. Invalid arguments exit
    with status `2` via `argparse`.

    '''
    parser = argparse.ArgumentParser(
        prog='tractor-reap',
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        '--parent', '-p',
        type=int,
        default=None,
        metavar='<pid>',
        help='descendant-mode: reap procs with PPid==<pid>',
    )
    parser.add_argument(
        '--grace', '-g',
        type=float,
        default=3.0,
        help='SIGINT grace window in seconds (default 3.0)',
    )
    parser.add_argument(
        '--dry-run', '-n',
        action='store_true',
        help='list matched pids but do not signal',
    )
    args = parser.parse_args()

    # a negative (or NaN) window is meaningless; reject it up
    # front instead of handing it to `reap()`. Written as
    # `not (x >= 0)` so that NaN is caught as well.
    if not (args.grace >= 0):
        parser.error('--grace must be a non-negative number of seconds')

    # import lazily so `--help` doesn't require the tractor
    # package to be importable (e.g. when running from a
    # shell not inside a venv).
    repo: pathlib.Path = _repo_root()
    sys.path.insert(0, str(repo))
    from tractor._testing._reap import (
        find_descendants,
        find_orphans,
        reap,
    )

    pids: list[int]
    mode: str
    if args.parent is not None:
        pids = find_descendants(args.parent)
        mode = f'descendants of PPid={args.parent}'
    else:
        pids = find_orphans(repo)
        mode = f'orphans (PPid=1, cwd={repo})'

    if not pids:
        print(f'[tractor-reap] no {mode} to reap')
        return 0

    if args.dry_run:
        print(f'[tractor-reap] dry-run — {mode}:\n {pids}')
        return 0

    # only the survivor list drives the exit code; the
    # signalled list is unused here.
    _signalled, survivors = reap(pids, grace=args.grace)
    # exit 0 if everyone exited cleanly, else 1 to signal
    # escalation happened — makes the command useful in
    # CI health-checks and `||`-chaining.
    return 0 if not survivors else 1


if __name__ == '__main__':
    raise SystemExit(main())