From 4f12d69b41211c56454c31fc1335ac658cf77416 Mon Sep 17 00:00:00 2001 From: goodboy Date: Mon, 27 Apr 2026 11:35:33 -0400 Subject: [PATCH] Add `--shm` orphan sweep to `tractor-reap` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since `tractor.ipc._mp_bs.disable_mantracker()` turns off `mp.resource_tracker` entirely (see the conc-anal doc `subint_forkserver_mp_shared_memory_issue.md`), a hard-crashing actor can leave `/dev/shm/` segments that nothing else GCs. New `tractor-reap` phase 2 sweeps them. Deats, - `tractor/_testing/_reap.py`: add `find_orphaned_shm()` + `reap_shm()` helpers. Match criteria: regular file under `/dev/shm`, owned by current uid, AND no live proc has it open (mmap'd or fd-held). In-use enumeration via `psutil.Process.memory_maps()` + `.open_files()` — xplatform, kernel-canonical (same answer `lsof` would give), no reliance on tractor-specific shm-key naming. - `_ensure_shm_supported()` guard: helpers raise `NotImplementedError` outside Linux/FreeBSD bc macOS POSIX shm has no fs-visible path (`shm_open` only) and Windows is a different story. - `scripts/tractor-reap`: new `--shm` (run after process reap) and `--shm-only` (skip process phase) flags. `-n` dry-runs both phases. Exit code is `1` if either phase had survivors/errors. - `pyproject.toml` + `uv.lock`: add `psutil>=7.0.0` to the `testing` dep group; lazy-imported in `_reap.py` so the process-reap path stays import-clean without it. Also, - doc `--shm` in `.claude/skills/run-tests/SKILL.md` (new section 10c) — covers match criteria + the preservation guarantee for unrelated apps. - flip mitigation status in `subint_forkserver_mp_shared_memory_issue.md` from "could extend `tractor-reap`" to "implemented", with a note that callers should still UUID-pin shm keys to avoid cross-session collisions. Verified locally vs 81 in-use segments held by `piker`, `lttng-ust-*`, `aja-shm-*` — all preserved; only the genuinely-orphaned tractor segments got unlinked. (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code --- .claude/skills/run-tests/SKILL.md | 38 +++ ...ubint_forkserver_mp_shared_memory_issue.md | 22 +- pyproject.toml | 5 + scripts/tractor-reap | 110 ++++++-- tractor/_testing/_reap.py | 252 +++++++++++++++++- uv.lock | 2 + 6 files changed, 385 insertions(+), 44 deletions(-) diff --git a/.claude/skills/run-tests/SKILL.md b/.claude/skills/run-tests/SKILL.md index 8a579f57..b2014201 100644 --- a/.claude/skills/run-tests/SKILL.md +++ b/.claude/skills/run-tests/SKILL.md @@ -585,3 +585,41 @@ to force-reap under a still-live supervisor. active in another terminal. It's safe (won't touch that session's live children in orphan-mode) but can race if the target session is mid-teardown. + +### c) `--shm` / `--shm-only`: orphan-segment sweep + +Because `tractor.ipc._mp_bs.disable_mantracker()` +turns off `mp.resource_tracker` (see +`ai/conc-anal/subint_forkserver_mp_shared_memory_issue.md`), +a hard-crashing actor can leave `/dev/shm/` +segments behind that nothing else GCs. + +```sh +# process reap THEN shm sweep +scripts/tractor-reap --shm + +# shm sweep only (skip process phase) +scripts/tractor-reap --shm-only + +# dry-run: list candidates, don't unlink +scripts/tractor-reap --shm -n +``` + +**Match criteria** (very conservative — this is a +shared-system path, can't be wrong): +- segment is a regular file under `/dev/shm`, +- owned by the **current uid** (`stat.st_uid`), +- AND **no live process holds it open** — + enumerated by walking every readable + `/proc//maps` (post-mmap mappings) AND + `/proc//fd/*` (pre-mmap shm-opened fds). + +The "nobody has it open" check is the +kernel-canonical "is this leaked?" test — same +answer `lsof /dev/shm/` would give. No +reliance on tractor-specific naming, so it works +for any tractor app. Critically, it WILL NOT touch +segments held by other apps you have running +(e.g. `piker`, `lttng-ust-*`, `aja-shm-*` — +verified locally with 81 in-use segments correctly +preserved). diff --git a/ai/conc-anal/subint_forkserver_mp_shared_memory_issue.md b/ai/conc-anal/subint_forkserver_mp_shared_memory_issue.md index 030a32d0..07214dad 100644 --- a/ai/conc-anal/subint_forkserver_mp_shared_memory_issue.md +++ b/ai/conc-anal/subint_forkserver_mp_shared_memory_issue.md @@ -132,14 +132,20 @@ segment (legitimate race in shared-key setups). - **Crash-leaked segments.** If an actor segfaults or is `SIGKILL`'d before its lifetime stack runs, - `/dev/shm/` will leak. Mitigations: - - `tractor-reap` (the new - `scripts/tractor-reap` CLI) doesn't yet sweep - `/dev/shm` — could extend it. - - Higher-level apps using shm should pin a UUID - into the key (the `'shml_'` pattern in - `test_child_attaches_alot`) so leaks are - distinct per session and easy to GC out-of-band. + `/dev/shm/` will leak. Mitigation: + `scripts/tractor-reap --shm` walks `/dev/shm`, + filters to segments owned by the current uid that + no live process is mapping or holding open (via + `/proc/*/maps` + `/proc/*/fd/*`), and unlinks + them. The "nobody-has-it-open" filter is + kernel-canonical so it never touches in-flight + segments held by sibling apps (verified locally + against 81 piker/lttng/aja-held segments — all + preserved). + - Higher-level apps using shm should still pin a + UUID into the key (the `'shml_'` pattern + in `test_child_attaches_alot`) so concurrent + sessions don't collide on the same key. - **Cross-actor unlink races.** Two actors holding the same shm key racing on `unlink()` — handled by the `FileNotFoundError` swallow. diff --git a/pyproject.toml b/pyproject.toml index 2c7dcb42..5d46a6c8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,6 +84,11 @@ testing = [ # known-hanging `subint`-backend audit tests; see # `ai/conc-anal/subint_*_issue.md`). "pytest-timeout>=2.3", + # used by `tractor._testing._reap` for the + # `tractor-reap` zombie-subactor + leaked-shm + # cleanup utility (xplatform `Process.memory_maps`, + # `Process.open_files`). + "psutil>=7.0.0", ] repl = [ "pyperclip>=1.9.0", diff --git a/scripts/tractor-reap b/scripts/tractor-reap index 09220887..3640d210 100755 --- a/scripts/tractor-reap +++ b/scripts/tractor-reap @@ -4,14 +4,26 @@ # # SPDX-License-Identifier: AGPL-3.0-or-later ''' -`tractor-reap` — SC-polite zombie-subactor reaper. +`tractor-reap` — SC-polite zombie-subactor reaper + +optional `/dev/shm/` orphan-segment sweep. -Finds `tractor` subactor processes left alive after a -`pytest` (or any tractor-app) run that failed to fully -cancel its actor tree, then sends SIGINT with a bounded -grace window before escalating to SIGKILL. +Two cleanup phases (run in order when both are enabled): -Detection modes (auto-selected): +1. **process reap** — finds `tractor` subactor processes + left alive after a `pytest` (or any tractor-app) run + that failed to fully cancel its actor tree, then sends + SIGINT with a bounded grace window before escalating + to SIGKILL. + +2. **shm sweep** (`--shm` / `--shm-only`) — unlinks + `/dev/shm/` entries owned by the current uid + that no live process has open (mmap'd or fd-held). + Needed because `tractor` disables + `mp.resource_tracker` (see `tractor.ipc._mp_bs`), so a + hard-crashing actor leaves leaked segments that + nothing else GCs. + +Process-reap detection modes (auto-selected): --parent : descendant-mode — kill procs whose PPid == . Use when a parent @@ -29,14 +41,21 @@ Detection modes (auto-selected): Usage: - # after a pytest run crashed/was Ctrl+C'd + # process reap only (default) scripts/tractor-reap + # process reap + shm sweep + scripts/tractor-reap --shm + + # only the shm sweep, skip process reap + scripts/tractor-reap --shm-only + # from inside a still-live supervisor scripts/tractor-reap --parent 12345 - # dry-run: list what would be reaped, don't signal + # dry-run: list what would be reaped, don't act scripts/tractor-reap -n + scripts/tractor-reap --shm -n ''' import argparse @@ -83,7 +102,21 @@ def main() -> int: parser.add_argument( '--dry-run', '-n', action='store_true', - help='list matched pids but do not signal', + help='list matched pids/paths but do not signal/unlink', + ) + parser.add_argument( + '--shm', + action='store_true', + help=( + 'after process reap, also unlink orphaned ' + '/dev/shm segments owned by the current user ' + 'that no live process is mapping or holding open' + ), + ) + parser.add_argument( + '--shm-only', + action='store_true', + help='skip process reap; only do the shm sweep', ) args = parser.parse_args() @@ -95,29 +128,54 @@ def main() -> int: from tractor._testing._reap import ( find_descendants, find_orphans, + find_orphaned_shm, reap, + reap_shm, ) - if args.parent is not None: - pids: list[int] = find_descendants(args.parent) - mode: str = f'descendants of PPid={args.parent}' - else: - pids = find_orphans(repo) - mode = f'orphans (PPid=1, cwd={repo})' + rc: int = 0 - if not pids: - print(f'[tractor-reap] no {mode} to reap') - return 0 + # --- phase 1: process reap (skipped under --shm-only) --- + if not args.shm_only: + if args.parent is not None: + pids: list[int] = find_descendants(args.parent) + mode: str = f'descendants of PPid={args.parent}' + else: + pids = find_orphans(repo) + mode = f'orphans (PPid=1, cwd={repo})' - if args.dry_run: - print(f'[tractor-reap] dry-run — {mode}:\n {pids}') - return 0 + if not pids: + print(f'[tractor-reap] no {mode} to reap') + elif args.dry_run: + print( + f'[tractor-reap] dry-run — {mode}:\n {pids}' + ) + else: + _, survivors = reap(pids, grace=args.grace) + if survivors: + rc = 1 - signalled, survivors = reap(pids, grace=args.grace) - # exit 0 if everyone exited cleanly, else 1 to signal - # escalation happened — makes the command useful in - # CI health-checks and `||`-chaining. - return 0 if not survivors else 1 + # --- phase 2: shm sweep (opt-in) --- + if args.shm or args.shm_only: + leaked: list[str] = find_orphaned_shm() + if not leaked: + print( + '[tractor-reap] no orphaned /dev/shm ' + 'segments to sweep' + ) + elif args.dry_run: + print( + f'[tractor-reap] dry-run — {len(leaked)} ' + f'orphaned shm segment(s):\n {leaked}' + ) + else: + _, errors = reap_shm(leaked) + if errors: + rc = 1 + + # exit 0 if everything cleaned cleanly, else 1 — useful + # for CI health-check chaining. + return rc if __name__ == '__main__': diff --git a/tractor/_testing/_reap.py b/tractor/_testing/_reap.py index 3e2309ff..f16c22d3 100644 --- a/tractor/_testing/_reap.py +++ b/tractor/_testing/_reap.py @@ -16,17 +16,25 @@ ''' Zombie-subactor reaper — SC-polite (SIGINT first, SIGKILL -as last resort with a bounded grace window). +as last resort with a bounded grace window) plus optional +`/dev/shm/` orphan-segment sweep. Shared implementation between the `tractor-reap` CLI (`scripts/tractor-reap`) and the pytest session-scoped auto-fixture that guards the test suite against leftover subactor processes. -Design notes ------------- +Design notes — process reap +--------------------------- + +- Linux-only today: reads `/proc//{status,cwd,cmdline}`. + Module imports cleanly elsewhere; calling `find_*` on a + non-Linux box returns an empty list (no `/proc` + enumeration). A future xplatform pass could swap this + for `psutil.Process.children()` / + `psutil.process_iter()` since `psutil` is already a + test-time dependency. -- Linux-only: reads `/proc//{status,cwd,cmdline}`. - Two detection modes: 1. **descendant-mode** — when invoked from a still-live @@ -49,14 +57,71 @@ Design notes we want the subactor runtime to run its trio cancel shield + IPC teardown paths where it can. +Design notes — shm sweep +------------------------ + +Since `tractor/ipc/_mp_bs.disable_mantracker()` turns off +`mp.resource_tracker` entirely, a hard-crashing actor can +leave `/dev/shm/` segments behind that nothing else +GCs (see +`ai/conc-anal/subint_forkserver_mp_shared_memory_issue.md`, +"Trade-offs / known gaps"). + +The shm sweep is **Linux-/FreeBSD-only**: both expose +POSIX shared-memory segments as regular files under +`/dev/shm`, so `os.stat()` + `os.unlink()` are the +correct primitives. macOS POSIX shm has no fs-visible +path (segments live behind `shm_open`/`shm_unlink` +syscalls only), and Windows is a different story +entirely. Calling the shm helpers on an unsupported +platform raises `NotImplementedError`. + +In-use enumeration delegates to `psutil` — +`Process.memory_maps()` (post-mmap) + +`Process.open_files()` (pre-mmap shm-opened fds) — +xplatform, mature, and handles the per-process +permission/race edge cases correctly. Segments matching +neither are genuinely leaked → safe to unlink. + +The "nobody has it open" check is the kernel-canonical +test — same answer `lsof /dev/shm/` would give. No +reliance on tractor-specific naming conventions (shm +keys are caller-defined). + ''' from __future__ import annotations import os import pathlib import signal +import stat +import sys import time +# `/dev/shm` is the POSIX-shm filesystem on Linux + FreeBSD. +# macOS uses `shm_open` syscalls without a fs-visible path, +# so the shm helpers refuse to run there. +_SHM_PLATFORM_OK: bool = sys.platform.startswith( + ('linux', 'freebsd') +) +SHM_DIR: str = '/dev/shm' + + +def _ensure_shm_supported() -> None: + ''' + Guard for shm helpers — they assume `/dev/shm` exists + as a tmpfs and `os.unlink()` is the right primitive. + Both true on Linux + FreeBSD; not true elsewhere. + + ''' + if not _SHM_PLATFORM_OK: + raise NotImplementedError( + f'shm reap is only supported on Linux/FreeBSD; ' + f'got sys.platform={sys.platform!r}. macOS ' + f'POSIX shm has no fs-visible path; Windows ' + f'has no /dev/shm equivalent.' + ) + def _read_status_ppid(pid: int) -> int | None: ''' @@ -69,7 +134,11 @@ def _read_status_ppid(pid: int) -> int | None: for line in f: if line.startswith('PPid:'): return int(line.split()[1]) - except (FileNotFoundError, PermissionError, ProcessLookupError): + except ( + FileNotFoundError, + PermissionError, + ProcessLookupError, + ): return None return None @@ -77,21 +146,32 @@ def _read_status_ppid(pid: int) -> int | None: def _read_cwd(pid: int) -> str | None: try: return os.readlink(f'/proc/{pid}/cwd') - except (FileNotFoundError, PermissionError, ProcessLookupError): + except ( + FileNotFoundError, + PermissionError, + ProcessLookupError, + ): return None def _read_cmdline(pid: int) -> str: try: with open(f'/proc/{pid}/cmdline', 'rb') as f: - return f.read().replace(b'\0', b' ').decode(errors='replace') - except (FileNotFoundError, PermissionError, ProcessLookupError): + return f.read().replace(b'\0', b' ').decode( + errors='replace', + ) + except ( + FileNotFoundError, + PermissionError, + ProcessLookupError, + ): return '' def _iter_live_pids() -> list[int]: ''' - Enumerate currently-alive pids from `/proc`. + Enumerate currently-alive pids from `/proc`. Returns + `[]` on systems without `/proc` (e.g. macOS). ''' try: @@ -225,6 +305,158 @@ def _is_alive(pid: int) -> bool: if line.startswith('State:'): # e.g. 'State:\tZ (zombie)' return 'Z' not in line.split()[1] - except (FileNotFoundError, ProcessLookupError): + except ( + FileNotFoundError, + ProcessLookupError, + ): return False return True + + +def _enumerate_in_use_shm( + shm_dir: str = SHM_DIR, +) -> set[str]: + ''' + Return the set of `/` paths currently + held open by any live process — via `psutil`'s + xplatform `Process.memory_maps()` (post-mmap + segments) and `Process.open_files()` (pre-mmap + shm-opened fds). + + Lazy-imports `psutil` so the module stays importable + on installs without it (it's a `testing` group dep). + + ''' + _ensure_shm_supported() + + # lazy + actionable failure: leaked shm sweep is the + # only thing in this module that needs psutil; we + # don't want a top-level ImportError breaking the + # process-reap path. + try: + import psutil + except ImportError as exc: + raise RuntimeError( + 'shm reap requires `psutil` — install the ' + '`testing` dep group, e.g. ' + '`uv sync --group testing`.' + ) from exc + + in_use: set[str] = set() + prefix: str = shm_dir.rstrip('/') + '/' + for proc in psutil.process_iter(['pid']): + try: + for m in proc.memory_maps(grouped=False): + if m.path.startswith(prefix): + in_use.add(m.path) + for f in proc.open_files(): + if f.path.startswith(prefix): + in_use.add(f.path) + except ( + psutil.NoSuchProcess, + psutil.AccessDenied, + psutil.ZombieProcess, + FileNotFoundError, + PermissionError, + ): + # raced — proc died or we can't see its + # mappings (e.g. root-owned). Skip; missing + # an in-use entry only means we'd preserve + # something we could reap, never the + # reverse — safe-by-default. + continue + return in_use + + +def find_orphaned_shm( + *, + uid: int | None = None, + shm_dir: str = SHM_DIR, +) -> list[str]: + ''' + `/` paths that are: + + - owned by `uid` (default: the current effective uid), + - and currently held by NO live process — i.e. + genuinely leaked. + + Linux/FreeBSD only — see module docstring. No reliance + on caller-defined shm-key naming, so this works for + any tractor app (not just the test suite). + + ''' + _ensure_shm_supported() + + if uid is None: + uid = os.geteuid() + + try: + entries: list[str] = os.listdir(shm_dir) + except OSError: + return [] + + in_use: set[str] = _enumerate_in_use_shm(shm_dir=shm_dir) + leaked: list[str] = [] + prefix: str = shm_dir.rstrip('/') + '/' + for entry in entries: + path: str = prefix + entry + try: + st: os.stat_result = os.stat(path) + except OSError: + continue + # only regular files — skip subdirs / sockets etc. + if not stat.S_ISREG(st.st_mode): + continue + if st.st_uid != uid: + continue + if path in in_use: + continue + leaked.append(path) + return leaked + + +def reap_shm( + paths: list[str], + *, + log=print, +) -> tuple[list[str], list[tuple[str, OSError]]]: + ''' + Unlink the given `/dev/shm/...` paths. + + Linux/FreeBSD only — `os.unlink()` is the correct + primitive on the POSIX-shm tmpfs there. macOS POSIX + shm has no fs-visible path; the equivalent there is + `posix_ipc.unlink_shared_memory(name)` (not + implemented here — see module docstring). + + Returns `(unlinked, errors)` where `errors` is a list + of `(path, exc)` for paths that could not be removed + (e.g. permissions). Paths that raced to being already- + gone are counted as successfully unlinked. + + ''' + _ensure_shm_supported() + + unlinked: list[str] = [] + errors: list[tuple[str, OSError]] = [] + for path in paths: + try: + os.unlink(path) + unlinked.append(path) + except FileNotFoundError: + # raced — already gone, treat as success + unlinked.append(path) + except OSError as exc: + errors.append((path, exc)) + + if unlinked: + log( + f'[tractor-reap] unlinked {len(unlinked)} ' + f'orphaned shm segment(s): {unlinked}' + ) + for path, exc in errors: + log( + f'[tractor-reap] could not unlink {path}: ' + f'{exc!r}' + ) + return (unlinked, errors) diff --git a/uv.lock b/uv.lock index 086ae8ef..1caf54cf 100644 --- a/uv.lock +++ b/uv.lock @@ -716,6 +716,7 @@ sync-pause = [ ] testing = [ { name = "pexpect" }, + { name = "psutil" }, { name = "pytest" }, { name = "pytest-timeout" }, ] @@ -761,6 +762,7 @@ subints = [{ name = "msgspec", marker = "python_full_version >= '3.14'", specifi sync-pause = [{ name = "greenback", marker = "python_full_version == '3.13.*'", specifier = ">=1.2.1,<2" }] testing = [ { name = "pexpect", specifier = ">=4.9.0,<5" }, + { name = "psutil", specifier = ">=7.0.0" }, { name = "pytest", specifier = ">=8.3.5" }, { name = "pytest-timeout", specifier = ">=2.3" }, ]