Add `_testing._reap` + auto-reap fixture

Zombie-subactor cleanup for the test suite, SC-polite discipline
(`SIGINT` first, bounded grace, `SIGKILL` only on survivors). Two parts:
a shared reaper module + an autouse session-end fixture that runs it.

Deats,
- new `tractor/_testing/_reap.py` (+230 LOC) — Linux-only reaper using
  `/proc/<pid>/{status,cwd,cmdline}` inspection. Two detection modes:
  - `find_descendants(parent_pid)` for the in-session case
    (PPid-direct-match while pytest is still alive).
  - `find_orphans(repo_root)` for the CLI / post-mortem case (`PPid==1`
    reparented to init + `cwd` filter to repo root + `python` cmdline
    filter).
- `reap(pids, *, grace=3.0, poll=0.25)` does the signal ladder: SIGINT
  all, poll up to `grace` for exit, SIGKILL any survivors. Returns
  `(signalled, killed)` for caller-side reporting.
- new `_reap_orphaned_subactors` session-scoped autouse fixture in
  `tractor/_testing/pytest.py` — after `yield`, runs
  `find_descendants(os.getpid())` + `reap(...)` so each pytest session
  leaves no surviving forks.
- companion CLI scaffolding lives at `scripts/tractor-reap` (separate
  commit) for the pytest-died-mid-session case where the in-session
  fixture didn't get to run.

Also,
- promote `from tractor.spawn._spawn import SpawnMethodKey` to
  module-top in `pytest.py` (was inline-imported inside
  `pytest_generate_tests`), and reuse it in
  `pytest_collection_modifyitems` to assert each `skipon_spawn_backend`
  mark arg is a valid spawn-method literal — catches typos at collection
  time.
- inline `# ?TODO` flags running these through the `try_set_backend`
  checker for stronger validation.

Cross-refs `feedback_sc_graceful_cancel_first.md` for the
SIGINT-before-SIGKILL discipline rationale.

(this patch was generated in some part by [`claude-code`][claude-code-gh])
[claude-code-gh]: https://github.com/anthropics/claude-code
subint_forkserver_backend
Gud Boi 2026-04-25 00:05:58 -04:00
parent 44bdb1697c
commit eae478f3d5
2 changed files with 273 additions and 2 deletions

View File

@ -0,0 +1,230 @@
# tractor: structured concurrent "actors".
# Copyright 2018-eternity Tyler Goodlet.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
'''
Zombie-subactor reaper: SC-polite (SIGINT first, SIGKILL
as last resort, with a bounded grace window).
Shared implementation between the `tractor-reap` CLI
(`scripts/tractor-reap`) and the pytest session-scoped
auto-fixture that guards the test suite against leftover
subactor processes.
Design notes
------------
- Linux-only: reads `/proc/<pid>/{status,cwd,cmdline}`.
- Two detection modes:
1. **descendant-mode** when invoked from a still-live
parent (e.g. a pytest session-end fixture), match by
`PPid == parent_pid`. Direct + precise; the target
PIDs are still parented to the live pytest process
at teardown time, before pytest exits.
2. **orphan-mode** when invoked after the parent died
(e.g. the `tractor-reap` CLI run post-Ctrl+C), match
by `PPid == 1` (reparented to init) AND `cwd ==
<repo-root>` AND cmdline contains `python`. The cwd
filter is what keeps the heuristic from sweeping up
unrelated init-children on the box.
- Escalation: for every matched PID, SIGINT, poll for up
to `grace` seconds, then SIGKILL any survivors. The
two-phase pattern is the SC-graceful-cancel discipline
documented in `feedback_sc_graceful_cancel_first.md`:
we want the subactor runtime to run its trio cancel
shield + IPC teardown paths where it can.
'''
from __future__ import annotations
import os
import pathlib
import signal
import time
def _read_status_ppid(pid: int) -> int | None:
    '''
    Look up the parent pid of `pid` by scanning
    `/proc/<pid>/status` for its `PPid:` field.

    Returns `None` when the status file is missing or
    unreadable (the proc exited, or we lack permission),
    or when no `PPid:` line is present.

    '''
    status_path: str = f'/proc/{pid}/status'
    ppid: int | None = None
    try:
        with open(status_path) as status_file:
            for entry in status_file:
                if not entry.startswith('PPid:'):
                    continue
                # line shape is 'PPid:\t<int>'
                ppid = int(entry.split()[1])
                break
    except (FileNotFoundError, PermissionError, ProcessLookupError):
        return None
    return ppid
def _read_cwd(pid: int) -> str | None:
    '''
    Return the target of the `/proc/<pid>/cwd` symlink,
    i.e. the working directory of `pid`.

    Returns `None` if the link can't be read because the
    proc is gone or we lack permission.

    '''
    cwd_link: str = f'/proc/{pid}/cwd'
    try:
        target: str = os.readlink(cwd_link)
    except (FileNotFoundError, PermissionError, ProcessLookupError):
        return None
    return target
def _read_cmdline(pid: int) -> str:
    '''
    Return the command line of `pid` as one space-joined
    string, read from `/proc/<pid>/cmdline`.

    The NUL separators between argv entries are swapped
    for spaces and undecodable bytes are replaced.
    Returns `''` if the file is missing or unreadable.

    '''
    try:
        with open(f'/proc/{pid}/cmdline', 'rb') as cmdline_file:
            raw: bytes = cmdline_file.read()
    except (FileNotFoundError, PermissionError, ProcessLookupError):
        return ''

    # argv entries are NUL-delimited in procfs
    argv_bytes: bytes = b' '.join(raw.split(b'\0'))
    return argv_bytes.decode(errors='replace')
def _iter_live_pids() -> list[int]:
    '''
    List the pids currently visible under `/proc`, i.e.
    every all-digit directory entry converted to `int`.

    Returns an empty list if `/proc` can't be listed.

    '''
    try:
        names: list[str] = os.listdir('/proc')
    except OSError:
        return []

    live: list[int] = []
    for name in names:
        # non-numeric entries (`self`, `meminfo`, ..) are
        # not processes.
        if name.isdigit():
            live.append(int(name))
    return live
def find_descendants(
    parent_pid: int,
) -> list[int]:
    '''
    Collect every live pid whose `PPid` equals
    `parent_pid`.

    Only *direct* children are matched; grandchildren are
    not followed. Intended for the pytest session-end
    fixture, where `parent_pid` is the still-running
    pytest-python process.

    '''
    children: list[int] = []
    for candidate in _iter_live_pids():
        if _read_status_ppid(candidate) != parent_pid:
            continue
        children.append(candidate)
    return children
def find_orphans(
    repo_root: pathlib.Path,
) -> list[int]:
    '''
    PIDs that are:

    - reparented to init (`PPid == 1`),
    - have `cwd == <repo_root>`,
    - and have a `python` in their cmdline.

    This is the "pytest-died-mid-session" case where the
    subactor forks got reparented. The cwd filter is the
    critical bit that keeps us from sweeping up unrelated
    init-children on the box.

    `repo_root` may be relative or contain symlinks; it
    is canonicalized before the comparison. Only an exact
    cwd match counts, so a proc sitting in a sub-directory
    of the repo is NOT reported.

    '''
    # `/proc/<pid>/cwd` reads back as the kernel-resolved
    # absolute path, so a relative or symlinked
    # `repo_root` would never compare equal unless it is
    # canonicalized the same way first.
    repo: str = os.path.realpath(repo_root)

    hits: list[int] = []
    for pid in _iter_live_pids():
        # cheapest filter first: most procs aren't
        # init-children.
        if _read_status_ppid(pid) != 1:
            continue
        if _read_cwd(pid) != repo:
            continue
        if 'python' not in _read_cmdline(pid):
            continue
        hits.append(pid)
    return hits
def reap(
    pids: list[int],
    *,
    grace: float = 3.0,
    poll: float = 0.25,
    log=print,
) -> tuple[list[int], list[int]]:
    '''
    Deliver SIGINT to each pid, wait up to `grace`
    seconds for them to exit, then SIGKILL any that
    survive.

    Parameters:

    - `pids`: process ids to reap.
    - `grace`: upper bound, in seconds, on the wait
      between the SIGINT round and the SIGKILL round.
    - `poll`: seconds between liveness checks during the
      grace window; the final sleep is clamped so the
      total wait never exceeds `grace`.
    - `log`: callable taking one `str`, used for the
      user-visible progress lines; defaults to `print`.

    Returns `(signalled, killed)` so callers can report /
    assert: `signalled` holds the pids SIGINT was actually
    delivered to, `killed` the subset SIGKILL was actually
    delivered to.

    Pids that vanish before a signal lands, or that we
    lack permission to signal, are skipped instead of
    aborting the reap of the remaining pids.

    '''
    if not pids:
        return ([], [])

    signalled: list[int] = []
    for pid in pids:
        try:
            os.kill(pid, signal.SIGINT)
        except ProcessLookupError:
            # raced — already gone
            continue
        except PermissionError:
            # not ours to signal; keep going so one
            # foreign pid can't block the rest.
            log(
                f'[tractor-reap] no permission to SIGINT '
                f'pid {pid}, skipping'
            )
            continue
        signalled.append(pid)

    if not signalled:
        # nothing received a signal, so there is nothing
        # to wait on.
        return ([], [])

    log(
        f'[tractor-reap] SIGINT → {len(signalled)} '
        f'proc(s): {signalled}'
    )

    deadline: float = time.monotonic() + grace
    alive: list[int] = signalled
    while True:
        remaining: float = deadline - time.monotonic()
        if remaining <= 0:
            break
        # never sleep past the deadline, `grace` is a
        # hard bound.
        time.sleep(min(poll, remaining))
        alive = [
            pid for pid in alive if _is_alive(pid)
        ]
        if not alive:
            return (signalled, [])

    survivors: list[int] = [
        pid for pid in alive if _is_alive(pid)
    ]
    if not survivors:
        return (signalled, [])

    log(
        f'[tractor-reap] SIGKILL (after {grace}s '
        f'grace) → {survivors}'
    )
    killed: list[int] = []
    for pid in survivors:
        try:
            os.kill(pid, signal.SIGKILL)
        except ProcessLookupError:
            # exited between the last check and now
            continue
        except PermissionError:
            log(
                f'[tractor-reap] no permission to SIGKILL '
                f'pid {pid}, skipping'
            )
            continue
        killed.append(pid)
    return (signalled, killed)
def _is_alive(pid: int) -> bool:
    '''
    Report whether `pid` is still a running process.

    `False` when `/proc/<pid>/status` is gone or when its
    `State:` field marks the proc as a zombie (`Z`);
    `True` otherwise, including when no `State:` line is
    found.

    '''
    try:
        with open(f'/proc/{pid}/status') as status_file:
            # stop at the first `State:` line, nothing
            # after it is needed.
            state_line: str | None = next(
                (
                    entry
                    for entry in status_file
                    if entry.startswith('State:')
                ),
                None,
            )
    except (FileNotFoundError, ProcessLookupError):
        return False

    if state_line is None:
        return True

    # e.g. 'State:\tZ (zombie)'
    state_code: str = state_line.split()[1]
    return 'Z' not in state_code

View File

@ -32,6 +32,7 @@ from typing import (
import pytest
import tractor
from tractor.spawn._spawn import SpawnMethodKey
import trio
@ -274,7 +275,12 @@ def pytest_collection_modifyitems(
default_reason: str = f'Borked on --spawn-backend={backend!r}'
for item in items:
for mark in item.iter_markers(name='skipon_spawn_backend'):
if backend in mark.args:
skip_backends: tuple[str] = mark.args
for skip_backend in skip_backends:
assert skip_backend in get_args(SpawnMethodKey)
# ?TODO, run these through the try-set-backend checker to
# avoid typos?
if backend in skip_backends:
reason: str = mark.kwargs.get(
'reason',
default_reason,
@ -285,6 +291,42 @@ def pytest_collection_modifyitems(
break
@pytest.fixture(scope='session', autouse=True)
def _reap_orphaned_subactors():
    '''
    Autouse, session-scoped teardown guard: once the test
    session is over, find any processes still parented to
    this `pytest` process and reap them; SIGINT first, a
    bounded grace window, then SIGKILL for survivors.

    Rationale: under fork-based spawn backends (notably
    `subint_forkserver`), a test that times out or bails
    mid-teardown can leave subactor forks alive. Without
    this reap, they linger across sessions and compete
    for ports / inherit pytest's capture-pipe fds — which
    flakifies later tests. SIGINT goes first so the
    subactor's trio cancel shield + IPC teardown paths get
    a chance to run before we escalate.

    Companion CLI for the pytest-died-mid-session case:
    `scripts/tractor-reap`.

    '''
    import os

    # capture our own pid up front, before any test runs
    session_pid: int = os.getpid()

    yield  # the whole test session runs here

    # imported lazily, only needed at teardown
    from tractor._testing._reap import (
        find_descendants,
        reap,
    )

    leftovers: list[int] = find_descendants(session_pid)
    if not leftovers:
        return

    reap(leftovers, grace=3.0)
@pytest.fixture(scope='session')
def debug_mode(
request: pytest.FixtureRequest,
@ -398,7 +440,6 @@ def pytest_generate_tests(
# drive the valid-backend set from the canonical `Literal` so
# adding a new spawn backend (e.g. `'subint'`) doesn't require
# touching the harness.
from tractor.spawn._spawn import SpawnMethodKey
assert spawn_backend in get_args(SpawnMethodKey)
# NOTE: used-to-be-used-to dynamically parametrize tests for when