Add per-test runaway-subactor CPU detector to `_reap`

New `find_runaway_subactors()` helper + autouse
`_detect_runaway_subactors_per_test` fixture that
samples `psutil.cpu_percent()` on descendants to
catch tight-loop bugs (e.g. #452-class `recvfrom`
on a closed socket). Checks both at setup
(leftovers from a prior hung test) and teardown
(spawned by this test).

Intentionally does NOT kill the runaway — emits
a loud warning with diag commands (`strace`,
`lsof`, `ss`, `kill`) so the pid stays alive for
hands-on investigation. Session-end reaper still
SIGINT/SIGKILL survivors on normal exit.

(this commit msg was generated in some part by [`claude-code`][claude-code-gh])
[claude-code-gh]: https://github.com/anthropics/claude-code
subint_forkserver_backend
Gud Boi 2026-05-04 10:15:55 -04:00
parent 32e89c67ee
commit 5cf0312c78
1 changed files with 175 additions and 2 deletions

View File

@ -218,6 +218,64 @@ def find_descendants(
] ]
def find_runaway_subactors(
parent_pid: int,
*,
cpu_threshold: float = 95.0,
sample_interval: float = 0.5,
only_pids: set[int]|None = None,
) -> list[tuple[int, float, str]]:
'''
Return `(pid, cpu_pct, cmdline)` for any descendant
of `parent_pid` currently burning CPU above
`cpu_threshold` (default 95%) the smoking-gun
signature of a runaway tight-loop bug (e.g. a C-level
`recvfrom` loop on a closed socket that missed EOF
detection; #452-class issue).
`cpu_percent(interval=sample_interval)` is the
canonical psutil API for a "what %CPU is this proc
using NOW" answer — it samples twice with a delta to
compute true utilization.
`only_pids` filters to a specific pre-snapshotted set
(e.g. "pids spawned during this test only"); when
`None`, all live descendants are checked.
Returns `[]` when `psutil` isn't installed or no
descendants match.
'''
try:
import psutil
except ImportError:
return []
candidates: list[int] = find_descendants(parent_pid)
if only_pids is not None:
candidates = [p for p in candidates if p in only_pids]
if not candidates:
return []
runaways: list[tuple[int, float, str]] = []
for pid in candidates:
try:
proc = psutil.Process(pid)
cpu: float = proc.cpu_percent(
interval=sample_interval,
)
if cpu < cpu_threshold:
continue
cmdline: str = ' '.join(proc.cmdline())
runaways.append((pid, cpu, cmdline))
except (
psutil.NoSuchProcess,
psutil.AccessDenied,
):
continue
return runaways
def find_orphans( def find_orphans(
repo_root: pathlib.Path, repo_root: pathlib.Path,
) -> list[int]: ) -> list[int]:
@ -728,14 +786,129 @@ def _track_orphaned_uds_per_test():
if new_orphans: if new_orphans:
import warnings import warnings
warnings.warn( warnings.warn(
f'UDS sock-file LEAK detected from test ' 'UDS sock-file LEAK detected from test '
f'(reaping):\n ' '(reaping):\n '
+ '\n '.join(new_orphans), + '\n '.join(new_orphans),
stacklevel=1, stacklevel=1,
) )
reap_uds(new_orphans) reap_uds(new_orphans)
@pytest.fixture(
scope='function',
autouse=True,
)
def _detect_runaway_subactors_per_test():
'''
Per-test (function-scoped) autouse runaway-subactor
detector.
Snapshots descendant pids before+after each test;
for any pid spawned during the test that's still
ALIVE at teardown AND burning >95% CPU, emits a loud
warning with `pid`, sampled `cpu%`, full `cmdline`,
AND copy-pastable diag commands (`strace`, `lsof`,
`ss`, `kill`).
**Does NOT kill the runaway** by design.
The point of this fixture is to make tight-loop bugs
(e.g. C-level `recvfrom` loop on a closed socket
that missed EOF detection issue #452-class) loudly
visible AT the test that triggers, while keeping
the live pid available for hands-on diagnosis. The
session-end `_reap_orphaned_subactors` fixture will
SIGINT-then-SIGKILL any survivors when the test
session completes normally; if the user Ctrl-C's
pytest mid-warning, the pid stays alive for as long
as needed.
Cost: one extra `os.listdir('/proc')` snapshot
pre-test, one snapshot + N×`psutil.cpu_percent(0.5)`
post-test (only when there ARE new descendants
most tests don't trigger any sampling). Skips
silently when `psutil` isn't installed.
'''
parent_pid: int = os.getpid()
def _emit_runaway_warning(
runaways: list[tuple[int, float, str]],
when: str,
) -> None:
'''
Format + emit the runaway warning. Shared between
the SETUP-side (pre-yield, catches survivors of a
prior hung test) and TEARDOWN-side (post-yield,
catches normally-completing tests that left a
runaway behind) detection passes.
'''
msg_lines: list[str] = [
f'RUNAWAY subactor(s) detected at {when}'
f'burning CPU (>95%):',
]
for pid, cpu, cmdline in runaways:
msg_lines.extend([
f' pid={pid} cpu={cpu:.1f}% cmdline={cmdline!r}',
f' diagnose live (pid stays alive — NOT killed):',
f' sudo strace -p {pid} -f -tt -e trace=recvfrom,epoll_wait,read,write',
f' sudo readlink /proc/{pid}/fd/* 2>/dev/null | head -20',
f' sudo ss -tnp | grep {pid}',
f' sudo lsof -p {pid}',
f' manual kill when done:',
f' kill -SIGINT {pid} # graceful first',
f' kill -SIGKILL {pid} # if SIGINT ignored (busy in C)',
'',
])
import warnings
warnings.warn(
'\n'.join(msg_lines),
stacklevel=1,
)
# SETUP-side detection: catches runaways inherited
# from a PRIOR test that hung (and the user
# Ctrl-C'd or pytest-timeout fired) — those tests'
# teardown-side detector never ran, but the
# subactor is still burning CPU when the next test
# starts. The warning comes ONE TEST LATE which is
# imperfect but better than silence.
pre_existing: set[int] = set(find_descendants(parent_pid))
pre_runaways: list[tuple[int, float, str]] = (
find_runaway_subactors(
parent_pid,
only_pids=pre_existing,
)
)
if pre_runaways:
_emit_runaway_warning(
pre_runaways,
when='test SETUP (leftover from prior hung test)',
)
yield
# TEARDOWN-side detection: catches runaways spawned
# by THIS test that survived a normal teardown
# (i.e. parent's `hard_kill` SIGKILL didn't actually
# stop the runaway because it was in C tight-loop
# somewhere unreachable to signals — see issue #452
# forkserver-worker post-fork-close gap).
post_runaways: list[tuple[int, float, str]] = (
find_runaway_subactors(
parent_pid,
only_pids=set(
find_descendants(parent_pid)
) - pre_existing,
)
)
if post_runaways:
_emit_runaway_warning(
post_runaways,
when='test teardown',
)
@pytest.fixture @pytest.fixture
def reap_subactors_per_test() -> int: def reap_subactors_per_test() -> int:
''' '''