Add per-test runaway-subactor CPU detector to `_reap`
New `find_runaway_subactors()` helper + autouse `_detect_runaway_subactors_per_test` fixture that samples `psutil.cpu_percent()` on descendants to catch tight-loop bugs (e.g. a #452-class `recvfrom` loop on a closed socket). It checks both at setup (leftovers from a prior hung test) and at teardown (subactors spawned by this test). It intentionally does NOT kill the runaway — it emits a loud warning with diag commands (`strace`, `lsof`, `ss`, `kill`) so the pid stays alive for hands-on investigation. The session-end reaper still SIGINTs/SIGKILLs survivors on normal exit. (this commit msg was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code
subint_forkserver_backend
parent
32e89c67ee
commit
5cf0312c78
|
|
@ -218,6 +218,64 @@ def find_descendants(
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def find_runaway_subactors(
|
||||||
|
parent_pid: int,
|
||||||
|
*,
|
||||||
|
cpu_threshold: float = 95.0,
|
||||||
|
sample_interval: float = 0.5,
|
||||||
|
only_pids: set[int]|None = None,
|
||||||
|
) -> list[tuple[int, float, str]]:
|
||||||
|
'''
|
||||||
|
Return `(pid, cpu_pct, cmdline)` for any descendant
|
||||||
|
of `parent_pid` currently burning CPU above
|
||||||
|
`cpu_threshold` (default 95%) — the smoking-gun
|
||||||
|
signature of a runaway tight-loop bug (e.g. a C-level
|
||||||
|
`recvfrom` loop on a closed socket that missed EOF
|
||||||
|
detection; #452-class issue).
|
||||||
|
|
||||||
|
`cpu_percent(interval=sample_interval)` is the
|
||||||
|
canonical psutil API for a "what %CPU is this proc
|
||||||
|
using NOW" answer — it samples twice with a delta to
|
||||||
|
compute true utilization.
|
||||||
|
|
||||||
|
`only_pids` filters to a specific pre-snapshotted set
|
||||||
|
(e.g. "pids spawned during this test only"); when
|
||||||
|
`None`, all live descendants are checked.
|
||||||
|
|
||||||
|
Returns `[]` when `psutil` isn't installed or no
|
||||||
|
descendants match.
|
||||||
|
|
||||||
|
'''
|
||||||
|
try:
|
||||||
|
import psutil
|
||||||
|
except ImportError:
|
||||||
|
return []
|
||||||
|
|
||||||
|
candidates: list[int] = find_descendants(parent_pid)
|
||||||
|
if only_pids is not None:
|
||||||
|
candidates = [p for p in candidates if p in only_pids]
|
||||||
|
if not candidates:
|
||||||
|
return []
|
||||||
|
|
||||||
|
runaways: list[tuple[int, float, str]] = []
|
||||||
|
for pid in candidates:
|
||||||
|
try:
|
||||||
|
proc = psutil.Process(pid)
|
||||||
|
cpu: float = proc.cpu_percent(
|
||||||
|
interval=sample_interval,
|
||||||
|
)
|
||||||
|
if cpu < cpu_threshold:
|
||||||
|
continue
|
||||||
|
cmdline: str = ' '.join(proc.cmdline())
|
||||||
|
runaways.append((pid, cpu, cmdline))
|
||||||
|
except (
|
||||||
|
psutil.NoSuchProcess,
|
||||||
|
psutil.AccessDenied,
|
||||||
|
):
|
||||||
|
continue
|
||||||
|
return runaways
|
||||||
|
|
||||||
|
|
||||||
def find_orphans(
|
def find_orphans(
|
||||||
repo_root: pathlib.Path,
|
repo_root: pathlib.Path,
|
||||||
) -> list[int]:
|
) -> list[int]:
|
||||||
|
|
@ -728,14 +786,129 @@ def _track_orphaned_uds_per_test():
|
||||||
if new_orphans:
|
if new_orphans:
|
||||||
import warnings
|
import warnings
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
f'UDS sock-file LEAK detected from test '
|
'UDS sock-file LEAK detected from test '
|
||||||
f'(reaping):\n '
|
'(reaping):\n '
|
||||||
+ '\n '.join(new_orphans),
|
+ '\n '.join(new_orphans),
|
||||||
stacklevel=1,
|
stacklevel=1,
|
||||||
)
|
)
|
||||||
reap_uds(new_orphans)
|
reap_uds(new_orphans)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(
    scope='function',
    autouse=True,
)
def _detect_runaway_subactors_per_test():
    '''
    Per-test (function-scoped) autouse runaway-subactor
    detector.

    Runs two detection passes around each test:

    - SETUP (pre-yield): every descendant of the pytest
      process that already exists is CPU-sampled; any
      found above the threshold is reported as a leftover
      from a prior (hung) test.
    - TEARDOWN (post-yield): only descendants which were
      NOT present in the setup snapshot are sampled; any
      found above the threshold is reported as spawned by
      this test.

    Each report is a loud warning with `pid`, sampled
    `cpu%`, full `cmdline`, AND copy-pastable diag
    commands (`strace`, `lsof`, `ss`, `kill`).

    **Does NOT kill the runaway** — by design.
    The point of this fixture is to make tight-loop bugs
    (e.g. C-level `recvfrom` loop on a closed socket
    that missed EOF detection — issue #452-class) loudly
    visible AT the test that triggers, while keeping
    the live pid available for hands-on diagnosis. The
    session-end `_reap_orphaned_subactors` fixture will
    SIGINT-then-SIGKILL any survivors when the test
    session completes normally; if the user Ctrl-C's
    pytest mid-warning, the pid stays alive for as long
    as needed.

    Cost: one descendant snapshot pre-test plus CPU
    sampling of whatever descendants exist at that point
    (none for most tests), and post-test another snapshot
    plus CPU sampling only when there ARE new
    descendants. Skips silently when `psutil` isn't
    installed (`find_runaway_subactors()` returns `[]`).

    '''
    parent_pid: int = os.getpid()

    # single source of truth for BOTH the sampling
    # threshold and the figure quoted in the warning text
    # so the two can't drift apart.
    cpu_threshold: float = 95.0

    def _emit_runaway_warning(
        runaways: list[tuple[int, float, str]],
        when: str,
    ) -> None:
        '''
        Format + emit the runaway warning. Shared between
        the SETUP-side (pre-yield, catches survivors of a
        prior hung test) and TEARDOWN-side (post-yield,
        catches normally-completing tests that left a
        runaway behind) detection passes.

        `when` is a human readable label for the pass and
        is embedded verbatim in the warning header.

        '''
        msg_lines: list[str] = [
            f'RUNAWAY subactor(s) detected at {when} — '
            f'burning CPU (>{cpu_threshold:g}%):',
        ]
        for pid, cpu, cmdline in runaways:
            msg_lines.extend([
                f' pid={pid} cpu={cpu:.1f}% cmdline={cmdline!r}',
                ' diagnose live (pid stays alive — NOT killed):',
                f' sudo strace -p {pid} -f -tt -e trace=recvfrom,epoll_wait,read,write',
                f' sudo readlink /proc/{pid}/fd/* 2>/dev/null | head -20',
                f' sudo ss -tnp | grep {pid}',
                f' sudo lsof -p {pid}',
                ' manual kill when done:',
                f' kill -SIGINT {pid} # graceful first',
                f' kill -SIGKILL {pid} # if SIGINT ignored (busy in C)',
                '',
            ])
        import warnings
        warnings.warn(
            '\n'.join(msg_lines),
            stacklevel=1,
        )

    # SETUP-side detection: catches runaways inherited
    # from a PRIOR test that hung (and the user
    # Ctrl-C'd or pytest-timeout fired) — those tests'
    # teardown-side detector never ran, but the
    # subactor is still burning CPU when the next test
    # starts. The warning comes ONE TEST LATE which is
    # imperfect but better than silence.
    pre_existing: set[int] = set(find_descendants(parent_pid))
    pre_runaways: list[tuple[int, float, str]] = (
        find_runaway_subactors(
            parent_pid,
            cpu_threshold=cpu_threshold,
            only_pids=pre_existing,
        )
    )
    if pre_runaways:
        _emit_runaway_warning(
            pre_runaways,
            when='test SETUP (leftover from prior hung test)',
        )

    yield

    # TEARDOWN-side detection: catches runaways spawned
    # by THIS test that survived a normal teardown
    # (i.e. parent's `hard_kill` SIGKILL didn't actually
    # stop the runaway because it was in C tight-loop
    # somewhere unreachable to signals — see issue #452
    # forkserver-worker post-fork-close gap).
    #
    # only pids absent from the setup snapshot are
    # considered, so leftovers already reported at setup
    # are not reported a second time here.
    spawned_during_test: set[int] = (
        set(find_descendants(parent_pid)) - pre_existing
    )
    post_runaways: list[tuple[int, float, str]] = (
        find_runaway_subactors(
            parent_pid,
            cpu_threshold=cpu_threshold,
            only_pids=spawned_during_test,
        )
    )
    if post_runaways:
        _emit_runaway_warning(
            post_runaways,
            when='test teardown',
        )
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def reap_subactors_per_test() -> int:
|
def reap_subactors_per_test() -> int:
|
||||||
'''
|
'''
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue