Add `subint` cancellation + hard-kill test audit
Lock in the escape-hatch machinery added to `tractor.spawn._subint` during the Phase B.2/B.3 bringup (issue #379) so future stdlib regressions or our own refactors don't silently re-introduce the mid-suite hangs.

Details:
- `test_subint_happy_teardown`: baseline — spawn a subactor, one portal RPC, clean teardown. If this breaks, something's wrong unrelated to the hard-kill shields.
- `test_subint_non_checkpointing_child`: cancel a subactor stuck in a non-checkpointing Python loop (`threading.Event.wait()` releases the GIL but never inserts a trio checkpoint). Validates that the bounded-shield + daemon-driver-thread combo abandons the thread after `_HARD_KILL_TIMEOUT`.

Every test is wrapped in `trio.fail_after()` for a deterministic per-test wall-clock ceiling (an unbounded audit would defeat itself) and arms `tractor.devx.dump_on_hang()` so a hang captures a stack dump — pytest's stderr capture swallows `faulthandler` output by default.

Gated via `pytest.importorskip('concurrent.interpreters')` and a module-level skip when `--spawn-backend` isn't `'subint'`.

(this patch was generated in some part by [`claude-code`][claude-code-gh])
[claude-code-gh]: https://github.com/anthropics/claude-code

subint_spawner_backend
parent
79390a4e3a
commit
baf7ec54ac
|
|
@ -0,0 +1,197 @@
|
||||||
|
'''
|
||||||
|
Cancellation + hard-kill semantics audit for the `subint` spawn
|
||||||
|
backend.
|
||||||
|
|
||||||
|
Exercises the escape-hatch machinery added to
|
||||||
|
`tractor.spawn._subint` (module-level `_HARD_KILL_TIMEOUT`,
|
||||||
|
bounded shields around the soft-kill / thread-join sites, daemon
|
||||||
|
driver-thread abandonment) so that future stdlib regressions or
|
||||||
|
our own refactors don't silently re-introduce the hangs first
|
||||||
|
diagnosed during the Phase B.2/B.3 bringup (issue #379).
|
||||||
|
|
||||||
|
Every test in this module:
|
||||||
|
- is wrapped in `trio.fail_after()` for a deterministic per-test
|
||||||
|
wall-clock ceiling (the whole point of these tests is to fail
|
||||||
|
fast when our escape hatches regress; an unbounded test would
|
||||||
|
defeat itself),
|
||||||
|
- arms `tractor.devx.dump_on_hang()` to capture a stack dump on
|
||||||
|
failure — without it, a hang here is opaque because pytest's
|
||||||
|
stderr capture swallows `faulthandler` output by default
|
||||||
|
(hard-won lesson from the original diagnosis),
|
||||||
|
- skips on py<3.14 (no `concurrent.interpreters`) and on any
|
||||||
|
`--spawn-backend` other than `'subint'` (these tests are
|
||||||
|
subint-specific by design — they'd be nonsense under `trio` or
|
||||||
|
`mp_*`).
|
||||||
|
|
||||||
|
'''
|
||||||
|
from __future__ import annotations
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import trio
|
||||||
|
import tractor
|
||||||
|
from tractor.devx import dump_on_hang
|
||||||
|
|
||||||
|
|
||||||
|
# Version gate: skip the whole module unless the public stdlib
# wrapper `concurrent.interpreters` is importable. Per the notes
# in `tractor.spawn._subint` that wrapper arrived in py3.14; the
# private `_interpreters` module is already present on 3.13 but
# wedges under tractor's usage, so its presence is deliberately
# NOT used as the signal here.
pytest.importorskip('concurrent.interpreters')

# Backend gate: these tests only make sense under the `subint`
# spawner. `pytest_configure` is expected to have committed the
# CLI-selected backend (via `try_set_start_method()`) before this
# module gets imported, so reading the module-level value at
# import time reflects the `--spawn-backend` choice.
from tractor.spawn._spawn import _spawn_method  # noqa: E402

if _spawn_method != 'subint':
    # module-level mark => every test in this file is skipped.
    pytestmark = pytest.mark.skip(
        reason="subint-specific cancellation audit — "
               "pass `--spawn-backend=subint` to run.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
# child-side task bodies (run inside the spawned subint)
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
async def _trivial_rpc() -> str:
    '''
    Child-side RPC body used by the happy-path baseline.

    Does no work and never awaits; it just hands back a fixed
    greeting which the parent-side harness compares against.

    '''
    greeting: str = 'hello from subint'
    return greeting
|
||||||
|
|
||||||
|
|
||||||
|
async def _spin_without_trio_checkpoints() -> None:
    '''
    Child-side body which blocks forever WITHOUT ever awaiting.

    The coroutine contains no `await`, so once it starts running
    it never reaches a trio checkpoint and therefore never gets a
    chance to observe a cancellation request; this models the
    "stuck subint" case the parent-side hard-kill shields are
    meant to survive.

    The blocking is done with `threading.Event.wait(timeout)` on
    an event that nothing ever sets: a plain blocking stdlib call
    (which releases the GIL while it waits) re-armed in a loop.

    '''
    from threading import Event

    # nothing holds a reference to this event besides us, so it
    # can never become set and the loop below never terminates.
    blocker = Event()

    # `Event.wait()` returns `False` on timeout; 1s re-check
    # granularity keeps CPU use negligible while being coarse
    # enough that a pathologically slow `_HARD_KILL_TIMEOUT`
    # won't accidentally line up with a wake.
    while not blocker.wait(timeout=1.0):
        pass
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
# parent-side harnesses (driven inside `trio.run(...)`)
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
async def _happy_path(
    reg_addr: tuple[str, int|str],
    deadline: float,
) -> None:
    '''
    Parent-side harness for the baseline test.

    Opens a root actor (passing `reg_addr` as its only registry
    address) plus an actor nursery, runs `_trivial_rpc` in a
    subactor named `'subint-happy'` and asserts on the value it
    returns. Both context managers are then exited normally; no
    cancellation is requested.

    The whole sequence runs under `trio.fail_after(deadline)` so
    it errors out instead of hanging when it takes longer than
    `deadline` seconds.

    '''
    with trio.fail_after(deadline):
        async with tractor.open_root_actor(
            registry_addrs=[reg_addr],
        ):
            async with tractor.open_nursery() as an:
                ptl: tractor.Portal = await an.run_in_actor(
                    _trivial_rpc,
                    name='subint-happy',
                )
                assert (
                    await ptl.wait_for_result()
                ) == 'hello from subint'
|
||||||
|
|
||||||
|
|
||||||
|
async def _spawn_stuck_then_cancel(
    reg_addr: tuple[str, int|str],
    deadline: float,
) -> None:
    '''
    Parent-side harness for the stuck-child test.

    Opens a root actor (passing `reg_addr` as its only registry
    address) plus an actor nursery, starts
    `_spin_without_trio_checkpoints` in a subactor named
    `'subint-stuck'`, sleeps briefly and then cancels the
    nursery's cancel scope.

    Everything is bounded by `trio.fail_after(deadline)`: if
    teardown of the stuck child does not complete within
    `deadline` seconds the harness fails rather than hangs.

    '''
    with trio.fail_after(deadline):
        async with tractor.open_root_actor(
            registry_addrs=[reg_addr],
        ):
            async with tractor.open_nursery() as an:
                await an.run_in_actor(
                    _spin_without_trio_checkpoints,
                    name='subint-stuck',
                )

                # Settle time so the child can actually enter its
                # non-checkpointing loop before being cancelled;
                # the exact value is not important as long as it
                # spans a handful of trio scheduler ticks.
                settle: float = 0.5
                await trio.sleep(settle)

                an.cancel_scope.cancel()
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
# tests
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_subint_happy_teardown(
    reg_addr: tuple[str, int|str],
) -> None:
    '''
    Baseline sanity check: one subactor, one portal RPC, then a
    clean nursery exit with no cancellation and no faults.

    A failure here points at the spawn/teardown layer itself
    rather than at the hard-kill escape hatches exercised by the
    other test(s) in this module.

    The same 10s budget is handed both to `dump_on_hang()` (so a
    stack dump is written to the given path if we wedge) and to
    the harness' own `trio.fail_after()`.

    '''
    deadline: float = 10.0
    dump_path: str = '/tmp/subint_cancellation_happy.dump'
    main = partial(_happy_path, reg_addr, deadline)

    with dump_on_hang(seconds=deadline, path=dump_path):
        trio.run(main)
|
||||||
|
|
||||||
|
|
||||||
|
def test_subint_non_checkpointing_child(
    reg_addr: tuple[str, int|str],
) -> None:
    '''
    Cancel a subactor whose main task sits in a blocking Python
    loop that never hits a trio checkpoint.

    The harness cancels the actor nursery's cancel scope while
    the child is stuck; since the child's main task never
    checkpoints it cannot observe the resulting cancellation, so
    its `trio.run()` can't exit gracefully.

    Expected outcome: the parent-side `subint_proc` machinery
    (bounded shield + daemon driver thread) gives up on the stuck
    thread after `_HARD_KILL_TIMEOUT` and the parent returns
    cleanly within the deadline.

    Wall-clock budget behind the 15s deadline:
    - ~0.5s: settle time for the child to enter the stuck loop
    - ~3s: `_HARD_KILL_TIMEOUT` (soft-kill wait)
    - ~3s: `_HARD_KILL_TIMEOUT` (thread-join wait)
    - the remainder is margin

    '''
    deadline: float = 15.0
    dump_path: str = '/tmp/subint_cancellation_stuck.dump'
    main = partial(
        _spawn_stuck_then_cancel,
        reg_addr,
        deadline,
    )

    with dump_on_hang(seconds=deadline, path=dump_path):
        trio.run(main)
|
||||||
Loading…
Reference in New Issue