Add signal-alarm guard to `test_dynamic_pub_sub`

Add an outer `signal.alarm` cap that fires even when trio's
`fail_after` is blocked by a shielded-await deadlock
(the bug-class-3 hang under MTF backends). The alarm is only
armed for fork-based spawners, where the bug lives.

Details:
- `_DIAG_CAP_S = fail_after_s + 5` — slightly larger than the
  trio-native guard so it always loses when the in-band path works.
- `test_log.cancel()` breadcrumbs at each cancel-scope boundary so the
  last-fired breadcrumb names the swallow point on hang.
- try/finally wrapping around each scope level for deterministic
  breadcrumb emission.
- add `is_forking_spawner`, `set_fork_aware_capture` fixture params.
- rework `fail_after_s`: 4s for fork, 12s for trio (was 30/12).

Also:
- `test_sigint_both_stream_types`: `assert 0` -> `pytest.fail()`, add
  TODO re `pytest.raises()`.

(this patch was generated in some part by [`claude-code`][claude-code-gh])
[claude-code-gh]: https://github.com/anthropics/claude-code
subint_forkserver_backend
Gud Boi 2026-05-13 11:33:49 -04:00
parent 83b6a3373a
commit 10db117864
1 changed files with 127 additions and 62 deletions

View File

@ -147,6 +147,9 @@ def test_dynamic_pub_sub(
test_log: tractor.log.StackLevelAdapter,
reap_subactors_per_test: int,
expect_cancel_exc: Type[BaseException],
is_forking_spawner: bool,
set_fork_aware_capture,
):
failed_to_raise_report: str = (
f'Never got a {expect_cancel_exc!r} ??'
@ -157,79 +160,138 @@ def test_dynamic_pub_sub(
from multiprocessing import cpu_count
cpus = cpu_count()
# Hard safety cap via trio's own cancellation — see the
# module-level NOTE on why we avoid `pytest-timeout` for
# this test. Picked backend-aware: under `trio` backend
# spawn is cheap (~1s for `cpus` actors) but fork-based
# backends pay a per-spawn cost (forkserver round-trip +
# IPC peer-handshake) that can stack up over `cpus - 1`
# sequential `n.run_in_actor()` calls — especially on UDS
# under cross-pytest contention (#451 / #452). Empirically
# 12s flakes on `main_thread_forkserver`; 30s gives
# plenty of headroom while still failing-loud on a real
# hang.
from tractor.spawn import _spawn as _spawn_mod
# Hard safety cap via trio's own cancellation. NOTE see the
# module-level note on why we avoid `pytest-timeout` for this
# test. Picked backend-aware: under `trio` backend spawn is
# cheap (~1s for `cpus` actors) but fork-based backends pay
# a per-spawn cost (forkserver round-trip + IPC peer-handshake)
# that can stack up over `cpus - 1` sequential `n.run_in_actor()`
# calls — especially on UDS under cross-pytest contention
# (#451 / #452). Empirically a flat 15s flakes on
# `main_thread_forkserver` for many-cpu hosts (a single bad
# spawn-stack puts total run-time at ~15.5s, just over);
# 30s gives plenty of headroom while still failing-loud on
# a real hang.
#
# XXX caveat: this is an *inner* `trio.fail_after` — its
# `Cancelled` cannot reach a task parked in a shielded `await`
# (e.g. inside actor-nursery teardown). When the in-band cancel
# path is itself buggy (the bug-class-3 `raise KBI` swallow we're
# currently chasing) this guard does NOT fire and the test sits
# forever until external SIGINT. The `_DIAG_CAP_S` outer guard
# below is the AFK-safety counterpart.
fail_after_s: int = (
30
if _spawn_mod._spawn_method == 'main_thread_forkserver'
4
if is_forking_spawner
else 12
)
# outer guard: when the inner fail_after fails to fire because of
# a shielded-await deadlock, this cap *aborts the trio run via
# signal.alarm → KBI* so AFK runs don't sit for >20min on the
# bug-class-3 hang. Slightly larger than `fail_after_s` so the
# trio-native path always wins when it works.
_DIAG_CAP_S: int = fail_after_s + 5
async def main():
with trio.fail_after(fail_after_s):
async with tractor.open_nursery(
registry_addrs=[reg_addr],
debug_mode=debug_mode,
) as n:
# bug-class-3 breadcrumb: tag each level of the cancel path
# so when the run hangs and we capture cancel-level logs, the
# *last* breadcrumb that fired names the swallow point.
test_log.cancel('test_dynamic_pub_sub: enter main()')
try:
with trio.fail_after(fail_after_s):
test_log.cancel(
f'test_dynamic_pub_sub: '
f'enter `trio.fail_after({fail_after_s})` scope'
)
try:
async with tractor.open_nursery(
registry_addrs=[reg_addr],
debug_mode=debug_mode,
) as n:
test_log.cancel(
'test_dynamic_pub_sub: '
'actor nursery opened'
)
# name of this actor will be same as target func
await n.run_in_actor(publisher)
# name of this actor will be same as target func
await n.run_in_actor(publisher)
for i, sub in zip(
range(cpus - 2),
itertools.cycle(_registry.keys())
):
await n.run_in_actor(
consumer,
name=f'consumer_{sub}',
subs=[sub],
for i, sub in zip(
range(cpus - 2),
itertools.cycle(_registry.keys())
):
await n.run_in_actor(
consumer,
name=f'consumer_{sub}',
subs=[sub],
)
# make one dynamic subscriber
await n.run_in_actor(
consumer,
name='consumer_dynamic',
subs=list(_registry.keys()),
)
# block until "cancelled by user"
await trio.sleep(3)
test_log.warning(
f'Raising user cancel exc: '
f'{expect_cancel_exc!r}'
)
test_log.cancel(
f'test_dynamic_pub_sub: '
f'ABOUT TO RAISE {expect_cancel_exc!r}'
)
raise expect_cancel_exc('simulate user cancel!')
finally:
test_log.cancel(
'test_dynamic_pub_sub: '
'actor nursery `__aexit__` returned'
)
test_log.cancel(
'test_dynamic_pub_sub: `fail_after` scope exited'
)
finally:
test_log.cancel(
'test_dynamic_pub_sub: leaving `main()`'
)
# make one dynamic subscriber
await n.run_in_actor(
consumer,
name='consumer_dynamic',
subs=list(_registry.keys()),
)
# block until "cancelled by user"
await trio.sleep(3)
test_log.warning(
f'Raising user cancel exc: '
f'{expect_cancel_exc!r}'
)
raise expect_cancel_exc('simulate user cancel!')
# outer signal-based guard — survives a shielded-await deadlock
# since `signal.alarm` raises in the main thread regardless of
# trio's scope state. ONLY armed under fork-based backends since
# the bug we're chasing is MTF-specific.
import signal
armed_alarm: bool = bool(is_forking_spawner)
if armed_alarm:
signal.alarm(_DIAG_CAP_S)
try:
trio.run(main)
pytest.fail(failed_to_raise_report)
except expect_cancel_exc:
# parent-side raised the user-cancel exc directly and
# it propagated unwrapped; clean path.
test_log.exception('Got user-cancel exc AS EXPECTED')
except BaseExceptionGroup as err:
# under fork-based backends the user-raised cancel
# can race with subactor-side stream teardown
# (`trio.EndOfChannel` from a publisher's `send()`
# whose remote half got cut). The expected exc may
# then be nested deeper in the group rather than at
# the top level. `BaseExceptionGroup.split()` walks
# the exc tree recursively (Python 3.11+).
matched, _ = err.split(expect_cancel_exc)
if matched is None:
try:
trio.run(main)
pytest.fail(failed_to_raise_report)
except expect_cancel_exc:
# parent-side raised the user-cancel exc directly and
# it propagated unwrapped; clean path.
test_log.exception('Got user-cancel exc AS EXPECTED')
except BaseExceptionGroup as err:
# under fork-based backends the user-raised cancel
# can race with subactor-side stream teardown
# (`trio.EndOfChannel` from a publisher's `send()`
# whose remote half got cut). The expected exc may
# then be nested deeper in the group rather than at
# the top level. `BaseExceptionGroup.split()` walks
# the exc tree recursively (Python 3.11+).
matched, _ = err.split(expect_cancel_exc)
if matched is None:
pytest.fail(failed_to_raise_report)
test_log.exception('Got user-cancel exc AS EXPECTED')
test_log.exception('Got user-cancel exc AS EXPECTED')
finally:
# always disarm so a passing test doesn't get killed
# post-trio.run by a stale alarm.
if armed_alarm:
signal.alarm(0)
@tractor.context
@ -361,9 +423,12 @@ def test_sigint_both_stream_types():
resp = await stream.receive()
assert resp == msg
raise KeyboardInterrupt
# TODO, use pytest.raises() here instead?
# (why weren't we originally?)
try:
trio.run(main)
assert 0, "Didn't receive KBI!?"
pytest.fail("Didn't receive KBI!?")
except KeyboardInterrupt:
pass