Add signal-alarm guard to `test_dynamic_pub_sub`

Outer `signal.alarm` cap that fires even when trio's `fail_after` is blocked by a shielded-await deadlock (the bug-class-3 hang under MTF backends). Only armed for fork-based spawners where the bug lives.

Details,
- `_DIAG_CAP_S = fail_after_s + 5` — slightly larger than the trio-native guard so it always loses when the in-band path works.
- `test_log.cancel()` breadcrumbs at each cancel-scope boundary so the last-fired breadcrumb names the swallow point on hang.
- try/finally wrapping around each scope level for deterministic breadcrumb emission.
- add `is_forking_spawner`, `set_fork_aware_capture` fixture params.
- rework `fail_after_s`: 4s for fork, 12s for trio (was 30/12).

Also,
- `test_sigint_both_stream_types`: `assert 0` -> `pytest.fail()`, add TODO re `pytest.raises()`.

(this patch was generated in some part by [`claude-code`][claude-code-gh])

[claude-code-gh]: https://github.com/anthropics/claude-code

subint_forkserver_backend
parent
83b6a3373a
commit
10db117864
|
|
@ -147,6 +147,9 @@ def test_dynamic_pub_sub(
|
||||||
test_log: tractor.log.StackLevelAdapter,
|
test_log: tractor.log.StackLevelAdapter,
|
||||||
reap_subactors_per_test: int,
|
reap_subactors_per_test: int,
|
||||||
expect_cancel_exc: Type[BaseException],
|
expect_cancel_exc: Type[BaseException],
|
||||||
|
|
||||||
|
is_forking_spawner: bool,
|
||||||
|
set_fork_aware_capture,
|
||||||
):
|
):
|
||||||
failed_to_raise_report: str = (
|
failed_to_raise_report: str = (
|
||||||
f'Never got a {expect_cancel_exc!r} ??'
|
f'Never got a {expect_cancel_exc!r} ??'
|
||||||
|
|
@ -157,30 +160,59 @@ def test_dynamic_pub_sub(
|
||||||
from multiprocessing import cpu_count
|
from multiprocessing import cpu_count
|
||||||
cpus = cpu_count()
|
cpus = cpu_count()
|
||||||
|
|
||||||
# Hard safety cap via trio's own cancellation — see the
|
# Hard safety cap via trio's own cancellation. NOTE see the
|
||||||
# module-level NOTE on why we avoid `pytest-timeout` for
|
# module-level note on why we avoid `pytest-timeout` for this
|
||||||
# this test. Picked backend-aware: under `trio` backend
|
# test. Picked backend-aware: under `trio` backend spawn is
|
||||||
# spawn is cheap (~1s for `cpus` actors) but fork-based
|
# cheap (~1s for `cpus` actors) but fork-based backends pay
|
||||||
# backends pay a per-spawn cost (forkserver round-trip +
|
# a per-spawn cost (forkserver round-trip + IPC peer-handshake)
|
||||||
# IPC peer-handshake) that can stack up over `cpus - 1`
|
# that can stack up over `cpus - 1` sequential `n.run_in_actor()`
|
||||||
# sequential `n.run_in_actor()` calls — especially on UDS
|
# calls — especially on UDS under cross-pytest contention
|
||||||
# under cross-pytest contention (#451 / #452). Empirically
|
# (#451 / #452). Empirically a flat 15s flakes on
|
||||||
# 12s flakes on `main_thread_forkserver`; 30s gives
|
# `main_thread_forkserver` for many-cpu hosts (a single bad
|
||||||
# plenty of headroom while still failing-loud on a real
|
# spawn-stack puts total run-time at ~15.5s, just over);
|
||||||
# hang.
|
# 30s gives plenty of headroom while still failing-loud on
|
||||||
from tractor.spawn import _spawn as _spawn_mod
|
# a real hang.
|
||||||
|
#
|
||||||
|
# XXX caveat: this is an *inner* `trio.fail_after` — its
|
||||||
|
# `Cancelled` cannot reach a task parked in a shielded `await`
|
||||||
|
# (e.g. inside actor-nursery teardown). When the in-band cancel
|
||||||
|
# path is itself buggy (the bug-class-3 `raise KBI` swallow we're
|
||||||
|
# currently chasing) this guard does NOT fire and the test sits
|
||||||
|
# forever until external SIGINT. The `_DIAG_CAP_S` outer guard
|
||||||
|
# below is the AFK-safety counterpart.
|
||||||
fail_after_s: int = (
|
fail_after_s: int = (
|
||||||
30
|
4
|
||||||
if _spawn_mod._spawn_method == 'main_thread_forkserver'
|
if is_forking_spawner
|
||||||
else 12
|
else 12
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# outer guard: when the inner fail_after fails to fire because of
|
||||||
|
# a shielded-await deadlock, this cap *aborts the trio run via
|
||||||
|
# signal.alarm → KBI* so AFK runs don't sit for >20min on the
|
||||||
|
# bug-class-3 hang. Slightly larger than `fail_after_s` so the
|
||||||
|
# trio-native path always wins when it works.
|
||||||
|
_DIAG_CAP_S: int = fail_after_s + 5
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
|
# bug-class-3 breadcrumb: tag each level of the cancel path
|
||||||
|
# so when the run hangs and we capture cancel-level logs, the
|
||||||
|
# *last* breadcrumb that fired names the swallow point.
|
||||||
|
test_log.cancel('test_dynamic_pub_sub: enter main()')
|
||||||
|
try:
|
||||||
with trio.fail_after(fail_after_s):
|
with trio.fail_after(fail_after_s):
|
||||||
|
test_log.cancel(
|
||||||
|
f'test_dynamic_pub_sub: '
|
||||||
|
f'enter `trio.fail_after({fail_after_s})` scope'
|
||||||
|
)
|
||||||
|
try:
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
registry_addrs=[reg_addr],
|
registry_addrs=[reg_addr],
|
||||||
debug_mode=debug_mode,
|
debug_mode=debug_mode,
|
||||||
) as n:
|
) as n:
|
||||||
|
test_log.cancel(
|
||||||
|
'test_dynamic_pub_sub: '
|
||||||
|
'actor nursery opened'
|
||||||
|
)
|
||||||
|
|
||||||
# name of this actor will be same as target func
|
# name of this actor will be same as target func
|
||||||
await n.run_in_actor(publisher)
|
await n.run_in_actor(publisher)
|
||||||
|
|
@ -208,8 +240,33 @@ def test_dynamic_pub_sub(
|
||||||
f'Raising user cancel exc: '
|
f'Raising user cancel exc: '
|
||||||
f'{expect_cancel_exc!r}'
|
f'{expect_cancel_exc!r}'
|
||||||
)
|
)
|
||||||
|
test_log.cancel(
|
||||||
|
f'test_dynamic_pub_sub: '
|
||||||
|
f'ABOUT TO RAISE {expect_cancel_exc!r}'
|
||||||
|
)
|
||||||
raise expect_cancel_exc('simulate user cancel!')
|
raise expect_cancel_exc('simulate user cancel!')
|
||||||
|
finally:
|
||||||
|
test_log.cancel(
|
||||||
|
'test_dynamic_pub_sub: '
|
||||||
|
'actor nursery `__aexit__` returned'
|
||||||
|
)
|
||||||
|
test_log.cancel(
|
||||||
|
'test_dynamic_pub_sub: `fail_after` scope exited'
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
test_log.cancel(
|
||||||
|
'test_dynamic_pub_sub: leaving `main()`'
|
||||||
|
)
|
||||||
|
|
||||||
|
# outer signal-based guard — survives a shielded-await deadlock
|
||||||
|
# since `signal.alarm` raises in the main thread regardless of
|
||||||
|
# trio's scope state. ONLY armed under fork-based backends since
|
||||||
|
# the bug we're chasing is MTF-specific.
|
||||||
|
import signal
|
||||||
|
armed_alarm: bool = bool(is_forking_spawner)
|
||||||
|
if armed_alarm:
|
||||||
|
signal.alarm(_DIAG_CAP_S)
|
||||||
|
try:
|
||||||
try:
|
try:
|
||||||
trio.run(main)
|
trio.run(main)
|
||||||
pytest.fail(failed_to_raise_report)
|
pytest.fail(failed_to_raise_report)
|
||||||
|
|
@ -230,6 +287,11 @@ def test_dynamic_pub_sub(
|
||||||
pytest.fail(failed_to_raise_report)
|
pytest.fail(failed_to_raise_report)
|
||||||
|
|
||||||
test_log.exception('Got user-cancel exc AS EXPECTED')
|
test_log.exception('Got user-cancel exc AS EXPECTED')
|
||||||
|
finally:
|
||||||
|
# always disarm so a passing test doesn't get killed
|
||||||
|
# post-trio.run by a stale alarm.
|
||||||
|
if armed_alarm:
|
||||||
|
signal.alarm(0)
|
||||||
|
|
||||||
|
|
||||||
@tractor.context
|
@tractor.context
|
||||||
|
|
@ -361,9 +423,12 @@ def test_sigint_both_stream_types():
|
||||||
resp = await stream.receive()
|
resp = await stream.receive()
|
||||||
assert resp == msg
|
assert resp == msg
|
||||||
raise KeyboardInterrupt
|
raise KeyboardInterrupt
|
||||||
|
|
||||||
|
# TODO, use pytest.raises() here instead?
|
||||||
|
# (why weren't we originally?)
|
||||||
try:
|
try:
|
||||||
trio.run(main)
|
trio.run(main)
|
||||||
assert 0, "Didn't receive KBI!?"
|
pytest.fail("Didn't receive KBI!?")
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue