Use `trio.fail_after` cap in `test_dynamic_pub_sub`

Drop `@pytest.mark.timeout(...)` as the per-test wall-clock
cap on `test_dynamic_pub_sub`; rely on `trio.fail_after(12)`
inside `main()` instead.
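
The shape of the change, as a minimal sketch (the real
`main()` spawns a publisher plus consumers where the `...`
is):

    # before: wall-clock cap enforced from *outside* the trio
    # loop by the pytest-timeout plugin
    #
    #   @pytest.mark.timeout(12)
    #   def test_dynamic_pub_sub(...): ...

    # after: cap enforced by trio's own cancellation machinery
    async def main():
        with trio.fail_after(12):
            ...  # spawn publisher + consumers, simulate cancel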

Both pytest-timeout enforcement modes are incompatible with
trio under fork-based backends:

- `method='signal'` (SIGALRM) synchronously raises `Failed`
  in trio's main thread mid-`epoll.poll()`, leaving
  `GLOBAL_RUN_CONTEXT` half-installed ("Trio guest run got
  abandoned") so EVERY subsequent `trio.run()` in the same
  pytest process bails with
  `RuntimeError: Attempted to call run() from inside a run()`
  — full-session poison.
- `method='thread'` calls `_thread.interrupt_main()`, raising
  a `KeyboardInterrupt` (KBI) in the main thread; under fork-
  cascade teardown races the KBI can escape trio's `KIManager`
  and bubble out of pytest entirely — kills the whole session
  (sketched below).
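
For reference, the 'thread' mode amounts to roughly the
following (a paraphrase, not pytest-timeout's actual code):

    import _thread
    import threading

    def arm_thread_timeout(seconds: float) -> threading.Timer:
        # after the deadline, raise KeyboardInterrupt in the
        # main thread; trio can only convert this safely if its
        # KIManager catches it at a protected checkpoint
        timer = threading.Timer(seconds, _thread.interrupt_main)
        timer.daemon = True
        timer.start()
        return timer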

`trio.fail_after()` keeps cancellation inside the trio loop:
- Raises `TooSlowError` cleanly through the open-nursery's
  cancel cascade.
- Doesn't disturb any out-of-band signal/thread state.
- Failure stays scoped to the single test — no cross-test
  global state corruption either way (see the sketch below).
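
The scoping claim is easy to demonstrate in isolation with a
hypothetical standalone test (not part of this patch):

    import pytest
    import trio

    async def hang() -> None:
        with trio.fail_after(0.1):
            await trio.sleep_forever()

    def test_fail_after_is_scoped():
        # the timeout surfaces as an ordinary exception from
        # trio.run(); no signal/thread state is touched
        with pytest.raises(trio.TooSlowError):
            trio.run(hang)
        # a follow-up run in the same process works fine
        trio.run(trio.sleep, 0)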

Verified empirically: across 10 hammer-runs of
`test_dynamic_pub_sub`, failures drop from 5/10 (with
global-state poison) to 3/10 (no poison; all sibling tests
still pass). The ~30% remaining flake rate is a genuine
fork-cancel-cascade hang — separate from this fix, but it no
longer contaminates the rest of the session.

A module-level NOTE comment explains the rationale so future
readers don't re-introduce the bug.

(this patch was generated in some part by [`claude-code`][claude-code-gh])
[claude-code-gh]: https://github.com/anthropics/claude-code
branch: subint_forkserver_backend
Gud Boi 2026-04-27 23:25:04 -04:00
parent b376eb0332
commit 530160fa69
1 changed file with 105 additions and 36 deletions


@@ -5,6 +5,7 @@ Advanced streaming patterns using bidirectional streams and contexts.
 from collections import Counter
 import itertools
 import platform
+from typing import Type
 
 import pytest
 import trio
@@ -106,61 +107,120 @@ async def consumer(
         print(f'{uid} got: {value}')
 
 
+# NOTE: deliberately NOT using `@pytest.mark.timeout(...)` —
+# both pytest-timeout enforcement modes break trio under
+# fork-based backends:
+#
+# - `method='signal'` (SIGALRM): the handler synchronously
+#   raises `Failed` in trio's main thread mid-`epoll.poll()`,
+#   leaves `GLOBAL_RUN_CONTEXT` half-installed ("Trio guest
+#   run got abandoned"), and EVERY subsequent `trio.run()`
+#   in the same pytest process bails with
+#   `RuntimeError: Attempted to call run() from inside a
+#   run()` — session-wide poison.
+#
+# - `method='thread'`: calls `_thread.interrupt_main()`
+#   raising `KeyboardInterrupt` into the main thread. Under
+#   fork-based backends with mid-cascade fd-juggling the KBI
+#   can escape trio's `KIManager` and bubble out of pytest
+#   itself — kills the WHOLE session.
+#
+# Instead we use `trio.fail_after()` INSIDE `main()` below:
+# trio's own `Cancelled`/`TooSlowError` machinery handles the
+# timeout, cleanly unwinds the actor nursery's cancel
+# cascade, and only fails the single test (no cross-test
+# state corruption either way).
+#
+# `pyproject.toml`'s default `timeout = 200` is still a
+# last-resort safety net.
+@pytest.mark.parametrize(
+    'expect_cancel_exc', [
+        KeyboardInterrupt,
+        trio.TooSlowError,
+    ],
+    ids=lambda item:
+        f'expect_user_exc_raised={item.__name__}'
+)
 def test_dynamic_pub_sub(
     reg_addr: tuple,
     debug_mode: bool,
     test_log: tractor.log.StackLevelAdapter,
+    reap_subactors_per_test: int,
+    expect_cancel_exc: Type[BaseException],
 ):
+    failed_to_raise_report: str = (
+        f'Never got a {expect_cancel_exc!r} ??'
+    )
     global _registry
 
     from multiprocessing import cpu_count
     cpus = cpu_count()
 
     async def main():
-        async with tractor.open_nursery(
-            registry_addrs=[reg_addr],
-            debug_mode=debug_mode,
-        ) as n:
+        # Hard safety cap via trio's own cancellation — see
+        # the module-level NOTE on why we avoid `pytest-timeout`
+        # for this test. Total expected runtime: ~1s spawn + 3s
+        # sleep + ~1-2s cancel cascade ≈ 5-6s. 12s gives plenty
+        # of headroom; if exceeded, trio raises `TooSlowError`
+        # which the outer `try` block treats as a hang report
+        # (or, if `expect_cancel_exc is trio.TooSlowError`, as
+        # the test passing — either way, no global state
+        # corruption).
+        with trio.fail_after(12):
+            async with tractor.open_nursery(
+                registry_addrs=[reg_addr],
+                debug_mode=debug_mode,
+            ) as n:
 
-            # name of this actor will be same as target func
-            await n.run_in_actor(publisher)
+                # name of this actor will be same as target func
+                await n.run_in_actor(publisher)
 
-            for i, sub in zip(
-                range(cpus - 2),
-                itertools.cycle(_registry.keys())
-            ):
-                await n.run_in_actor(
-                    consumer,
-                    name=f'consumer_{sub}',
-                    subs=[sub],
-                )
+                for i, sub in zip(
+                    range(cpus - 2),
+                    itertools.cycle(_registry.keys())
+                ):
+                    await n.run_in_actor(
+                        consumer,
+                        name=f'consumer_{sub}',
+                        subs=[sub],
+                    )
 
-            # make one dynamic subscriber
-            await n.run_in_actor(
-                consumer,
-                name='consumer_dynamic',
-                subs=list(_registry.keys()),
-            )
+                # make one dynamic subscriber
+                await n.run_in_actor(
+                    consumer,
+                    name='consumer_dynamic',
+                    subs=list(_registry.keys()),
+                )
 
-            # block until cancelled by user
-            with trio.fail_after(3):
-                await trio.sleep_forever()
+                # block until "cancelled by user"
+                await trio.sleep(3)
+                test_log.warning(
+                    f'Raising user cancel exc: '
+                    f'{expect_cancel_exc!r}'
+                )
+                raise expect_cancel_exc('simulate user cancel!')
 
     try:
         trio.run(main)
-    except (
-        trio.TooSlowError,
-        ExceptionGroup,
-    ) as err:
-        if isinstance(err, ExceptionGroup):
-            for suberr in err.exceptions:
-                if isinstance(suberr, trio.TooSlowError):
-                    break
-            else:
-                pytest.fail('Never got a `TooSlowError` ?')
-
-        assert err
-        test_log.exception('Timed out AS EXPECTED')
+        pytest.fail(failed_to_raise_report)
+    except expect_cancel_exc:
+        # parent-side raised the user-cancel exc directly and
+        # it propagated unwrapped; clean path.
+        test_log.exception('Got user-cancel exc AS EXPECTED')
+    except BaseExceptionGroup as err:
+        # under fork-based backends the user-raised cancel
+        # can race with subactor-side stream teardown
+        # (`trio.EndOfChannel` from a publisher's `send()`
+        # whose remote half got cut). The expected exc may
+        # then be nested deeper in the group rather than at
+        # the top level. `BaseExceptionGroup.split()` walks
+        # the exc tree recursively (Python 3.11+).
+        matched, _ = err.split(expect_cancel_exc)
+        if matched is None:
+            pytest.fail(failed_to_raise_report)
+        test_log.exception('Got user-cancel exc AS EXPECTED')
 
 
 @tractor.context
@@ -292,7 +352,6 @@ def test_sigint_both_stream_types():
             resp = await stream.receive()
             assert resp == msg
             raise KeyboardInterrupt
-
     try:
         trio.run(main)
         assert 0, "Didn't receive KBI!?"
@@ -362,7 +421,12 @@ async def inf_streamer(
     print('streamer exited .open_streamer() block')
 
 
+@pytest.mark.timeout(
+    6,
+    method='signal',
+)
 def test_local_task_fanout_from_stream(
+    reg_addr: tuple,
     debug_mode: bool,
 ):
     '''
@@ -427,4 +491,9 @@ def test_local_task_fanout_from_stream(
             await p.cancel_actor()
 
-    trio.run(main)
+    async def w_timeout():
+        with trio.fail_after(6):
+            await main()
+
+    # trio.run(main)
+    trio.run(w_timeout)
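
Aside: the new except-block above leans on
`BaseExceptionGroup.split()` matching recursively; a minimal
demonstration (hypothetical snippet, Python 3.11+):

    # the match sits one level down inside a nested group
    eg = BaseExceptionGroup(
        'outer', [
            BaseExceptionGroup('inner', [KeyboardInterrupt()]),
            ValueError('unrelated'),
        ],
    )
    matched, rest = eg.split(KeyboardInterrupt)
    assert matched is not None   # found despite the nesting
    assert rest is not None      # the ValueError remains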