Compare commits

..

No commits in common. "7ee0dc2e8fcd2709832a5900329ca42b46d4b112" and "9bbb6f796bbed4a9e9ae898308ba191118d67621" have entirely different histories.

8 changed files with 82 additions and 284 deletions

View File

@ -57,7 +57,6 @@ from tractor.msg._ops import (
limit_plds,
)
def enc_nsp(obj: Any) -> Any:
actor: Actor = tractor.current_actor(
err_on_no_runtime=False,
@ -618,17 +617,6 @@ def test_ext_types_over_ipc(
debug_mode: bool,
pld_spec: Union[Type],
add_hooks: bool,
set_fork_aware_capture,
# ^^XXX? for forking spawners
# capfd: pytest.CaptureFixture,
# ^^NOTE, super interesting that if
# we disable this below then the tpt-layer
# suffers as an "unclean EOF"??
# ?TODO, determine why/how that mks sense when addressing,
# https://github.com/pytest-dev/pytest/issues/14444
#
):
'''
Ensure we can support extension types coverted using
@ -737,26 +725,18 @@ def test_ext_types_over_ipc(
await p.cancel_actor()
async def fa_main():
with (
trio.fail_after(2),
# ?TODO, investigate? see NOTE above..
# capfd.disabled(),
):
await main()
if (
NamespacePath in pld_types
and
add_hooks
):
trio.run(fa_main)
trio.run(main)
else:
with pytest.raises(
expected_exception=tractor.RemoteActorError,
) as excinfo:
trio.run(fa_main)
trio.run(main)
exc = excinfo.value
# bc `.started(nsp: NamespacePath)` will raise

View File

@ -284,11 +284,6 @@ def test_basic_payload_spec(
return_value: str|None,
started_value: int|PldMsg,
pld_check_started_value: bool,
set_fork_aware_capture,
# ^XXX TODO? for forking spawners, seems to prevent hangs when
# --capture=sys not set, but only for a while then the problem
# accumulates?
):
'''
Validate the most basic `PldRx` msg-type-spec semantics around

View File

@ -147,9 +147,6 @@ def test_dynamic_pub_sub(
test_log: tractor.log.StackLevelAdapter,
reap_subactors_per_test: int,
expect_cancel_exc: Type[BaseException],
is_forking_spawner: bool,
set_fork_aware_capture,
):
failed_to_raise_report: str = (
f'Never got a {expect_cancel_exc!r} ??'
@ -160,138 +157,79 @@ def test_dynamic_pub_sub(
from multiprocessing import cpu_count
cpus = cpu_count()
# Hard safety cap via trio's own cancellation. NOTE see the
# module-level note on why we avoid `pytest-timeout` for this
# test. Picked backend-aware: under `trio` backend spawn is
# cheap (~1s for `cpus` actors) but fork-based backends pay
# a per-spawn cost (forkserver round-trip + IPC peer-handshake)
# that can stack up over `cpus - 1` sequential `n.run_in_actor()`
# calls — especially on UDS under cross-pytest contention
# (#451 / #452). Empirically a flat 15s flakes on
# `main_thread_forkserver` for many-cpu hosts (a single bad
# spawn-stack puts total run-time at ~15.5s, just over);
# 30s gives plenty of headroom while still failing-loud on
# a real hang.
#
# XXX caveat: this is an *inner* `trio.fail_after` — its
# `Cancelled` cannot reach a task parked in a shielded `await`
# (e.g. inside actor-nursery teardown). When the in-band cancel
# path is itself buggy (the bug-class-3 `raise KBI` swallow we're
# currently chasing) this guard does NOT fire and the test sits
# forever until external SIGINT. The `_DIAG_CAP_S` outer guard
# below is the AFK-safety counterpart.
# Hard safety cap via trio's own cancellation — see the
# module-level NOTE on why we avoid `pytest-timeout` for
# this test. Picked backend-aware: under `trio` backend
# spawn is cheap (~1s for `cpus` actors) but fork-based
# backends pay a per-spawn cost (forkserver round-trip +
# IPC peer-handshake) that can stack up over `cpus - 1`
# sequential `n.run_in_actor()` calls — especially on UDS
# under cross-pytest contention (#451 / #452). Empirically
# 12s flakes on `main_thread_forkserver`; 30s gives
# plenty of headroom while still failing-loud on a real
# hang.
from tractor.spawn import _spawn as _spawn_mod
fail_after_s: int = (
4
if is_forking_spawner
30
if _spawn_mod._spawn_method == 'main_thread_forkserver'
else 12
)
# outer guard: when the inner fail_after fails to fire because of
# a shielded-await deadlock, this cap *aborts the trio run via
# signal.alarm → KBI* so AFK runs don't sit for >20min on the
# bug-class-3 hang. Slightly larger than `fail_after_s` so the
# trio-native path always wins when it works.
_DIAG_CAP_S: int = fail_after_s + 5
async def main():
# bug-class-3 breadcrumb: tag each level of the cancel path
# so when the run hangs and we capture cancel-level logs, the
# *last* breadcrumb that fired names the swallow point.
test_log.cancel('test_dynamic_pub_sub: enter main()')
try:
with trio.fail_after(fail_after_s):
test_log.cancel(
f'test_dynamic_pub_sub: '
f'enter `trio.fail_after({fail_after_s})` scope'
)
try:
async with tractor.open_nursery(
registry_addrs=[reg_addr],
debug_mode=debug_mode,
) as n:
test_log.cancel(
'test_dynamic_pub_sub: '
'actor nursery opened'
)
with trio.fail_after(fail_after_s):
async with tractor.open_nursery(
registry_addrs=[reg_addr],
debug_mode=debug_mode,
) as n:
# name of this actor will be same as target func
await n.run_in_actor(publisher)
# name of this actor will be same as target func
await n.run_in_actor(publisher)
for i, sub in zip(
range(cpus - 2),
itertools.cycle(_registry.keys())
):
await n.run_in_actor(
consumer,
name=f'consumer_{sub}',
subs=[sub],
)
# make one dynamic subscriber
await n.run_in_actor(
consumer,
name='consumer_dynamic',
subs=list(_registry.keys()),
)
# block until "cancelled by user"
await trio.sleep(3)
test_log.warning(
f'Raising user cancel exc: '
f'{expect_cancel_exc!r}'
)
test_log.cancel(
f'test_dynamic_pub_sub: '
f'ABOUT TO RAISE {expect_cancel_exc!r}'
)
raise expect_cancel_exc('simulate user cancel!')
finally:
test_log.cancel(
'test_dynamic_pub_sub: '
'actor nursery `__aexit__` returned'
for i, sub in zip(
range(cpus - 2),
itertools.cycle(_registry.keys())
):
await n.run_in_actor(
consumer,
name=f'consumer_{sub}',
subs=[sub],
)
test_log.cancel(
'test_dynamic_pub_sub: `fail_after` scope exited'
)
finally:
test_log.cancel(
'test_dynamic_pub_sub: leaving `main()`'
)
# outer signal-based guard — survives a shielded-await deadlock
# since `signal.alarm` raises in the main thread regardless of
# trio's scope state. ONLY armed under fork-based backends since
# the bug we're chasing is MTF-specific.
import signal
armed_alarm: bool = bool(is_forking_spawner)
if armed_alarm:
signal.alarm(_DIAG_CAP_S)
# make one dynamic subscriber
await n.run_in_actor(
consumer,
name='consumer_dynamic',
subs=list(_registry.keys()),
)
# block until "cancelled by user"
await trio.sleep(3)
test_log.warning(
f'Raising user cancel exc: '
f'{expect_cancel_exc!r}'
)
raise expect_cancel_exc('simulate user cancel!')
try:
try:
trio.run(main)
trio.run(main)
pytest.fail(failed_to_raise_report)
except expect_cancel_exc:
# parent-side raised the user-cancel exc directly and
# it propagated unwrapped; clean path.
test_log.exception('Got user-cancel exc AS EXPECTED')
except BaseExceptionGroup as err:
# under fork-based backends the user-raised cancel
# can race with subactor-side stream teardown
# (`trio.EndOfChannel` from a publisher's `send()`
# whose remote half got cut). The expected exc may
# then be nested deeper in the group rather than at
# the top level. `BaseExceptionGroup.split()` walks
# the exc tree recursively (Python 3.11+).
matched, _ = err.split(expect_cancel_exc)
if matched is None:
pytest.fail(failed_to_raise_report)
except expect_cancel_exc:
# parent-side raised the user-cancel exc directly and
# it propagated unwrapped; clean path.
test_log.exception('Got user-cancel exc AS EXPECTED')
except BaseExceptionGroup as err:
# under fork-based backends the user-raised cancel
# can race with subactor-side stream teardown
# (`trio.EndOfChannel` from a publisher's `send()`
# whose remote half got cut). The expected exc may
# then be nested deeper in the group rather than at
# the top level. `BaseExceptionGroup.split()` walks
# the exc tree recursively (Python 3.11+).
matched, _ = err.split(expect_cancel_exc)
if matched is None:
pytest.fail(failed_to_raise_report)
test_log.exception('Got user-cancel exc AS EXPECTED')
finally:
# always disarm so a passing test doesn't get killed
# post-trio.run by a stale alarm.
if armed_alarm:
signal.alarm(0)
test_log.exception('Got user-cancel exc AS EXPECTED')
@tractor.context
@ -423,12 +361,9 @@ def test_sigint_both_stream_types():
resp = await stream.receive()
assert resp == msg
raise KeyboardInterrupt
# TODO, use pytest.raises() here instead?
# (why weren't we originally?)
try:
trio.run(main)
pytest.fail("Didn't receive KBI!?")
assert 0, "Didn't receive KBI!?"
except KeyboardInterrupt:
pass

View File

@ -77,7 +77,6 @@ async def worker(
@tractor_test
async def test_streaming_to_actor_cluster(
tpt_proto: str,
is_forking_spawner: bool,
):
'''
Open an actor "cluster" using the (experimental) `._clustering`
@ -89,11 +88,7 @@ async def test_streaming_to_actor_cluster(
f'Test currently fails with tpt-proto={tpt_proto!r}\n'
)
delay: float = (
10 if is_forking_spawner
else 6
)
with trio.fail_after(delay):
with trio.fail_after(6):
async with (
open_actor_cluster(modules=[__name__]) as portals,

View File

@ -188,16 +188,9 @@ def test_simple_context(
pointlessly_open_stream,
reg_addr: tuple,
debug_mode: bool,
is_forking_spawner: bool,
):
timeout: float = 1.5
# windows and forking-spawner both have "slower but more
# deterministic" cancel teardown.
if platform.system() == 'Windows':
timeout = 4
elif is_forking_spawner:
timeout = 3
timeout = 1.5 if not platform.system() == 'Windows' else 4
async def main():

View File

@ -45,12 +45,6 @@ from tractor._testing import expect_ctxc
# `test_legacy_one_way_streaming`, etc.).
pytestmark = pytest.mark.usefixtures(
'reap_subactors_per_test',
# NOTE, asyncio cancel cascade has historically
# triggered both UDS sockfile leaks (SIGKILL path)
# AND the trio `WakeupSocketpair.drain()` busy-loop
# — see `test_aio_simple_error`'s history.
'track_orphaned_uds_per_test',
'detect_runaway_subactors_per_test',
)
@ -58,7 +52,6 @@ pytestmark = pytest.mark.usefixtures(
scope='module',
)
def delay(debug_mode: bool) -> int:
return 1e3
if debug_mode:
return 999
else:
@ -833,19 +826,13 @@ async def trio_to_aio_echo_server(
@pytest.mark.parametrize(
'raise_error_mid_stream',
[
False,
Exception,
KeyboardInterrupt,
],
[False, Exception, KeyboardInterrupt],
ids='raise_error={}'.format,
)
def test_echoserver_detailed_mechanics(
reg_addr: tuple[str, int],
debug_mode: bool,
raise_error_mid_stream,
is_forking_spawner: bool,
):
async def main():
async with tractor.open_nursery(
@ -893,34 +880,12 @@ def test_echoserver_detailed_mechanics(
# is cancelled by kbi or out of task cancellation
await p.cancel_actor()
# NOTE: under fork-based backends the cancel-cascade
# path is structurally slower than `trio`'s subproc-exec
# (per-spawn forkserver-handshake compounds during
# teardown). Bump the cap so cross-test contamination
# doesn't flake this — see
# `ai/conc-anal/cancel_cascade_too_slow_under_main_thread_forkserver_issue.md`.
timeout: float = (
999 if tractor.debug_mode()
else 4 if is_forking_spawner
else 1
)
with_timeout: bool = (
True
# False
)
async def fa_main():
if with_timeout:
with trio.fail_after(timeout):
await main()
else:
await main()
if raise_error_mid_stream:
with pytest.raises(raise_error_mid_stream):
trio.run(fa_main)
trio.run(main)
else:
trio.run(fa_main)
trio.run(main)
@tractor.context
@ -1073,7 +1038,7 @@ def test_sigint_closes_lifetime_stack(
bg_aio_task: bool,
trio_side_is_shielded: bool,
send_sigint_to: str,
is_forking_spawner: bool,
start_method: str,
):
'''
Ensure that an infected child can use the `Actor.lifetime_stack`
@ -1088,14 +1053,6 @@ def test_sigint_closes_lifetime_stack(
if debug_mode
else 1
)
# pre-init so the `except (KeyboardInterrupt, ContextCancelled)`
# handler below doesn't `UnboundLocalError` if KBI fires BEFORE
# we ever enter the `as (ctx, first)` body (e.g. when
# `p.open_context().__aenter__` is hung waiting for the
# subactor's `StartAck` due to a fork-child IPC race —
# see `dynamic_pub_sub_spawn_time_transport_close_under_mtf_issue.md`).
tmp_file: Path|None = None
ctx: tractor.Context|None = None
try:
an: tractor.ActorNursery
async with tractor.open_nursery(
@ -1121,7 +1078,7 @@ def test_sigint_closes_lifetime_stack(
) as (ctx, first):
path_str, cpid = first
tmp_file = Path(path_str)
tmp_file: Path = Path(path_str)
assert tmp_file.exists()
# XXX originally to simulate what (hopefully)
@ -1172,7 +1129,7 @@ def test_sigint_closes_lifetime_stack(
if (
send_sigint_to == 'child'
and
is_forking_spawner
start_method == 'main_thread_forkserver'
):
pytest.xfail(
reason=(
@ -1199,21 +1156,6 @@ def test_sigint_closes_lifetime_stack(
KeyboardInterrupt,
ContextCancelled,
):
# If we got here BEFORE entering the ctx body (e.g.
# spawn-time IPC race hung `open_context.__aenter__` and
# the AFK-guard `signal.alarm` fired KBI from outside the
# trio loop), `tmp_file`/`ctx` are still `None` — surface
# that fact directly instead of `UnboundLocalError`.
if tmp_file is None:
pytest.fail(
'KBI/ctxc fired BEFORE `p.open_context()` returned '
"the child's `started` value — likely fork-child "
'IPC race; see '
'`ai/conc-anal/'
'dynamic_pub_sub_spawn_time_transport_close_'
'under_mtf_issue.md`'
)
# XXX CASE 2: without the bug fixed, in the
# KBI-raised-in-parent case, the actor teardown should
# never get run (silently abaondoned by `asyncio`..) and
@ -1221,32 +1163,7 @@ def test_sigint_closes_lifetime_stack(
assert not tmp_file.exists()
assert ctx.maybe_error
# outer signal-based AFK-safety guard. mirrors the pattern in
# `tests/test_advanced_streaming.py::test_dynamic_pub_sub`: when
# the in-band trio cancel path doesn't fire (e.g. parent is
# parked in a shielded `await` inside actor-nursery teardown, or
# `open_context.__aenter__` hangs waiting for a child's
# `StartAck` that never comes), `signal.alarm` raises KBI in the
# main thread regardless of trio's scope state. This caps the
# absolute wall-clock so an AFK run can't sit for an hour on a
# forkserver-launchpad-contamination hang. Only armed under fork-
# based backends since the bug class is MTF-specific.
_AFK_CAP_S: int = (
999 if debug_mode
else 10
)
armed_alarm: bool = (
not debug_mode
and
is_forking_spawner
)
if armed_alarm:
signal.alarm(_AFK_CAP_S)
try:
trio.run(main)
finally:
if armed_alarm:
signal.alarm(0)
trio.run(main)

View File

@ -26,30 +26,14 @@ from tractor._testing import (
from .conftest import cpu_scaling_factor
pytestmark = [
pytest.mark.skipon_spawn_backend(
'subint',
reason=(
'XXX SUBINT GIL-CONTENTION HANGING TEST XXX\n'
'Inter-peer cancel cascades under '
'`--spawn-backend=subint` trip the abandoned-subint '
'GIL-hostage class — see\n'
' - `ai/conc-anal/subint_sigint_starvation_issue.md` '
'(GIL-hostage, SIGINT-unresponsive)\n'
' - `ai/conc-anal/subint_cancel_delivery_hang_issue.md` '
'(sibling: parent parks on dead chan)\n'
' - https://github.com/goodboy/tractor/issues/379 '
'(subint umbrella)\n'
)
),
# NOTE, inter-peer cancellation tests stress the
# multi-actor cancel cascade which under SIGKILL
# leaves UDS sock-files orphaned. Track per-test
# for blame attribution.
pytest.mark.usefixtures(
'track_orphaned_uds_per_test',
),
]
pytestmark = pytest.mark.skipon_spawn_backend(
'subint',
reason=(
'XXX SUBINT GIL-CONTENTION HANGING TEST XXX\n'
'See oustanding issue(s)\n'
# TODO, put issue link!
)
)
# XXX TODO cases:
# - [x] WE cancelled the peer and thus should not see any raised

View File

@ -653,7 +653,6 @@ def pytest_generate_tests(
# scope='module',
# )
def _is_forking_spawner(
start_method: str,
) -> bool:
@ -671,7 +670,7 @@ def is_forking_spawner(
Is the `pytest` run using a `fork()`ing process spawning-backend?
'''
return _is_forking_spawner(start_method)
return _is_forking_spawner
def maybe_xfail_for_spawner(