From 7b518fe4e1fec061e5997f595cbd51e4be0a8172 Mon Sep 17 00:00:00 2001 From: goodboy Date: Thu, 18 Jun 2026 11:32:14 -0400 Subject: [PATCH] Make `cpu_scaling_factor()` CI-aware for timing tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GH Actions (and most shared) CI runners are slow + noisy and — unlike a throttled local box — don't expose CPU-freq scaling via sysfs, so `cpu_scaling_factor()` read `1.0` and the timing-sensitive deadlines/asserts that key off it got NO headroom on CI (a class of `TooSlowError` / `assert diff < this_fast` flakes). - add a flat `_ci_env` x2 bump inside `cpu_scaling_factor()` so every test already using it (quad streaming, SIGINT-cancel, docs examples, ...) gets CI headroom for free — compounds with any local-throttle factor. - route the `time_quad_ex` cancel-deadline through it instead of a bespoke per-test `ci_env` bump. - fix a real bug in `test_nested_multierrors`: its OUTER `@tractor_test(timeout=10)` was *smaller* than the inner `fail_after_w_trace` budget (trio depth=3 = 12s), so the outer wall fired first and pre-empted the snapshot-capturing inner deadline -> `FAILED` instead of dumping. Bump the outer to `40` (> the max *unscaled* inner budget of 30s) and scale the inner budgets by `cpu_scaling_factor()` too. Verified locally with `CI=1` (quad + both `test_nested_multierrors` depths + `test_cancel_via_SIGINT_other_task` green). 
(this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code --- tests/conftest.py | 46 ++++++++++++++++---------- tests/test_cancellation.py | 22 ++++++++---- tests/test_legacy_one_way_streaming.py | 6 ++++ 3 files changed, 50 insertions(+), 24 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 7d9f01b8..28a7ef64 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -96,28 +96,38 @@ def cpu_scaling_factor() -> float: much to inflate time-limits when CPU-freq scaling is active on linux. - When no scaling info is available (non-linux, missing sysfs), - returns 1.0 (i.e. no headroom adjustment needed). + When no local scaling info is available (non-linux, missing + sysfs) the base factor is 1.0; a flat CI bump is then applied + on top (see below). ''' - if _non_linux: - return 1. + factor: float = 1. + if not _non_linux: + mx = get_cpu_state() + cur = get_cpu_state(setting='scaling_max_freq') + if ( + mx is not None + and + cur is not None + ): + _mx_pth, max_freq = mx + _cur_pth, cur_freq = cur + cpu_scaled: float = int(cur_freq) / int(max_freq) + if cpu_scaled != 1.: + factor = 1. / ( + cpu_scaled * 2 # <- bc likely "dual threaded" + ) - mx = get_cpu_state() - cur = get_cpu_state(setting='scaling_max_freq') - if mx is None or cur is None: - return 1. + # XXX, GH Actions (and most shared) CI runners are slow + noisy + # and — unlike a throttled local box — do NOT expose CPU-freq + # scaling via sysfs, so the probe above reads 1.0 and adds no + # headroom. Apply a flat CI bump so every timing-test deadline + # /assert that keys off this factor gets headroom on CI HW + # (compounds with any local-throttle factor). + if _ci_env: + factor *= 2 - _mx_pth, max_freq = mx - _cur_pth, cur_freq = cur - cpu_scaled: float = int(cur_freq) / int(max_freq) - - if cpu_scaled != 1.: - return 1. / ( - cpu_scaled * 2 # <- bc likely "dual threaded" - ) - - return 1. 
+ return factor # NOTE, the `--ll`/`--tl` CLI flags + the `loglevel`, `test_log` diff --git a/tests/test_cancellation.py b/tests/test_cancellation.py index f35d7bc0..371d70ad 100644 --- a/tests/test_cancellation.py +++ b/tests/test_cancellation.py @@ -515,14 +515,18 @@ async def spawn_and_error( ids='depth={}'.format, ) @tractor_test( - # bumped from the 30s default to cover fork-based - # cancel-cascade flakes; 2 spawners × 2 errorers × depth 1+ - # cascade through 6 portal-wait_for_result paths each - # paying `terminate_after=1.6s` + UDS sock-unlink under - # MTF/UDS contention can easily blow past 30s. + # XXX this OUTER `trio.fail_after` wall MUST exceed the + # largest INNER `fail_after_w_trace()` budget set in the body + # below (max = the MTF depth=3 == 30s case, further scaled by + # `cpu_scaling_factor()` on CI/throttle). Otherwise it fires + # FIRST and pre-empts the inner snapshot-capturing deadline, + # turning a graceful `TooSlowError`+ptree-dump into an opaque + # outer timeout-kill (the prior `timeout=10` did exactly this + # — it was *smaller* than the 12s trio depth=3 budget, so the + # depth-3 case `FAILED` on slow CI instead of dumping). # Trio backend is fast and won't notice the extra budget. # See `ai/conc-anal/cancel_cascade_too_slow_under_main_thread_forkserver_issue.md`. - timeout=10, + timeout=40, ) async def test_nested_multierrors( reg_addr: tuple, @@ -632,6 +636,12 @@ async def test_nested_multierrors( case ('main_thread_forkserver', 3): timeout = 30 + # headroom for CPU-freq scaling AND/OR slow CI so the inner + # snapshot-capturing budget doesn't fire spuriously on a + # sluggish runner; see `cpu_scaling_factor()`. 
+ from .conftest import cpu_scaling_factor + timeout *= cpu_scaling_factor() + async with fail_after_w_trace(timeout): try: async with tractor.open_nursery() as nursery: diff --git a/tests/test_legacy_one_way_streaming.py b/tests/test_legacy_one_way_streaming.py index d3787e97..648ba1b8 100644 --- a/tests/test_legacy_one_way_streaming.py +++ b/tests/test_legacy_one_way_streaming.py @@ -326,6 +326,12 @@ def time_quad_ex( ): timeout += 1 + # inflate the cancel-deadline for CPU-freq scaling AND/OR CI + # latency (see `cpu_scaling_factor()`) so the example isn't + # cancelled mid-stream on a throttled/CI runner. + from .conftest import cpu_scaling_factor + timeout *= cpu_scaling_factor() + start: float = time.time() results: list[int] = trio.run(partial( cancel_after,