Make `cpu_scaling_factor()` CI-aware for timing tests
GH Actions (and most shared) CI runners are slow + noisy and — unlike a throttled local box — don't expose CPU-freq scaling via sysfs, so `cpu_scaling_factor()` read `1.0` and the timing-sensitive deadlines/asserts that key off it got NO headroom on CI (a class of `TooSlowError` / `assert diff < this_fast` flakes).

- add a flat `_ci_env` x2 bump inside `cpu_scaling_factor()` so every test already using it (quad streaming, SIGINT-cancel, docs examples, ...) gets CI headroom for free — compounds with any local-throttle factor.
- route the `time_quad_ex` cancel-deadline through it instead of a bespoke per-test `ci_env` bump.
- fix a real bug in `test_nested_multierrors`: its OUTER `@tractor_test(timeout=10)` was *smaller* than the inner `fail_after_w_trace` budget (trio depth=3 = 12s), so the outer wall fired first and pre-empted the snapshot-capturing inner deadline -> `FAILED` instead of dumping. Bump the outer to `40` (> max inner budget) and scale the inner budgets by `cpu_scaling_factor()` too.

Verified locally with `CI=1` (quad + both `test_nested_multierrors` depths + `test_cancel_via_SIGINT_other_task` green).

(this patch was generated in some part by [`claude-code`][claude-code-gh])

[claude-code-gh]: https://github.com/anthropics/claude-code

trionics_start_or_cancel
parent
b0ac681245
commit
7b518fe4e1
|
|
@ -96,28 +96,38 @@ def cpu_scaling_factor() -> float:
|
||||||
much to inflate time-limits when CPU-freq scaling is active on
|
much to inflate time-limits when CPU-freq scaling is active on
|
||||||
linux.
|
linux.
|
||||||
|
|
||||||
When no scaling info is available (non-linux, missing sysfs),
|
When no local scaling info is available (non-linux, missing
|
||||||
returns 1.0 (i.e. no headroom adjustment needed).
|
sysfs) the base factor is 1.0; a flat CI bump is then applied
|
||||||
|
on top (see below).
|
||||||
|
|
||||||
'''
|
'''
|
||||||
if _non_linux:
|
factor: float = 1.
|
||||||
return 1.
|
if not _non_linux:
|
||||||
|
mx = get_cpu_state()
|
||||||
|
cur = get_cpu_state(setting='scaling_max_freq')
|
||||||
|
if (
|
||||||
|
mx is not None
|
||||||
|
and
|
||||||
|
cur is not None
|
||||||
|
):
|
||||||
|
_mx_pth, max_freq = mx
|
||||||
|
_cur_pth, cur_freq = cur
|
||||||
|
cpu_scaled: float = int(cur_freq) / int(max_freq)
|
||||||
|
if cpu_scaled != 1.:
|
||||||
|
factor = 1. / (
|
||||||
|
cpu_scaled * 2 # <- bc likely "dual threaded"
|
||||||
|
)
|
||||||
|
|
||||||
mx = get_cpu_state()
|
# XXX, GH Actions (and most shared) CI runners are slow + noisy
|
||||||
cur = get_cpu_state(setting='scaling_max_freq')
|
# and — unlike a throttled local box — do NOT expose CPU-freq
|
||||||
if mx is None or cur is None:
|
# scaling via sysfs, so the probe above reads 1.0 and adds no
|
||||||
return 1.
|
# headroom. Apply a flat CI bump so every timing-test deadline
|
||||||
|
# /assert that keys off this factor gets headroom on CI HW
|
||||||
|
# (compounds with any local-throttle factor).
|
||||||
|
if _ci_env:
|
||||||
|
factor *= 2
|
||||||
|
|
||||||
_mx_pth, max_freq = mx
|
return factor
|
||||||
_cur_pth, cur_freq = cur
|
|
||||||
cpu_scaled: float = int(cur_freq) / int(max_freq)
|
|
||||||
|
|
||||||
if cpu_scaled != 1.:
|
|
||||||
return 1. / (
|
|
||||||
cpu_scaled * 2 # <- bc likely "dual threaded"
|
|
||||||
)
|
|
||||||
|
|
||||||
return 1.
|
|
||||||
|
|
||||||
|
|
||||||
# NOTE, the `--ll`/`--tl` CLI flags + the `loglevel`, `test_log`
|
# NOTE, the `--ll`/`--tl` CLI flags + the `loglevel`, `test_log`
|
||||||
|
|
|
||||||
|
|
@ -515,14 +515,18 @@ async def spawn_and_error(
|
||||||
ids='depth={}'.format,
|
ids='depth={}'.format,
|
||||||
)
|
)
|
||||||
@tractor_test(
|
@tractor_test(
|
||||||
# bumped from the 30s default to cover fork-based
|
# XXX this OUTER `trio.fail_after` wall MUST exceed the
|
||||||
# cancel-cascade flakes; 2 spawners × 2 errorers × depth 1+
|
# largest INNER `fail_after_w_trace()` budget set in the body
|
||||||
# cascade through 6 portal-wait_for_result paths each
|
# below (max = the MTF depth=3 == 30s case, further scaled by
|
||||||
# paying `terminate_after=1.6s` + UDS sock-unlink under
|
# `cpu_scaling_factor()` on CI/throttle). Otherwise it fires
|
||||||
# MTF/UDS contention can easily blow past 30s.
|
# FIRST and pre-empts the inner snapshot-capturing deadline,
|
||||||
|
# turning a graceful `TooSlowError`+ptree-dump into an opaque
|
||||||
|
# outer timeout-kill (the prior `timeout=10` did exactly this
|
||||||
|
# — it was *smaller* than the 12s trio depth=3 budget, so the
|
||||||
|
# depth-3 case `FAILED` on slow CI instead of dumping).
|
||||||
# Trio backend is fast and won't notice the extra budget.
|
# Trio backend is fast and won't notice the extra budget.
|
||||||
# See `ai/conc-anal/cancel_cascade_too_slow_under_main_thread_forkserver_issue.md`.
|
# See `ai/conc-anal/cancel_cascade_too_slow_under_main_thread_forkserver_issue.md`.
|
||||||
timeout=10,
|
timeout=40,
|
||||||
)
|
)
|
||||||
async def test_nested_multierrors(
|
async def test_nested_multierrors(
|
||||||
reg_addr: tuple,
|
reg_addr: tuple,
|
||||||
|
|
@ -632,6 +636,12 @@ async def test_nested_multierrors(
|
||||||
case ('main_thread_forkserver', 3):
|
case ('main_thread_forkserver', 3):
|
||||||
timeout = 30
|
timeout = 30
|
||||||
|
|
||||||
|
# headroom for CPU-freq scaling AND/OR slow CI so the inner
|
||||||
|
# snapshot-capturing budget doesn't fire spuriously on a
|
||||||
|
# sluggish runner; see `cpu_scaling_factor()`.
|
||||||
|
from .conftest import cpu_scaling_factor
|
||||||
|
timeout *= cpu_scaling_factor()
|
||||||
|
|
||||||
async with fail_after_w_trace(timeout):
|
async with fail_after_w_trace(timeout):
|
||||||
try:
|
try:
|
||||||
async with tractor.open_nursery() as nursery:
|
async with tractor.open_nursery() as nursery:
|
||||||
|
|
|
||||||
|
|
@ -326,6 +326,12 @@ def time_quad_ex(
|
||||||
):
|
):
|
||||||
timeout += 1
|
timeout += 1
|
||||||
|
|
||||||
|
# inflate the cancel-deadline for CPU-freq scaling AND/OR CI
|
||||||
|
# latency (see `cpu_scaling_factor()`) so the example isn't
|
||||||
|
# cancelled mid-stream on a throttled/CI runner.
|
||||||
|
from .conftest import cpu_scaling_factor
|
||||||
|
timeout *= cpu_scaling_factor()
|
||||||
|
|
||||||
start: float = time.time()
|
start: float = time.time()
|
||||||
results: list[int] = trio.run(partial(
|
results: list[int] = trio.run(partial(
|
||||||
cancel_after,
|
cancel_after,
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue