From 5e5d785b8a8f74ffcaea3b5752b40eedc166b2cd Mon Sep 17 00:00:00 2001
From: goodboy
Date: Fri, 12 Jun 2026 13:46:31 -0400
Subject: [PATCH] Add `cpu-perf-check` sustained-throttle gate script
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Standalone CLI companion to `cpu_perf_headroom()` (20cb99ec): idle
freq snapshots LIE — every static knob (`governor`, EPP,
`platform_profile`, `scaling_max_freq`) can read "performance" while a
firmware/EC power cap (AMD PPT/STAPM + friends) clamps the package to
~30% the moment a sustained multi-core load lands, masquerading as a
`trio`-backend deadline-miss "regression" on byte-identical code.

Deats,
- burns every core for `CPU_PERF_SECS` (default 4s) and samples the
  ACHIEVED `scaling_cur_freq` steady-state (post boost-ramp) vs the
  package max ceiling,
- exits 0 when the sustained fraction clears `CPU_PERF_HEALTHY_FRAC`
  (default 0.45), 1 when throttled — so it gates a suite run:
  `scripts/cpu-perf-check && pytest tests/ ...`,
- prints the static knobs first (to show they all read fine) then the
  remediation list on failure (`platform_profile` bounce, USB-C PD
  replug, `ryzenadj`, reboot) w/ the key reminder: do NOT bump test
  budgets — the box is slow, not the code.

(this patch was generated in some part by [`claude-code`][claude-code-gh])
[claude-code-gh]: https://github.com/anthropics/claude-code
---
 scripts/cpu-perf-check | 194 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 194 insertions(+)
 create mode 100755 scripts/cpu-perf-check

diff --git a/scripts/cpu-perf-check b/scripts/cpu-perf-check
new file mode 100755
index 00000000..fee9bd02
--- /dev/null
+++ b/scripts/cpu-perf-check
@@ -0,0 +1,194 @@
+#!/usr/bin/env python3
+# tractor: distributed structured concurrency.
+# Copyright 2018-eternity Tyler Goodlet.
+#
+# SPDX-License-Identifier: AGPL-3.0-or-later
+'''
+`cpu-perf-check` — sustained-load CPU throttle detector.
+
+Idle freq snapshots LIE. A laptop can read
+`governor=performance`, `EPP=performance`,
+`platform_profile=performance`, `scaling_max_freq=`
+and momentarily clock a P-core at 5GHz — while a
+firmware/EC power cap (AMD PPT/STAPM and friends) clamps
+the whole package to ~1.5GHz the instant a sustained
+multi-core load lands. That throttle masquerades as a
+`trio`-backend test *regression*: a wave of `fail_after` /
+`TooSlowError` / `Cancelled(source='deadline')` deadline
+misses on spawn-heavy tests, on byte-identical code that
+was green yesterday.
+
+The existing `tests/conftest.py:cpu_scaling_factor()` only
+reads STATIC `scaling_max_freq` vs `*_pstate_max_freq`, so
+it returns `1.0` (no throttle) during exactly this failure
+— it can't see the cap. This script complements it by
+BURNING every core for a few seconds and sampling the
+ACHIEVED `scaling_cur_freq`, which is the only thing that
+exposes the clamp.
+
+Exit code: `0` if sustained perf looks restored, `1` if
+throttled — so it gates a test run:
+
+    py313/bin/python scripts/cpu-perf-check && pytest tests/ ...
+
+Tunables (env-overridable):
+  CPU_PERF_SECS          load duration (default 4.0)
+  CPU_PERF_HEALTHY_FRAC  sustained/max floor (default 0.45)
+
+'''
+from __future__ import annotations
+import glob
+import os
+import time
+import multiprocessing as mp
+
+
+def _read(path: str) -> str | None:
+    '''
+    Return the stripped text of `path`, or `None` when it
+    can't be opened or read (`OSError`).
+
+    '''
+    try:
+        with open(path) as f:
+            return f.read().strip()
+    except OSError:
+        return None
+
+
+def _cur_freqs_mhz() -> list[int]:
+    '''
+    Current `scaling_cur_freq` of every cpu exposing
+    cpufreq sysfs, floored from kHz to MHz; unreadable or
+    empty entries are skipped.
+
+    '''
+    out: list[int] = []
+    for f in glob.glob(
+        '/sys/devices/system/cpu/cpu[0-9]*/cpufreq/scaling_cur_freq'
+    ):
+        if (v := _read(f)):
+            out.append(int(v) // 1000)
+    return out
+
+
+def _pkg_max_mhz() -> int:
+    '''
+    Highest per-core ceiling across the package — the
+    P-core max on hybrid parts.
+
+    '''
+    mxs: list[int] = []
+    for f in glob.glob(
+        '/sys/devices/system/cpu/cpu[0-9]*/cpufreq/scaling_max_freq'
+    ):
+        if (v := _read(f)):
+            mxs.append(int(v) // 1000)
+    return max(mxs) if mxs else 0
+
+
+def _burn(stop: float) -> None:
+    '''
+    Spin this core on integer math until
+    `time.perf_counter()` reaches `stop`.
+
+    The accumulator is masked to 64 bits every step: left
+    unbounded, the squaring doubles its bit-length per
+    iteration, so one multiply soon runs far past `stop`
+    (and eats RAM) before the deadline is re-checked.
+
+    '''
+    mask: int = 0xFFFF_FFFF_FFFF_FFFF
+    x: int = 1
+    while time.perf_counter() < stop:
+        x = (x + (x * x ^ 0x5)) & mask
+
+
+def main(
+    secs: float = float(os.environ.get('CPU_PERF_SECS', 4.0)),
+    # sustained aggregate must clear this fraction of the
+    # package max-freq ceiling. Throttled (~1.5GHz of 5.1GHz)
+    # ~= 0.29; a healthy all-core load easily clears 0.5.
+    healthy_frac: float = float(
+        os.environ.get('CPU_PERF_HEALTHY_FRAC', 0.45)
+    ),
+) -> int:
+    '''
+    Print the static knobs, load every core for `secs`
+    while sampling the achieved avg freq, then report.
+
+    Returns the process exit code: `0` when the sustained
+    avg is >= `healthy_frac` of the package max (or when
+    cpufreq sysfs / samples are unavailable), `1` when
+    throttled.
+
+    '''
+    if not glob.glob('/sys/devices/system/cpu/cpu0/cpufreq'):
+        print('no cpufreq sysfs (non-linux?) — skipping, assume OK')
+        return 0
+
+    b: str = '/sys/devices/system/cpu/cpu0/cpufreq/'
+    pkg_max: int = _pkg_max_mhz()
+    print('=== static knobs (ALL can read fine while throttled) ===')
+    print(f' governor : {_read(b + "scaling_governor")}')
+    print(f' EPP : {_read(b + "energy_performance_preference")}')
+    print(f' platform_profile : '
+          f'{_read("/sys/firmware/acpi/platform_profile")}')
+    print(f' pkg max freq : {pkg_max} MHz')
+
+    ncpu: int = os.cpu_count() or 1
+    print(f'\n=== sustained {ncpu}-core load ({secs:.0f}s) — the real test ===')
+    # absolute deadline shared with the children; assumes
+    # `perf_counter()` has one timebase across processes
+    # (CLOCK_MONOTONIC on linux) — TODO confirm elsewhere.
+    stop: float = time.perf_counter() + secs
+    procs = [
+        mp.Process(target=_burn, args=(stop,))
+        for _ in range(ncpu)
+    ]
+    for p in procs:
+        p.start()
+
+    # skip the initial ~0.6s ramp, then sample steady-state
+    samples: list[int] = []
+    time.sleep(0.6)
+    while time.perf_counter() < stop - 0.2:
+        if (fr := _cur_freqs_mhz()):
+            samples.append(sum(fr) // len(fr))
+        time.sleep(0.3)
+    for p in procs:
+        p.join()
+
+    if not (samples and pkg_max):
+        print(' could not sample cur freq — assume OK')
+        return 0
+
+    sustained: int = sum(samples) // len(samples)
+    frac: float = sustained / pkg_max
+    print(f' aggregate cur-freq samples: {samples}')
+    print(f' sustained avg : {sustained} MHz '
+          f'({frac * 100:.0f}% of {pkg_max} MHz max)')
+
+    if frac < healthy_frac:
+        print(
+            f'\n ❌ THROTTLED — sustained {sustained}MHz is only '
+            f'{frac * 100:.0f}% of max (< {healthy_frac * 100:.0f}%).\n'
+            f' Power cap (PPT/STAPM) still engaged. Fixes:\n'
+            f' - bounce /sys/firmware/acpi/platform_profile\n'
+            f' (balanced -> performance)\n'
+            f' - unplug/replug USB-C to re-negotiate PD\n'
+            f' - ryzenadj to lift STAPM/PPT\n'
+            f' - else reboot\n'
+            f' Do NOT bump test budgets — the box is slow, not the code.'
+        )
+        return 1
+
+    print(
+        f'\n ✅ PERF OK — sustained {sustained}MHz holds '
+        f'{frac * 100:.0f}% of max. Cap looks lifted; safe to run tests.'
+    )
+    return 0
+
+
+if __name__ == '__main__':
+    raise SystemExit(main())