Add `cpu-perf-check` sustained-throttle gate script
Standalone CLI companion to `cpu_perf_headroom()` (20cb99ec): idle
freq snapshots LIE — every static knob (`governor`, EPP,
`platform_profile`, `scaling_max_freq`) can read "performance"
while a firmware/EC power cap (AMD PPT/STAPM + friends) clamps the
package to ~30% the moment a sustained multi-core load lands,
masquerading as a `trio`-backend deadline-miss "regression" on
byte-identical code.
Details:
- burns every core for `CPU_PERF_SECS` (default 4s) and samples the
ACHIEVED `scaling_cur_freq` steady-state (post boost-ramp) vs the
package max ceiling,
- exits 0 when the sustained fraction clears
`CPU_PERF_HEALTHY_FRAC` (default 0.45), 1 when throttled — so it
gates a suite run: `scripts/cpu-perf-check && pytest tests/ ...`,
- prints the static knobs first (to show they all read fine) then
the remediation list on failure (`platform_profile` bounce, USB-C
PD replug, `ryzenadj`, reboot) w/ the key reminder: do NOT bump
test budgets — the box is slow, not the code.
(this patch was generated in some part by [`claude-code`][claude-code-gh])
[claude-code-gh]: https://github.com/anthropics/claude-code
test_cpu_throttling
parent
20cb99ecd4
commit
5e5d785b8a
|
|
@ -0,0 +1,159 @@
|
|||
#!/usr/bin/env python3
|
||||
# tractor: distributed structured concurrency.
|
||||
# Copyright 2018-eternity Tyler Goodlet.
|
||||
#
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
'''
|
||||
`cpu-perf-check` — sustained-load CPU throttle detector.
|
||||
|
||||
Idle freq snapshots LIE. A laptop can read
|
||||
`governor=performance`, `EPP=performance`,
|
||||
`platform_profile=performance`, `scaling_max_freq=<full>`
|
||||
and momentarily clock a P-core at 5GHz — while a
|
||||
firmware/EC power cap (AMD PPT/STAPM and friends) clamps
|
||||
the whole package to ~1.5GHz the instant a sustained
|
||||
multi-core load lands. That throttle masquerades as a
|
||||
`trio`-backend test *regression*: a wave of `fail_after` /
|
||||
`TooSlowError` / `Cancelled(source='deadline')` deadline
|
||||
misses on spawn-heavy tests, on byte-identical code that
|
||||
was green yesterday.
|
||||
|
||||
The existing `tests/conftest.py:cpu_scaling_factor()` only
|
||||
reads STATIC `scaling_max_freq` vs `*_pstate_max_freq`, so
|
||||
it returns `1.0` (no throttle) during exactly this failure
|
||||
— it can't see the cap. This script complements it by
|
||||
BURNING every core for a few seconds and sampling the
|
||||
ACHIEVED `scaling_cur_freq`, which is the only thing that
|
||||
exposes the clamp.
|
||||
|
||||
Exit code: `0` if sustained perf looks restored, `1` if
|
||||
throttled — so it gates a test run:
|
||||
|
||||
py313/bin/python scripts/cpu-perf-check && pytest tests/ ...
|
||||
|
||||
Tunables (env-overridable):
|
||||
CPU_PERF_SECS load duration (default 4.0)
|
||||
CPU_PERF_HEALTHY_FRAC sustained/max floor (default 0.45)
|
||||
|
||||
'''
|
||||
from __future__ import annotations
|
||||
import glob
|
||||
import os
|
||||
import time
|
||||
import multiprocessing as mp
|
||||
|
||||
|
||||
def _read(path: str) -> str | None:
    '''
    Best-effort read of a small text file (e.g. a sysfs knob).

    Returns the file's content with surrounding whitespace
    stripped, or `None` when the file can not be opened, read
    or decoded — callers treat `None` as "knob unknown", never
    as a fatal error.

    '''
    try:
        # explicit encoding so decoding does not depend on the
        # caller's locale.
        with open(path, encoding='utf-8') as f:
            return f.read().strip()
    except (OSError, ValueError):
        # `OSError`: missing / unreadable / is-a-directory.
        # `ValueError`: undecodable bytes (`UnicodeDecodeError`)
        # or an embedded NUL in `path`.
        return None
|
||||
|
||||
|
||||
def _cur_freqs_mhz() -> list[int]:
    '''
    Snapshot `scaling_cur_freq` for every `cpuN` that exposes
    one under sysfs.

    Each raw value is integer-divided by 1000 (sysfs reports
    kHz, so the result is MHz). Entries that can't be read or
    read back empty are skipped, so the list may be shorter
    than the CPU count — or empty.

    '''
    pattern: str = (
        '/sys/devices/system/cpu/cpu[0-9]*/cpufreq/scaling_cur_freq'
    )
    raw_vals = (_read(path) for path in glob.glob(pattern))
    return [int(raw) // 1000 for raw in raw_vals if raw]
|
||||
|
||||
|
||||
def _pkg_max_mhz() -> int:
    '''
    Largest per-cpu `scaling_max_freq` found under sysfs, in
    MHz — i.e. the P-core ceiling on hybrid parts.

    Returns `0` when no cpu exposes a readable value.

    '''
    pattern: str = (
        '/sys/devices/system/cpu/cpu[0-9]*/cpufreq/scaling_max_freq'
    )
    ceilings: list[int] = [
        int(raw) // 1000
        for path in glob.glob(pattern)
        if (raw := _read(path))
    ]
    return max(ceilings, default=0)
|
||||
|
||||
|
||||
def _burn(stop: float) -> None:
    '''
    Spin the calling process on integer math until
    `time.perf_counter()` reaches `stop`.

    `stop` is an absolute `time.perf_counter()` deadline; a
    deadline already in the past returns immediately.
    NOTE(review): when run in a child process this assumes
    `perf_counter()` shares its reference point with the
    parent that computed `stop` — confirm for the platform and
    start-method in use.

    '''
    mask: int = 0xFFFF_FFFF
    x: int = 1
    while time.perf_counter() < stop:
        # Keep `x` word-sized. Unmasked, `x += x * x ^ 0x5`
        # roughly doubles the bit-length of `x` every pass, so
        # after a few dozen iterations each one is a huge
        # big-int squaring: allocation heavy, and — since the
        # deadline is only checked between iterations — able to
        # overrun `stop` by seconds.
        x = (x + (x * x ^ 0x5)) & mask
|
||||
|
||||
|
||||
def main(
    secs: float = float(os.environ.get('CPU_PERF_SECS', 4.0)),
    # sustained aggregate must clear this fraction of the
    # package max-freq ceiling. Throttled (~1.5GHz of 5.1GHz)
    # ~= 0.29; a healthy all-core load easily clears 0.5.
    healthy_frac: float = float(
        os.environ.get('CPU_PERF_HEALTHY_FRAC', 0.45)
    ),
) -> int:
    '''
    Run the sustained-load throttle check and return a process
    exit code.

    Steps:
    - bail out with `0` when cpu0 has no cpufreq sysfs dir,
    - print the static knobs (governor, EPP, platform profile,
      package max freq),
    - start one `_burn()` process per `os.cpu_count()` sharing
      a single `secs`-from-now deadline,
    - after a 0.6s ramp, sample the mean per-cpu current freq
      every 0.3s until 0.2s before the deadline,
    - compare the mean of those samples to the package max.

    Returns `1` when the sustained/max fraction is below
    `healthy_frac`, otherwise `0` — including when nothing
    could be sampled. Both defaults are read from the
    environment once, when the function is defined.

    '''
    cpufreq_dir: str = '/sys/devices/system/cpu/cpu0/cpufreq'
    if not glob.glob(cpufreq_dir):
        print('no cpufreq sysfs (non-linux?) — skipping, assume OK')
        return 0

    knob_base: str = cpufreq_dir + '/'
    ceiling_mhz: int = _pkg_max_mhz()

    # (label, sysfs path) for each static knob; every value is
    # read right before it is printed.
    knobs: tuple[tuple[str, str], ...] = (
        (' governor : ', knob_base + 'scaling_governor'),
        (' EPP : ', knob_base + 'energy_performance_preference'),
        (' platform_profile : ', '/sys/firmware/acpi/platform_profile'),
    )
    print('=== static knobs (ALL can read fine while throttled) ===')
    for label, knob_path in knobs:
        print(f'{label}{_read(knob_path)}')
    print(f' pkg max freq : {ceiling_mhz} MHz')

    n_workers: int = os.cpu_count() or 1
    print(
        f'\n=== sustained {n_workers}-core load ({secs:.0f}s)'
        f' — the real test ==='
    )

    # one shared absolute deadline for every burner.
    deadline: float = time.perf_counter() + secs
    workers: list[mp.Process] = []
    for _ in range(n_workers):
        workers.append(mp.Process(target=_burn, args=(deadline,)))
    for worker in workers:
        worker.start()

    # let the boost ramp settle, then sample steady-state and
    # stop a little short of the deadline so no sample lands
    # after the load has ended.
    time.sleep(0.6)
    sample_until: float = deadline - 0.2
    samples: list[int] = []
    while time.perf_counter() < sample_until:
        freqs: list[int] = _cur_freqs_mhz()
        if freqs:
            samples.append(sum(freqs) // len(freqs))
        time.sleep(0.3)

    for worker in workers:
        worker.join()

    if not samples or not ceiling_mhz:
        print(' could not sample cur freq — assume OK')
        return 0

    sustained: int = sum(samples) // len(samples)
    frac: float = sustained / ceiling_mhz
    pct: str = f'{frac * 100:.0f}'
    print(f' aggregate cur-freq samples: {samples}')
    print(
        f' sustained avg : {sustained} MHz ({pct}% of {ceiling_mhz} MHz max)'
    )

    throttled: bool = frac < healthy_frac
    if not throttled:
        print(
            f'\n ✅ PERF OK — sustained {sustained}MHz holds {pct}% of max.'
            f' Cap looks lifted; safe to run tests.'
        )
        return 0

    floor_pct: str = f'{healthy_frac * 100:.0f}'
    report: str = (
        f'\n ❌ THROTTLED — sustained {sustained}MHz is only '
        f'{pct}% of max (< {floor_pct}%).\n'
        ' Power cap (PPT/STAPM) still engaged. Fixes:\n'
        ' - bounce /sys/firmware/acpi/platform_profile\n'
        ' (balanced -> performance)\n'
        ' - unplug/replug USB-C to re-negotiate PD\n'
        ' - ryzenadj to lift STAPM/PPT\n'
        ' - else reboot\n'
        ' Do NOT bump test budgets — the box is slow, not the code.'
    )
    print(report)
    return 1
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # surface `main()`'s verdict as the process exit status so
    # the script can gate a following command with `&&`.
    exit_code: int = main()
    raise SystemExit(exit_code)
|
||||
Loading…
Reference in New Issue