tractor/tests/conftest.py

"""
Top level of the testing suites!

"""
from __future__ import annotations
import sys
import subprocess
import os
import signal
import platform
import time
from pathlib import Path
from typing import Literal

import pytest
import tractor
from tractor._testing import (
    examples_dir as examples_dir,
    tractor_test as tractor_test,
    expect_ctxc as expect_ctxc,
)

# Extra pytest plugins to activate for this suite.
pytest_plugins: list[str] = [
    'pytester',
    # NOTE, the entry below is intentionally left disabled: that
    # plugin is now loaded from the `pytest-ini` section of
    # `pyproject.toml` instead.
    # 'tractor._testing.pytest',
]

# NOTE, coerce to a real `bool`: `os.environ.get()` hands back the
# raw env-var `str` (eg. `'true'`) whenever `CI` is set. Any
# non-empty value counts as "running under CI".
_ci_env: bool = bool(os.environ.get('CI', False))
_non_linux: bool = platform.system() != 'Linux'

# Sending signal.SIGINT on subprocess fails on windows. Use CTRL_* alternatives
if platform.system() == 'Windows':
    _KILL_SIGNAL = signal.CTRL_BREAK_EVENT
    _INT_SIGNAL = signal.CTRL_C_EVENT
    # 0xC000013A, the exit status windows reports for a ctrl-c'd
    # process.
    _INT_RETURN_CODE = 3221225786
else:
    _KILL_SIGNAL = signal.SIGKILL
    _INT_SIGNAL = signal.SIGINT
    # An un-handled `KeyboardInterrupt` exits "via" SIGINT, so the
    # parent sees the negated signal number. No pre-3.8 fallback is
    # needed since this module already requires py3.8+ syntax (`:=`).
    _INT_RETURN_CODE = -signal.SIGINT.value


# Ready-made skip markers for tests which can't run on a given
# host OS; each pairs the `platform.system()` name to match with
# the reason reported when the test is skipped.
no_windows, no_macos = (
    pytest.mark.skipif(platform.system() == os_name, reason=why)
    for os_name, why in (
        ('Windows', 'Test is unsupported on windows'),
        ('Darwin', 'Test is unsupported on MacOS'),
    )
)


def get_cpu_state(
    icpu: int = 0,
    setting: Literal[
        'scaling_governor',
        '*_pstate_max_freq',
        'scaling_max_freq',
        # 'scaling_cur_freq',
    ] = '*_pstate_max_freq',
) -> tuple[
    Path,
    str|int,
]|None:
    '''
    Attempt to read CPU `icpu`'s (by default the first's) cpufreq
    `setting` from under the file-sys,

    /sys/devices/system/cpu/cpu{icpu}/cpufreq/{setting}

    `setting` is treated as a glob pattern (eg. the default
    `*_pstate_max_freq`); the first matching file is read.

    Returns the pair `(<matched path>, <stripped file content>)`
    or `None` when nothing matches or the file can't be read
    (non-linux host, missing sysfs entry, permission error, ..).

    Useful to determine latency headroom for various perf affected
    test suites.

    '''
    cpufreq_dir = Path(f'/sys/devices/system/cpu/cpu{icpu}/cpufreq/')
    try:
        # resolve the (possibly wildcarded) setting name to
        # a concrete file without materializing every match.
        setting_path: Path|None = next(
            iter(cpufreq_dir.glob(setting)),
            None,
        )
        if setting_path is None:
            return None

        return (
            setting_path,
            setting_path.read_text().strip(),
        )
    # best-effort probe: ANY file-sys level failure (not just
    # a missing file) means "no info available".
    except OSError:
        return None


def cpu_scaling_factor() -> float:
    '''
    Return a latency-headroom multiplier (>= 1.0) reflecting how
    much to inflate time-limits when CPU-freq scaling is active on
    linux.

    The multiplier is derived from the ratio of the currently
    allowed ceiling (`scaling_max_freq`) to the hardware ceiling
    (`*_pstate_max_freq`), both read via `get_cpu_state()`.

    When no scaling info is available (non-linux, missing or
    unparsable sysfs content), returns 1.0 (i.e. no headroom
    adjustment needed).

    '''
    if _non_linux:
        return 1.

    mx = get_cpu_state()
    cur = get_cpu_state(setting='scaling_max_freq')
    if mx is None or cur is None:
        return 1.

    _mx_pth, max_freq = mx
    _cur_pth, cur_freq = cur
    try:
        cpu_scaled: float = int(cur_freq) / int(max_freq)
        factor: float = 1. / (
            cpu_scaled * 2  # <- bc likely "dual threaded"
        )
    except (ValueError, ZeroDivisionError):
        # non-numeric or zeroed freq values carry no usable info.
        return 1.

    # clamp to the documented `>= 1.0` floor: a mildly lowered
    # ceiling (ratio within (0.5, 1)) would otherwise yield
    # a factor < 1 and thus SHRINK the caller's time-limit.
    return max(1., factor)


# Session-wide cache of the sustained-load throttle multiplier:
# measured once (lazily) by the first `cpu_perf_headroom()` call
# and re-used by every later call. `None` means "not yet
# measured".
_sustained_headroom: float|None = None


def _measure_sustained_headroom(
    secs: float = 0.9,
    # a healthy all-core sustained clock holds AT/ABOVE this
    # fraction of the package single-core max ceiling (boost sags
    # under full multi-core load even un-throttled, but not far);
    # at/above it we assume no throttle and return 1.0.
    throttle_gate: float = 0.6,
    max_headroom: float = 4.,
) -> float:
    '''
    One-shot all-core burn returning a latency multiplier
    (>= 1.0) that reflects *sustained-load* CPU throttle.

    Catches the firmware/EC power-cap clamp (AMD PPT/STAPM &
    friends) that pins achieved `scaling_cur_freq` to a fraction
    of the ceiling under multi-core load while EVERY static knob
    (`governor`, `scaling_max_freq`, `EPP`, `platform_profile`)
    still reads "full performance". That cap is INVISIBLE to
    `cpu_scaling_factor()` and is the gremlin behind mass `trio`
    deadline-miss failures on byte-identical code — see
    `scripts/cpu-perf-check`.

    Parameters:
    - `secs`: total duration of the burn.
    - `throttle_gate`: achieved/max freq fraction at or above
      which the CPU is considered un-throttled.
    - `max_headroom`: upper bound on the returned multiplier.

    Best-effort: returns 1.0 on non-linux / missing sysfs / any
    error so it can never break a test run.

    '''
    import glob
    import multiprocessing as mp

    cpufreq_glob: str = '/sys/devices/system/cpu/cpu[0-9]*/cpufreq/'

    def _read_mhz(path: str) -> int|None:
        # the value is divided by 1000 to get (whole) MHz; an
        # unreadable or non-numeric file is skipped (`None`)
        # instead of aborting the whole probe.
        try:
            with open(path) as f:
                return int(f.read()) // 1000
        except (OSError, ValueError):
            return None

    def _all_mhz(setting: str) -> list[int]:
        # every readable per-cpu value of `setting`.
        return [
            v for f in glob.glob(cpufreq_glob + setting)
            if (v := _read_mhz(f)) is not None
        ]

    def _burn(stop: float) -> None:
        x: int = 1
        while time.perf_counter() < stop:
            # keep `x` machine-word sized: an unmasked `x*x`
            # doubles the int's bit-length every pass, so later
            # passes would take ever longer and blow way past
            # `stop` before the deadline is re-checked.
            x = (x * x ^ 0x5) & 0xFFFF_FFFF

    try:
        pkg_max: int = max(_all_mhz('scaling_max_freq'), default=0)
        if not pkg_max:
            return 1.

        # explicit `fork` ctx so we're immune to whatever global
        # mp start-method tractor/the suite may have set (`spawn`
        # would re-exec + re-import once per core — slow and
        # pointless here).
        ctx = mp.get_context('fork')
        ncpu: int = os.cpu_count() or 1
        stop: float = time.perf_counter() + secs
        procs = [
            ctx.Process(target=_burn, args=(stop,), daemon=True)
            for _ in range(ncpu)
        ]
        started: list = []
        samples: list[int] = []
        try:
            for p in procs:
                p.start()
                started.append(p)

            # skip the ~0.4s boost window so we sample the steady
            # state AFTER any power-cap has engaged; never sleep
            # past the end of the sampling window though.
            sample_until: float = stop - 0.1
            warmup: float = min(
                0.4,
                sample_until - time.perf_counter(),
            )
            if warmup > 0:
                time.sleep(warmup)

            while time.perf_counter() < sample_until:
                curs: list[int] = _all_mhz('scaling_cur_freq')
                if curs:
                    samples.append(sum(curs) // len(curs))
                time.sleep(0.15)
        finally:
            # always reap the burners, even if sampling raised.
            for p in started:
                p.join()

        if not samples:
            return 1.
        frac: float = (sum(samples) // len(samples)) / pkg_max
        # below the gate we read it as a power-cap throttle. The
        # spawn/IPC/fork-bound work these budgets guard slows ~1:1
        # with the achieved-vs-max freq ratio, so compensate by the
        # FULL inverse fraction (a boost-discounted factor
        # under-shoots and still trips the marginal cases).
        if frac <= 0 or frac >= throttle_gate:
            return 1.
        # floor at 1.0 so an odd `throttle_gate`/`max_headroom`
        # combo can never shrink a caller's deadline.
        return max(1., min(max_headroom, 1. / frac))

    # deliberate catch-all, this probe must never break a run.
    except Exception:
        return 1.


def cpu_perf_headroom() -> float:
    '''
    Combined latency-headroom multiplier for scaling a test's
    deadline, e.g. `timeout *= cpu_perf_headroom()`.

    Takes the larger of two factors,

      - the static cpu-freq scaling factor from
        `cpu_scaling_factor()` (governor/policy lowered the
        `scaling_max_freq` ceiling).

      - the sustained-load power-cap throttle factor from
        `_measure_sustained_headroom()` (firmware/EC PPT/STAPM
        clamps achieved freq under load while every static knob
        reads "performance"; INVISIBLE to the static check). This
        is the gremlin behind mass `trio` deadline-miss failures
        on unchanged code — see
        `ai/conc-anal/trio_033_cancel_cascade_slowdown_depth3_issue.md`.

    On non-linux hosts only the static factor is consulted.

    The sustained probe is measured on the first call and then
    cached in the module-level `_sustained_headroom` for the rest
    of the session, so only that first call pays for the (by
    default ~0.9s) all-core burn.

    '''
    global _sustained_headroom

    static_factor: float = cpu_scaling_factor()
    if not _non_linux:
        # lazily run (then cache) the one-shot burn measurement.
        if _sustained_headroom is None:
            _sustained_headroom = _measure_sustained_headroom()
        return max(static_factor, _sustained_headroom)

    return static_factor


# NOTE, the `--ll`/`--tl` CLI flags + the `loglevel`, `test_log`
# and `testing_pkg_name` fixtures have been factored into the
# `tractor._testing.pytest` plugin (loaded via the `-p` entry in
# `pyproject.toml`'s `[tool.pytest.ini_options]`) so downstream
# consuming projects (eg. `modden`) inherit them for free. The
# plugin's `testing_pkg_name` fixture defaults to `'tractor'`, so
# this suite keeps treating `--ll` as the runtime loglevel.


@pytest.fixture(scope='session')
def ci_env() -> bool:
    '''
    Detect CI environment: `True` when the `CI` env-var was set
    (to any non-empty value) at import time of this module.

    '''
    # defensively coerce so consumers always get a real `bool` as
    # annotated; a value read straight from `os.environ` is
    # a `str` (eg. `'true'`).
    return bool(_ci_env)


def sig_prog(
    proc: subprocess.Popen,
    sig: int,
    canc_timeout: float = 0.2,
    tries: int = 3,
) -> int:
    '''
    Kill the actor-process with `sig` and return its exit code.

    Deliver `sig` up to `tries` times, each time allowing the
    process `canc_timeout` seconds to exit. If it is still alive
    after the last attempt, escalate to the platform's hard-kill
    signal (`_KILL_SIGNAL`, a `SIGKILL`-like) to ensure
    termination.

    The exit code is asserted to be non-zero, ie. the process is
    expected to have died from the signal.

    '''
    for _ in range(tries):
        proc.send_signal(sig)
        try:
            # actually give the proc time to react BEFORE
            # judging it "still alive".
            proc.wait(timeout=canc_timeout)
        except subprocess.TimeoutExpired:
            print(
                f'WARNING, proc still alive after,\n'
                f'canc_timeout={canc_timeout!r}\n'
                f'sig={sig!r}\n'
                f'\n'
                f'{proc.args!r}\n'
            )
        else:
            # exited, no need to re-signal or escalate.
            break
    else:
        # only reached when no attempt got the proc to exit.
        #
        # TODO: why sometimes does SIGINT not work on teardown?
        # seems to happen only when trace logging enabled?
        if proc.poll() is None:
            print(
                f'XXX WARNING KILLING PROG WITH {_KILL_SIGNAL!r} XXX\n'
                f'canc_timeout={canc_timeout!r}\n'
                f'{proc.args!r}\n'
            )
            proc.send_signal(_KILL_SIGNAL)

    ret: int = proc.wait()
    assert ret
    return ret


# NOTE, the `daemon` fixture (+ its `_wait_for_daemon_ready`
# helper + the post-yield teardown drain logic) has been
# moved to `tests/discovery/conftest.py` since 100% of its
# consumers are discovery-protocol tests now living under
# that subdir. See:
# - `tests/discovery/test_multi_program.py`
# - `tests/discovery/test_registrar.py`
# - `tests/discovery/test_tpt_bind_addrs.py`


# @pytest.fixture(autouse=True)
# def shared_last_failed(pytestconfig):
#     val = pytestconfig.cache.get("example/value", None)
#     breakpoint()
#     if val is None:
#         pytestconfig.cache.set("example/value", val)
#     return val


# TODO: a way to let test scripts (like from `examples/`)
# guarantee their `registry_addrs` won't collide!
# -[ ] maybe use some kinda standard `def main()` arg-spec that
#     we can introspect from a fixture that is called from the test
#     body?
# -[ ] test and figure out typing for below prototype! Bp
#
# @pytest.fixture
# def set_script_runtime_args(
#     reg_addr: tuple,
# ) -> Callable[[...], None]:

#     def import_n_partial_in_args_n_triorun(
#         script: Path,  # under examples?
#         **runtime_args,
#     ) -> Callable[[], Any]:  # a `partial`-ed equiv of `trio.run()`

#         # NOTE, below is taken from
#         # `.test_advanced_faults.test_ipc_channel_break_during_stream`
#         mod: ModuleType = import_path(
#             examples_dir() / 'advanced_faults'
#             / 'ipc_failure_during_stream.py',
#             root=examples_dir(),
#             consider_namespace_packages=False,
#         )
#         return partial(
#             trio.run,
#             partial(
#                 mod.main,
#                 **runtime_args,
#             )
#         )
#     return import_n_partial_in_args_n_triorun