# tractor/tests/conftest.py
"""
Top level of the testing suites!
"""
from __future__ import annotations
import sys
import subprocess
import os
import signal
import socket
import platform
import time
from pathlib import Path
from typing import Literal
import pytest
import tractor
from tractor._testing import (
examples_dir as examples_dir,
tractor_test as tractor_test,
expect_ctxc as expect_ctxc,
)
# Extra pytest plugins loaded for this suite.
pytest_plugins: list[str] = [
    'pytester',
    # NOTE, now loaded in `pytest-ini` section of `pyproject.toml`
    # 'tractor._testing.pytest',
]

# Truthy when running under a CI provider; NB this is the raw env-var
# string when `CI` is set (not coerced to `bool`), `False` otherwise.
_ci_env: bool = os.environ.get('CI', False)

# `True` on any platform other than Linux (macOS, Windows, ..).
_non_linux: bool = platform.system() != 'Linux'

# Sending signal.SIGINT on subprocess fails on windows. Use CTRL_* alternatives
if platform.system() == 'Windows':
    _KILL_SIGNAL = signal.CTRL_BREAK_EVENT
    _INT_SIGNAL = signal.CTRL_C_EVENT
    # NTSTATUS `STATUS_CONTROL_C_EXIT` (0xC000013A): the exit code a
    # windows proc reports after terminating from a CTRL_C event.
    _INT_RETURN_CODE = 3221225786
    _PROC_SPAWN_WAIT = 2
else:
    _KILL_SIGNAL = signal.SIGKILL
    _INT_SIGNAL = signal.SIGINT
    # posix procs exit with the negated signal number on py3.8+.
    _INT_RETURN_CODE = 1 if sys.version_info < (3, 8) else -signal.SIGINT.value
    # give spawned subprocs extra startup headroom on CI.
    _PROC_SPAWN_WAIT = (
        2 if _ci_env
        else 1
    )

# Skip-marks for platform-specific suites.
no_windows = pytest.mark.skipif(
    platform.system() == "Windows",
    reason="Test is unsupported on windows",
)

no_macos = pytest.mark.skipif(
    platform.system() == "Darwin",
    reason="Test is unsupported on MacOS",
)
def get_cpu_state(
    icpu: int = 0,
    setting: Literal[
        'scaling_governor',
        '*_pstate_max_freq',
        'scaling_max_freq',
        # 'scaling_cur_freq',
    ] = '*_pstate_max_freq',
) -> tuple[
    Path,
    str|int,
]|None:
    '''
    Attempt to read the (first) CPU's setting according
    to the set `setting` from under the file-sys,

    /sys/devices/system/cpu/cpu0/cpufreq/{setting}

    Useful to determine latency headroom for various perf affected
    test suites.

    Returns a `(path, raw-value-str)` pair, or `None` whenever the
    sysfs entry is missing OR unreadable.

    '''
    try:
        # Read governor for core 0 (usually same for all)
        setting_path: Path = list(
            Path(f'/sys/devices/system/cpu/cpu{icpu}/cpufreq/')
            .glob(f'{setting}')
        )[0]  # <- XXX must be single match!
        return (
            setting_path,
            setting_path.read_text().strip(),
        )
    except (
        # XXX fix: catch `OSError` (superset of `FileNotFoundError`)
        # so a `PermissionError` from a hardened/containerized sysfs
        # degrades to `None` instead of crashing test collection.
        OSError,
        IndexError,  # <- no glob match for `setting`
    ):
        return None
def cpu_scaling_factor() -> float:
    '''
    Return a latency-headroom multiplier (>= 1.0) reflecting how
    much to inflate time-limits when CPU-freq scaling is active on
    linux.

    When no scaling info is available (non-linux, missing sysfs),
    returns 1.0 (i.e. no headroom adjustment needed).

    '''
    if _non_linux:
        return 1.

    mx = get_cpu_state()
    cur = get_cpu_state(setting='scaling_max_freq')
    if mx is None or cur is None:
        return 1.

    _mx_pth, max_freq = mx
    _cur_pth, cur_freq = cur
    try:
        cpu_scaled: float = int(cur_freq) / int(max_freq)
    except (ValueError, ZeroDivisionError):
        # non-numeric or zero sysfs content => no usable scaling info.
        return 1.

    if cpu_scaled != 1.:
        # XXX fix: clamp to the documented `>= 1.0` contract; the raw
        # reciprocal would otherwise DEFLATE time-limits (go below 1.)
        # whenever the scaled ratio exceeds 0.5, which is never wanted
        # for latency headroom.
        return max(
            1.,
            1. / (
                cpu_scaled * 2  # <- bc likely "dual threaded"
            ),
        )

    return 1.
def pytest_addoption(
    parser: pytest.Parser,
):
    '''
    Register the `--ll` CLI flag used to set the runtime loglevel
    for the whole test session.

    '''
    # ?TODO? should this be exposed from our `._testing.pytest`
    # plugin or should we make it more explicit with `--tl` for
    # tractor logging like we do in other client projects?
    opt_kwargs: dict = {
        'action': "store",
        'dest': 'loglevel',
        'default': None,
        'help': "logging level to set when testing",
    }
    parser.addoption(
        "--ll",
        **opt_kwargs,
    )
@pytest.fixture(scope='session', autouse=True)
def loglevel(
    request: pytest.FixtureRequest,
) -> str|None:
    '''
    Session-scoped fixture delivering the `--ll` CLI flag's value
    (or `None` when unset) after installing it as `tractor`'s
    default loglevel; the prior default is restored on session
    teardown.

    '''
    import tractor

    # stash the pre-session default so it can be restored below.
    prior_default = tractor.log._default_loglevel

    cli_level: str|None = request.config.option.loglevel
    if cli_level is not None:
        tractor.log._default_loglevel = cli_level
        root_log = tractor.log.get_console_log(
            level=cli_level,
            name='tractor',  # <- enable root logger
        )
        root_log.info(
            f'Test-harness set runtime loglevel: {cli_level!r}\n'
        )

    yield cli_level
    tractor.log._default_loglevel = prior_default
@pytest.fixture(scope='function')
def test_log(
    request: pytest.FixtureRequest,
    loglevel: str,
) -> tractor.log.StackLevelAdapter:
    '''
    Deliver a per test-module-fn logger instance for reporting from
    within actual test bodies/fixtures.

    For example this can be handy to report certain error cases from
    exception handlers using `test_log.exception()`.

    '''
    # name the logger after the module hosting the test fn.
    test_modname: str = request.function.__module__
    base_log = tractor.log.get_logger(
        name=test_modname,  # <- enable root logger
        # pkg_name='tests',
    )
    console_log = tractor.log.get_console_log(
        level=loglevel,
        logger=base_log,
        name=test_modname,
        # pkg_name='tests',
    )
    console_log.debug(
        f'In-test-logging requested\n'
        f'test_log.name: {base_log.name!r}\n'
        f'level: {loglevel!r}\n'
    )
    yield console_log
@pytest.fixture(scope='session')
def ci_env() -> bool:
    '''
    Report whether this test session is running under a CI
    provider, as detected once at module-import time from the
    `CI` env var.

    '''
    return _ci_env
def sig_prog(
    proc: subprocess.Popen,
    sig: int,
    canc_timeout: float = 0.2,
    tries: int = 3,
) -> int:
    '''
    Kill the actor-process with `sig`.

    Prefer to kill with the provided signal and
    failing a `canc_timeout`, send a `SIGKILL`-like
    to ensure termination.

    Returns the (expectedly non-zero) exit code from
    `proc.wait()`.

    '''
    # re-deliver `sig` up to `tries` times, pausing `canc_timeout`
    # between attempts while the proc remains alive; `.send_signal()`
    # is a noop once the proc has terminated.
    for i in range(tries):
        proc.send_signal(sig)
        if proc.poll() is None:
            print(
                f'WARNING, proc still alive after,\n'
                f'canc_timeout={canc_timeout!r}\n'
                f'sig={sig!r}\n'
                f'\n'
                f'{proc.args!r}\n'
            )
            time.sleep(canc_timeout)

    # TODO: why sometimes does SIGINT not work on teardown?
    # seems to happen only when trace logging enabled?
    if proc.poll() is None:
        # still alive after all tries => escalate to hard-kill.
        print(
            f'XXX WARNING KILLING PROG WITH SIGINT XXX\n'
            f'canc_timeout={canc_timeout!r}\n'
            f'{proc.args!r}\n'
        )
        proc.send_signal(_KILL_SIGNAL)

    ret: int = proc.wait()
    assert ret  # <- a clean-exit 0 would mean `sig` wasn't the cause
    # XXX fix: the `-> int` annotation always promised the exit code
    # but the original fn never actually returned it.
    return ret
def _wait_for_daemon_ready(
reg_addr: tuple,
tpt_proto: str,
*,
deadline: float = 10.0,
poll_interval: float = 0.05,
proc: subprocess.Popen|None = None,
) -> None:
'''
Active-poll the daemon's bind address until it
accepts a connection (proving it has called
`bind() + listen()` and is ready to handle IPC).
Replaces the historical blind `time.sleep()` in the
`daemon` fixture which was racy under load — see
`ai/conc-anal/test_register_duplicate_name_daemon_connect_race_issue.md`.
Uses stdlib `socket` directly (no trio runtime
bootstrap cost) — sufficient because
`tractor.run_daemon()` doesn't return from
bootstrap until the runtime is fully ready to
accept IPC.
Raises `TimeoutError` on `deadline` exceeded. If
`proc` is given, ALSO raises early if the daemon
process exits non-zero before the deadline (catches
daemon-startup-crash that the blind sleep used to
silently mask).
'''
end: float = time.monotonic() + deadline
last_exc: Exception|None = None
while time.monotonic() < end:
# Daemon-died-during-startup early-exit. Without
# this, a crashed-on-import daemon would just
# eat the full deadline before raising opaque
# TimeoutError.
if proc is not None and proc.poll() is not None:
raise RuntimeError(
f'Daemon proc exited (rc={proc.returncode}) '
f'before becoming ready to accept on '
f'{reg_addr!r}'
)
try:
if tpt_proto == 'tcp':
# `socket.create_connection` does the
# `socket() + connect()` dance with a
# builtin timeout — perfect primitive
# for a one-shot probe.
with socket.create_connection(
reg_addr,
timeout=poll_interval,
):
return
else:
# UDS — `reg_addr` is a `(filedir, sockname)`
# tuple per `tractor.ipc._uds.UDSAddress.unwrap`.
sockpath: str = os.path.join(*reg_addr)
sock = socket.socket(socket.AF_UNIX)
try:
sock.settimeout(poll_interval)
sock.connect(sockpath)
return
finally:
sock.close()
except (
ConnectionRefusedError,
FileNotFoundError,
OSError,
socket.timeout,
) as exc:
last_exc = exc
time.sleep(poll_interval)
raise TimeoutError(
f'Daemon never accepted on {reg_addr!r} within '
f'{deadline}s (last connect-attempt exc: '
f'{last_exc!r})'
)
# TODO: factor into @cm and move to `._testing`?
@pytest.fixture
def daemon(
    debug_mode: bool,
    loglevel: str,
    testdir: pytest.Pytester,
    reg_addr: tuple[str, int],
    tpt_proto: str,
    ci_env: bool,
    test_log: tractor.log.StackLevelAdapter,
    # set_fork_aware_capture,
) -> subprocess.Popen:
    '''
    Run a daemon root actor as a separate actor-process tree and
    "remote registrar" for discovery-protocol related tests.

    Yields the spawned `subprocess.Popen` once its registry address
    accepts connections; on teardown the daemon is interrupted via
    `sig_prog()` and its exit code verified as a clean-INT exit.

    '''
    # XXX: too much logging will lock up the subproc (smh)
    if loglevel in ('trace', 'debug'):
        test_log.warning(
            f'Test harness log level is too verbose: {loglevel!r}\n'
            f'Reducing to INFO level..'
        )
        loglevel: str = 'info'

    # one-liner daemon bootstrap script run via `python -c`.
    code: str = (
        "import tractor; "
        "tractor.run_daemon([], "
        "registry_addrs={reg_addrs}, "
        "enable_transports={enable_tpts}, "
        "debug_mode={debug_mode}, "
        "loglevel={ll})"
    ).format(
        reg_addrs=str([reg_addr]),
        enable_tpts=str([tpt_proto]),
        ll="'{}'".format(loglevel) if loglevel else None,
        debug_mode=debug_mode,
    )
    cmd: list[str] = [
        sys.executable,
        '-c', code,
    ]
    # breakpoint()
    kwargs = {}
    if platform.system() == 'Windows':
        # without this, tests hang on windows forever
        kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP

    proc: subprocess.Popen = testdir.popen(
        cmd,
        **kwargs,
    )

    # Active-poll the daemon's bind address until it's
    # ready to accept connections — replaces the legacy
    # blind `time.sleep(_PROC_SPAWN_WAIT + uds_bonus)`
    # which was racy under load (see
    # `ai/conc-anal/test_register_duplicate_name_daemon_connect_race_issue.md`).
    #
    # Per-test deadline scales with platform: macOS/CI
    # gets extra headroom; Linux dev boxes need very
    # little.
    deadline: float = (
        15.0 if (_non_linux and ci_env)
        else 10.0
    )
    _wait_for_daemon_ready(
        reg_addr=reg_addr,
        tpt_proto=tpt_proto,
        deadline=deadline,
        proc=proc,
    )
    assert not proc.returncode
    yield proc
    sig_prog(proc, _INT_SIGNAL)

    # XXX! yeah.. just be reaaal careful with this bc sometimes it
    # can lock up on the `_io.BufferedReader` and hang..
    stderr: str = proc.stderr.read().decode()
    stdout: str = proc.stdout.read().decode()
    if (
        stderr
        or
        stdout
    ):
        print(
            f'Daemon actor tree produced output:\n'
            f'{proc.args}\n'
            f'\n'
            f'stderr: {stderr!r}\n'
            f'stdout: {stdout!r}\n'
        )

    # XXX fix: compare against the platform-aware `_INT_RETURN_CODE`
    # (defined at module top for exactly this) instead of a hard-coded
    # `-2` which is the linux-only `-SIGINT`; a clean CTRL_C exit on
    # windows reports NTSTATUS 3221225786 and was mis-flagged before.
    if (rc := proc.returncode) != _INT_RETURN_CODE:
        msg: str = (
            f'Daemon actor tree was not cancelled !?\n'
            f'proc.args: {proc.args!r}\n'
            f'proc.returncode: {rc!r}\n'
        )
        if rc < 0:
            # a different negative rc => killed by an unexpected
            # (non-SIGINT) signal, e.g. the SIGKILL escalation
            # inside `sig_prog()`.
            raise RuntimeError(msg)
        test_log.error(msg)
# @pytest.fixture(autouse=True)
# def shared_last_failed(pytestconfig):
# val = pytestconfig.cache.get("example/value", None)
# breakpoint()
# if val is None:
# pytestconfig.cache.set("example/value", val)
# return val
# TODO: a way to let test scripts (like from `examples/`)
# guarantee they won't `registry_addrs` collide!
# -[ ] maybe use some kinda standard `def main()` arg-spec that
# we can introspect from a fixture that is called from the test
# body?
# -[ ] test and figure out typing for below prototype! Bp
#
# @pytest.fixture
# def set_script_runtime_args(
# reg_addr: tuple,
# ) -> Callable[[...], None]:
# def import_n_partial_in_args_n_triorun(
# script: Path, # under examples?
# **runtime_args,
# ) -> Callable[[], Any]: # a `partial`-ed equiv of `trio.run()`
# # NOTE, below is taken from
# # `.test_advanced_faults.test_ipc_channel_break_during_stream`
# mod: ModuleType = import_path(
# examples_dir() / 'advanced_faults'
# / 'ipc_failure_during_stream.py',
# root=examples_dir(),
# consider_namespace_packages=False,
# )
# return partial(
# trio.run,
# partial(
# mod.main,
# **runtime_args,
# )
# )
# return import_n_partial_in_args_n_triorun