# tractor: structured concurrent "actors".
# Copyright 2018-eternity Tyler Goodlet.

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

'''
Zombie-subactor reaper — SC-polite (SIGINT first, SIGKILL
as last resort with a bounded grace window) plus optional
`/dev/shm/` orphan-segment sweep.

Shared implementation between the `tractor-reap` CLI
(`scripts/tractor-reap`) and the pytest session-scoped
auto-fixture that guards the test suite against leftover
subactor processes.

Design notes — process reap
---------------------------

- Linux-only today: reads `/proc/<pid>/{status,cwd,cmdline}`.
  Module imports cleanly elsewhere; calling `find_*` on a
  non-Linux box returns an empty list (no `/proc`
  enumeration). A future xplatform pass could swap this
  for `psutil.Process.children()` /
  `psutil.process_iter()` since `psutil` is already a
  test-time dependency.

- Two detection modes:

  1. **descendant-mode** — when invoked from a still-live
     parent (e.g. a pytest session-end fixture), match by
     `PPid == parent_pid`. Direct + precise; the target
     PIDs are still parented to the live pytest process
     at teardown time, before pytest exits.

  2. **orphan-mode** — when invoked after the parent died
     (e.g. the `tractor-reap` CLI run post-Ctrl+C), match
     by `PPid == 1` (reparented to init) AND the proc's
     intrinsic tractor markers (`tractor[` or
     `tractor._child` in its cmdline, else `tractor[` in
     its `comm`). The marker check is what keeps the
     heuristic from sweeping up unrelated init-children
     on the box; the older cwd-based filter is no longer
     used.

- Escalation: for every matched PID, SIGINT, poll for up
  to `grace` seconds, then SIGKILL any survivors. The
  two-phase pattern is the SC-graceful-cancel discipline
  documented in `feedback_sc_graceful_cancel_first.md` —
  we want the subactor runtime to run its trio cancel
  shield + IPC teardown paths where it can.

Design notes — shm sweep
------------------------

Since `tractor/ipc/_mp_bs.disable_mantracker()` turns off
`mp.resource_tracker` entirely, a hard-crashing actor can
leave `/dev/shm/<key>` segments behind that nothing else
GCs (see
`ai/conc-anal/subint_forkserver_mp_shared_memory_issue.md`,
"Trade-offs / known gaps").

The shm sweep is **Linux-/FreeBSD-only**: both expose
POSIX shared-memory segments as regular files under
`/dev/shm`, so `os.stat()` + `os.unlink()` are the
correct primitives. macOS POSIX shm has no fs-visible
path (segments live behind `shm_open`/`shm_unlink`
syscalls only), and Windows is a different story
entirely. Calling the shm helpers on an unsupported
platform raises `NotImplementedError`.

In-use enumeration delegates to `psutil` —
`Process.memory_maps()` (post-mmap) +
`Process.open_files()` (pre-mmap shm-opened fds) —
xplatform, mature, and handles the per-process
permission/race edge cases correctly. Segments matching
neither are genuinely leaked → safe to unlink.

The "nobody has it open" check is the kernel-canonical
test — same answer `lsof /dev/shm/<key>` would give. No
reliance on tractor-specific naming conventions (shm
keys are caller-defined).

'''
from __future__ import annotations

import os
import pathlib
import re
import signal
import stat
import sys
import time

# `/dev/shm` is the POSIX-shm filesystem on Linux + FreeBSD.
# macOS uses `shm_open` syscalls without a fs-visible path,
# so the shm helpers refuse to run there.
#
# Computed once at import time from `sys.platform`; read by
# `_ensure_shm_supported()`, which every shm helper in this
# module calls before doing any work.
_SHM_PLATFORM_OK: bool = sys.platform.startswith(
    ('linux', 'freebsd')
)
# Default directory scanned by the shm helpers; both
# `_enumerate_in_use_shm()` and `find_orphaned_shm()` accept
# a `shm_dir` keyword to point somewhere else.
SHM_DIR: str = '/dev/shm'

# UDS-socket leak sweep — see `find_orphaned_uds()` /
# `reap_uds()` below. Tractor's UDS transport
# (`tractor.ipc._uds`) creates sock files under
# `${XDG_RUNTIME_DIR}/tractor/<name>@<pid>.sock`; a
# crash / SIGKILL / mid-cancel teardown can leave the
# file behind because `os.unlink()` lives in the
# `_serve_ipc_eps` `finally:` block which doesn't always
# get to run on hard exits. The reaper here is best-effort
# cleanup for the test harness + the `tractor-reap` CLI.
#
# Sub-directory name joined onto `${XDG_RUNTIME_DIR}` by
# `get_uds_dir()`.
_UDS_SUBDIR: str = 'tractor'
# `<actor-name>@<pid>.sock` — pid is the binder's pid at
# creation time. Special sentinel: `registry@1616.sock`
# uses the magic `1616` not a real pid (the root
# registrar's known address; see `UDSAddress.get_root`).
#
# NOTE: the `name` group is greedy (`.+`), so for a
# filename containing several `@`s only the trailing
# `@<digits>.sock` suffix is parsed as the pid.
_UDS_NAME_RE: re.Pattern = re.compile(
    r'^(?P<name>.+)@(?P<pid>\d+)\.sock$'
)
# Sock files whose parsed pid equals this value are always
# reported as orphaned by `find_orphaned_uds()` — no
# liveness probe is attempted for them.
_UDS_REGISTRY_SENTINEL_PID: int = 1616


def _ensure_shm_supported() -> None:
    '''
    Platform gate shared by every shm helper.

    Returns silently when `_SHM_PLATFORM_OK` is set (i.e.
    `sys.platform` starts with `linux` or `freebsd`);
    otherwise raises `NotImplementedError` with a message
    naming the offending platform.

    '''
    if _SHM_PLATFORM_OK:
        return

    # assembled up-front so the `raise` stays a one-liner
    reason: str = (
        'shm reap is only supported on Linux/FreeBSD; '
        f'got sys.platform={sys.platform!r}. macOS '
        'POSIX shm has no fs-visible path; Windows '
        'has no /dev/shm equivalent.'
    )
    raise NotImplementedError(reason)


def _read_status_ppid(pid: int) -> int | None:
    '''
    Return the parent-pid from `/proc/<pid>/status` or
    `None` if the proc went away / is unreadable.

    '''
    try:
        with open(f'/proc/{pid}/status') as f:
            for line in f:
                if line.startswith('PPid:'):
                    return int(line.split()[1])
    except (
        FileNotFoundError,
        PermissionError,
        ProcessLookupError,
    ):
        return None
    return None


def _read_cwd(pid: int) -> str | None:
    try:
        return os.readlink(f'/proc/{pid}/cwd')
    except (
        FileNotFoundError,
        PermissionError,
        ProcessLookupError,
    ):
        return None


def _read_cmdline(pid: int) -> str:
    try:
        with open(f'/proc/{pid}/cmdline', 'rb') as f:
            return f.read().replace(b'\0', b' ').decode(
                errors='replace',
            )
    except (
        FileNotFoundError,
        PermissionError,
        ProcessLookupError,
    ):
        return ''


def _read_comm(pid: int) -> str:
    '''
    Read `/proc/<pid>/comm` — the kernel's per-task name
    (truncated to ~15 bytes on Linux). Set by
    `setproctitle.setproctitle()` so this is one of the
    most reliable identifiers for tractor sub-actors —
    notably, **survives zombie state** (kernel preserves
    `comm` even after exit, until reaped) where
    `cmdline`/`environ` may not.

    '''
    try:
        with open(f'/proc/{pid}/comm') as f:
            return f.read().rstrip('\n')
    except (
        FileNotFoundError,
        PermissionError,
        ProcessLookupError,
    ):
        return ''


# Intrinsic markers that identify a tractor sub-actor
# regardless of cwd / venv path / launch context. Used by
# `_is_tractor_subactor()` below.
#
# - cmdline `tractor[`: matches the `setproctitle`-set form
#   (`tractor[<aid.reprol()>]`) — set in
#   `_actor_child_main` for ALL backends, mutates argv via
#   libc so visible in `/proc/<pid>/cmdline`.
# - cmdline `tractor._child`: matches the legacy
#   `python -m tractor._child --uid (...)` form. Catches
#   procs that died before `_actor_child_main` got to call
#   `setproctitle()` — argv from exec is still kernel-
#   visible at that point.
# - comm `tractor[`: same proctitle-set form, but visible
#   via `/proc/<pid>/comm` (kernel-truncated to ~15 bytes,
#   `tractor[doggy:`). Critical for ZOMBIES — kernel
#   preserves `comm` past task-exit until parent reaps,
#   while `cmdline` for zombies often reads as empty.
#
# NOTE: all markers are applied as plain SUBSTRING tests
# (`marker in text`), not anchored prefix matches.
_TRACTOR_PROC_CMDLINE_MARKERS: tuple[str, ...] = (
    'tractor._child',
    'tractor[',
)
_TRACTOR_PROC_COMM_MARKER: str = 'tractor['


def _is_tractor_subactor(pid: int) -> bool:
    '''
    Decide whether `pid` looks like a tractor sub-actor,
    using only signals **intrinsic** to the process — no
    cwd / venv-path coupling and no env-var dependency.

    Checks, in priority order:

    1. `/proc/<pid>/cmdline` contains any of
       `_TRACTOR_PROC_CMDLINE_MARKERS` — i.e. the
       `setproctitle`-set `tractor[<aid>]` form or the
       legacy `python -m tractor._child` form (the latter
       covers procs that died before `setproctitle` ran).
    2. `/proc/<pid>/comm` contains
       `_TRACTOR_PROC_COMM_MARKER` — the zombie-resilient
       fallback, since a zombie's `cmdline` often reads
       empty while its `comm` survives until `wait()`.

    '''
    argv_text: str = _read_cmdline(pid)
    for marker in _TRACTOR_PROC_CMDLINE_MARKERS:
        if marker in argv_text:
            return True

    # only consulted when the cmdline gave no match
    return _TRACTOR_PROC_COMM_MARKER in _read_comm(pid)


def _iter_live_pids() -> list[int]:
    '''
    Enumerate currently-alive pids from `/proc`. Returns
    `[]` on systems without `/proc` (e.g. macOS).

    '''
    try:
        entries: list[str] = os.listdir('/proc')
    except OSError:
        return []
    return [int(e) for e in entries if e.isdigit()]


def find_descendants(
    parent_pid: int,
) -> list[int]:
    '''
    Collect the DIRECT children of `parent_pid`: every
    live pid whose `/proc/<pid>/status` reports
    `PPid == parent_pid`. Grand-children are not followed.

    Intended for the pytest session-end fixture, where
    `parent_pid` is the still-running pytest process.

    '''
    children: list[int] = []
    for candidate in _iter_live_pids():
        if _read_status_ppid(candidate) == parent_pid:
            children.append(candidate)
    return children


def find_runaway_subactors(
    parent_pid: int,
    *,
    cpu_threshold: float = 95.0,
    sample_interval: float = 0.05,
    only_pids: set[int]|None = None,
) -> list[tuple[int, float, str]]:
    '''
    Report children of `parent_pid` (as found by
    `find_descendants()`) that are burning CPU at or above
    `cpu_threshold` percent — the signature of a runaway
    tight-loop bug (e.g. a C-level `recvfrom` loop on a
    closed socket that missed EOF detection — see
    `ai/conc-anal/trio_wakeup_socketpair_busy_loop_under_fork_issue.md`).

    Each hit is a `(pid, cpu_pct, cmdline)` tuple where
    `cmdline` is the proc's argv joined with spaces.

    Sampling uses `psutil`'s
    `Process.cpu_percent(interval=sample_interval)`, a
    blocking call, once per candidate and one after the
    other — so total cost grows with the number of
    candidates. The 50ms default is long enough to
    register a sustained ~100% loop while staying cheap
    enough for a per-test fixture.

    `only_pids` narrows the candidates to a
    pre-snapshotted set (e.g. "pids spawned during this
    test"); `None` means check every child. An EMPTY
    `only_pids` short-circuits to `[]` before `psutil` is
    imported or `/proc` is walked.

    Also returns `[]` when `psutil` isn't importable or
    nothing matches. Procs that vanish or deny access
    mid-sample are skipped.

    '''
    restrict: bool = only_pids is not None
    if restrict and not only_pids:
        # caller told us there is nothing to look at
        return []

    try:
        import psutil
    except ImportError:
        return []

    suspects: list[int] = [
        pid
        for pid in find_descendants(parent_pid)
        if not restrict or pid in only_pids
    ]
    if not suspects:
        return []

    hot: list[tuple[int, float, str]] = []
    for pid in suspects:
        try:
            handle = psutil.Process(pid)
            pct: float = handle.cpu_percent(
                interval=sample_interval,
            )
            if pct >= cpu_threshold:
                argv: str = ' '.join(handle.cmdline())
                hot.append((pid, pct, argv))
        except (
            psutil.NoSuchProcess,
            psutil.AccessDenied,
        ):
            # raced with exit, or not ours to inspect
            continue
    return hot


def _read_status_state(pid: int) -> str | None:
    '''
    Return the single-letter task state from
    `/proc/<pid>/status` (`R`/`S`/`D`/`Z`/`T`/`X`/`I`) or
    `None` if unreadable. `Z` = zombie.

    '''
    try:
        with open(f'/proc/{pid}/status') as f:
            for line in f:
                if line.startswith('State:'):
                    # `State:\tZ (zombie)` -> 'Z'
                    parts = line.split()
                    if len(parts) >= 2:
                        return parts[1]
    except (
        FileNotFoundError,
        PermissionError,
        ProcessLookupError,
    ):
        return None
    return None


def find_orphans(
    repo_root: pathlib.Path|None = None,
) -> list[int]:
    '''
    Collect pids that have been reparented to init
    (`PPid == 1`) AND pass `_is_tractor_subactor()`'s
    intrinsic checks (cmdline → comm).

    `repo_root` is accepted purely for back-compat: an
    older implementation filtered by cwd, but sub-actor
    identity is now taken from the proc itself, so the
    argument is ignored.

    '''
    # intentionally unused — kept so existing callers that
    # still pass it don't break
    del repo_root

    return [
        pid
        for pid in _iter_live_pids()
        if _read_status_ppid(pid) == 1
        and _is_tractor_subactor(pid)
    ]


def find_zombies(
    parent_pid: int|None = None,
) -> list[int]:
    '''
    Collect pids whose `/proc/<pid>/status` state is `Z`
    (zombie) and which `_is_tractor_subactor()` recognizes
    as tractor sub-actors.

    With `parent_pid` set, only zombies whose
    `PPid == parent_pid` (direct children) are kept — the
    usual pytest session-end case. With `None`, every
    zombie on the box is considered.

    Identification of zombies leans mostly on
    `/proc/<pid>/comm`, which the kernel preserves after
    exit, since `cmdline`/`environ` are usually empty by
    then.

    '''
    return [
        pid
        for pid in _iter_live_pids()
        if _read_status_state(pid) == 'Z'
        and (
            parent_pid is None
            or _read_status_ppid(pid) == parent_pid
        )
        and _is_tractor_subactor(pid)
    ]


def reap(
    pids: list[int],
    *,
    grace: float = 3.0,
    poll: float = 0.25,
    log=print,
) -> tuple[list[int], list[int]]:
    '''
    Deliver SIGINT to each pid, wait up to `grace`
    seconds for them to exit, then SIGKILL any that
    survive.

    Parameters:

    - `pids`: target process ids; an empty list is a
      no-op.
    - `grace`: max seconds to wait between the SIGINT and
      the SIGKILL escalation.
    - `poll`: seconds between liveness checks during the
      grace window (each sleep is capped so the window is
      never overshot).
    - `log`: callable taking one message string for
      user-visible progress lines — default `print`;
      pytest fixture swaps it for a `pytest`-friendly
      writer.

    Returns `(signalled, survivors_killed)` so callers
    can report / assert. `signalled` only holds pids the
    SIGINT was actually delivered to: pids that had
    already exited, or that we are not permitted to
    signal, are left out (the latter are reported through
    `log`) instead of aborting the reap of the remaining
    pids.

    '''
    if not pids:
        return ([], [])

    signalled: list[int] = []
    for pid in pids:
        try:
            os.kill(pid, signal.SIGINT)
        except ProcessLookupError:
            # raced — already gone
            continue
        except PermissionError as exc:
            # not ours to signal (e.g. owned by another
            # user); keep going so one foreign pid can't
            # block cleanup of all the others.
            log(
                f'[tractor-reap] cannot signal {pid}: '
                f'{exc!r}'
            )
            continue
        signalled.append(pid)

    if not signalled:
        # nothing to wait for — skip the poll sleep
        return ([], [])

    log(
        f'[tractor-reap] SIGINT → {len(signalled)} '
        f'proc(s): {signalled}'
    )

    deadline: float = time.monotonic() + grace
    while (remaining := deadline - time.monotonic()) > 0:
        # never sleep past the deadline, even if
        # `poll > grace`
        time.sleep(min(poll, remaining))
        if not any(_is_alive(pid) for pid in signalled):
            return (signalled, [])

    survivors: list[int] = [
        pid for pid in signalled if _is_alive(pid)
    ]
    if survivors:
        log(
            f'[tractor-reap] SIGKILL (after {grace}s '
            f'grace) → {survivors}'
        )
        for pid in survivors:
            try:
                os.kill(pid, signal.SIGKILL)
            except ProcessLookupError:
                # exited between the liveness check and
                # the kill
                pass
            except PermissionError as exc:
                log(
                    f'[tractor-reap] cannot signal {pid}: '
                    f'{exc!r}'
                )

    return (signalled, survivors)


def _is_alive(pid: int) -> bool:
    '''
    True iff `/proc/<pid>` still exists AND the proc
    isn't already a zombie (Z state).

    '''
    try:
        with open(f'/proc/{pid}/status') as f:
            for line in f:
                if line.startswith('State:'):
                    # e.g. 'State:\tZ (zombie)'
                    return 'Z' not in line.split()[1]
    except (
        FileNotFoundError,
        ProcessLookupError,
    ):
        return False
    return True


def _enumerate_in_use_shm(
    shm_dir: str = SHM_DIR,
) -> set[str]:
    '''
    Return the set of `<shm_dir>/<file>` paths currently
    held open by any live process — via `psutil`'s
    xplatform `Process.memory_maps()` (post-mmap
    segments) and `Process.open_files()` (pre-mmap
    shm-opened fds).

    The two probes are attempted independently per
    process, so a failure of one (e.g. access denied on
    the memory maps) does not throw away what the other
    can still report.

    Lazy-imports `psutil` so the module stays importable
    on installs without it (it's a `testing` group dep).

    Raises:

    - `NotImplementedError` on unsupported platforms (via
      `_ensure_shm_supported()`).
    - `RuntimeError` when `psutil` can't be imported.

    '''
    _ensure_shm_supported()

    # lazy + actionable failure: leaked shm sweep is the
    # only thing in this module that needs psutil; we
    # don't want a top-level ImportError breaking the
    # process-reap path.
    try:
        import psutil
    except ImportError as exc:
        raise RuntimeError(
            'shm reap requires `psutil` — install the '
            '`testing` dep group, e.g. '
            '`uv sync --group testing`.'
        ) from exc

    # raced (proc died mid-scan) or not inspectable (e.g.
    # root-owned). CAVEAT: segments held ONLY by a proc we
    # can't inspect are missing from the result and so
    # will look un-held to the caller;
    # `find_orphaned_shm()` narrows the exposure by only
    # considering files owned by the requested uid.
    unreadable: tuple[type[BaseException], ...] = (
        psutil.NoSuchProcess,
        psutil.AccessDenied,
        psutil.ZombieProcess,
        FileNotFoundError,
        PermissionError,
    )

    in_use: set[str] = set()
    prefix: str = shm_dir.rstrip('/') + '/'
    for proc in psutil.process_iter(['pid']):
        # post-mmap: segments mapped into the address space
        try:
            in_use.update(
                m.path
                for m in proc.memory_maps(grouped=False)
                if m.path.startswith(prefix)
            )
        except unreadable:
            pass
        # pre-mmap: segments opened but not (yet) mapped
        try:
            in_use.update(
                f.path
                for f in proc.open_files()
                if f.path.startswith(prefix)
            )
        except unreadable:
            pass
    return in_use


def find_orphaned_shm(
    *,
    uid: int | None = None,
    shm_dir: str = SHM_DIR,
) -> list[str]:
    '''
    List `<shm_dir>/<file>` paths that look leaked, i.e.
    regular files which are

    - owned by `uid` (defaults to the current effective
      uid), and
    - absent from `_enumerate_in_use_shm()`'s set of
      paths held by live processes.

    Linux/FreeBSD only — raises `NotImplementedError`
    elsewhere. Nothing here depends on how shm keys are
    named, so it applies to any tractor app, not just the
    test suite. Returns `[]` when `shm_dir` can't be
    listed.

    '''
    _ensure_shm_supported()

    owner: int = os.geteuid() if uid is None else uid

    try:
        names: list[str] = os.listdir(shm_dir)
    except OSError:
        return []

    held: set[str] = _enumerate_in_use_shm(shm_dir=shm_dir)
    root: str = shm_dir.rstrip('/') + '/'
    leaked: list[str] = []
    for name in names:
        candidate: str = root + name
        try:
            info: os.stat_result = os.stat(candidate)
        except OSError:
            # vanished between listing and stat
            continue
        # regular files only (no subdirs / sockets etc.),
        # ours, and not held by anyone
        if (
            stat.S_ISREG(info.st_mode)
            and info.st_uid == owner
            and candidate not in held
        ):
            leaked.append(candidate)
    return leaked


def reap_shm(
    paths: list[str],
    *,
    log=print,
) -> tuple[list[str], list[tuple[str, OSError]]]:
    '''
    Remove the given `/dev/shm/...` paths with
    `os.unlink()`.

    Linux/FreeBSD only (raises `NotImplementedError`
    elsewhere): there the POSIX-shm segments are plain
    tmpfs files. macOS has no fs-visible shm path; its
    equivalent would be
    `posix_ipc.unlink_shared_memory(name)`, which is not
    implemented here.

    Returns `(unlinked, errors)`. A path that was already
    gone counts as unlinked; any other `OSError` (e.g.
    permissions) lands in `errors` as `(path, exc)`. A
    summary line plus one line per error is sent to
    `log`.

    '''
    _ensure_shm_supported()

    unlinked: list[str] = []
    errors: list[tuple[str, OSError]] = []
    for target in paths:
        try:
            os.unlink(target)
        except FileNotFoundError:
            # raced — someone else removed it; still a win
            pass
        except OSError as exc:
            errors.append((target, exc))
            continue
        unlinked.append(target)

    if unlinked:
        log(
            f'[tractor-reap] unlinked {len(unlinked)} '
            f'orphaned shm segment(s): {unlinked}'
        )
    for failed, why in errors:
        log(
            f'[tractor-reap] could not unlink {failed}: '
            f'{why!r}'
        )
    return (unlinked, errors)


def get_uds_dir() -> str|None:
    '''
    Build the path of tractor's per-user UDS sock-file
    dir, `${XDG_RUNTIME_DIR}/tractor/`.

    Gives `None` when `XDG_RUNTIME_DIR` is unset or empty
    (e.g. non-systemd hosts, or a container without the
    var plumbed through); callers should read that as
    "no UDS leaks possible to detect — skip".

    '''
    runtime_dir: str|None = os.environ.get('XDG_RUNTIME_DIR')
    return (
        os.path.join(runtime_dir, _UDS_SUBDIR)
        if runtime_dir
        else None
    )


def _parse_uds_name(filename: str) -> tuple[str, int]|None:
    '''
    Split a tractor UDS sock filename of the form
    `<name>@<pid>.sock` into `(actor_name, pid)`.

    Gives `None` when `filename` doesn't match
    `_UDS_NAME_RE`.

    '''
    matched = _UDS_NAME_RE.match(filename)
    if matched is None:
        return None
    actor_name: str = matched.group('name')
    binder_pid: int = int(matched.group('pid'))
    return (actor_name, binder_pid)


def find_orphaned_uds(
    *,
    uds_dir: str|None = None,
) -> list[str]:
    '''
    List `<uds_dir>/*.sock` socket paths whose binder pid
    (parsed from the `<name>@<pid>.sock` filename) is no
    longer alive per `_is_alive()`.

    The `registry@1616.sock` style sentinel is ALWAYS
    reported: `1616` is a magic value rather than a real
    pid, so the file's mere presence is treated as a leak
    from a dead session.

    `uds_dir` defaults to `get_uds_dir()`. Returns `[]`
    when no dir is known (no `XDG_RUNTIME_DIR`) or it
    can't be listed. Entries that aren't sockets, or
    whose name doesn't parse, are skipped — we don't
    unlink things we don't recognize.

    '''
    base: str = uds_dir or get_uds_dir()
    if not base:
        return []

    try:
        names: list[str] = os.listdir(base)
    except OSError:
        return []

    root: str = base.rstrip('/') + '/'
    orphaned: list[str] = []
    for name in names:
        if not name.endswith('.sock'):
            continue
        sock_path: str = root + name
        try:
            mode: int = os.stat(sock_path).st_mode
        except OSError:
            # vanished between listing and stat
            continue
        # sockets only — stray regular files / subdirs
        # are not ours to touch
        if not stat.S_ISSOCK(mode):
            continue
        parsed = _parse_uds_name(name)
        if parsed is None:
            # unknown naming — leave it alone
            continue
        binder_pid: int = parsed[1]
        # the sentinel never maps to a real pid, so it
        # bypasses the liveness probe entirely
        if (
            binder_pid == _UDS_REGISTRY_SENTINEL_PID
            or not _is_alive(binder_pid)
        ):
            orphaned.append(sock_path)
    return orphaned


def reap_uds(
    paths: list[str],
    *,
    log=print,
) -> tuple[list[str], list[tuple[str, OSError]]]:
    '''
    Remove the given UDS sock-file paths with
    `os.unlink()`.

    Returns `(unlinked, errors)` — the same shape as
    `reap_shm` so callers can pipeline both. A path that
    was already gone counts as unlinked; any other
    `OSError` lands in `errors` as `(path, exc)`. A
    summary line plus one line per error is sent to
    `log`.

    '''
    unlinked: list[str] = []
    errors: list[tuple[str, OSError]] = []
    for target in paths:
        try:
            os.unlink(target)
        except FileNotFoundError:
            # raced — already removed; treat as success
            pass
        except OSError as exc:
            errors.append((target, exc))
            continue
        unlinked.append(target)

    if unlinked:
        log(
            f'[tractor-reap] unlinked {len(unlinked)} '
            f'orphaned UDS sock-file(s): {unlinked}'
        )
    for failed, why in errors:
        log(
            f'[tractor-reap] could not unlink {failed}: '
            f'{why!r}'
        )
    return (unlinked, errors)


# ----------------------------------------------------------
# Pytest fixtures — sub-plugin surface
# ----------------------------------------------------------
# Loaded as a pytest plugin via the `pytest_plugins` line in
# `tractor._testing.pytest`. Keeps the reaping infra (helpers
# above + fixtures below) co-located so adding a new reap
# target is a single-file change. Sibling-module
# (`tractor._testing.pytest`) keeps its core
# tractor-tooling surface (option/marker/parametrize hooks,
# `tractor_test` deco, transport / spawn-method fixtures)
# uncluttered.
import pytest


@pytest.fixture(
    scope='session',
    autouse=True,
)
def _reap_orphaned_subactors():
    '''
    Session-scoped autouse safety net. Once the whole
    test session is done:

    1. any process still parented to this `pytest`
       process is handed to `reap()` — SIGINT, a 3s grace
       window, then SIGKILL for survivors;
    2. any orphaned UDS sock-files are unlinked.

    Why: under fork-based spawn backends (notably
    `main_thread_forkserver`) a test that times out or
    bails mid-teardown can leave subactor forks alive.
    Left alone they linger across sessions, compete for
    ports and inherit pytest's capture-pipe fds, which
    flakifies later tests. SIGINT goes first (SC-polite)
    so the subactor's trio cancel shield + IPC teardown
    paths get a chance to run before escalation.

    Companion CLI for the pytest-died-mid-session case:
    `scripts/tractor-reap`.

    '''
    # captured at setup; children are matched against it
    # at teardown
    session_pid: int = os.getpid()
    yield

    if (stragglers := find_descendants(session_pid)):
        reap(stragglers, grace=3.0)

    # NOTE, the UDS sweep must come AFTER the proc reap:
    # a killed actor's bind path only counts as
    # "orphaned" once its owning pid is gone. See
    # `find_orphaned_uds()` for the detection rules incl.
    # the `1616` registry-sentinel special case.
    if (stale_socks := find_orphaned_uds()):
        reap_uds(stale_socks)

@pytest.fixture(
    scope='function',
)
def track_orphaned_uds_per_test():
    '''
    Function-scoped UDS sock-file leak detector + reaper.
    **Opt-in**, NOT autouse.

    Enable for a whole (UDS-heavy) test module with:

        pytestmark = pytest.mark.usefixtures(
            'track_orphaned_uds_per_test',
        )

    How it works: the `*.sock` names in
    `${XDG_RUNTIME_DIR}/tractor/` are snapshotted before
    and after the test. Names that appeared during the
    test AND are reported by `find_orphaned_uds()` (binder
    pid dead, or the `1616` sentinel) trigger a loud
    warning and are unlinked, so the next test starts
    from a clean dir.

    Relationship to `_reap_orphaned_subactors`: that
    session-end fixture is the always-on net; this one is
    for modules with a HISTORY of leaky teardown where
    per-test blame matters. Under `--tpt-proto=uds` a
    single hard-killed subactor leaves a sock file that a
    sibling test's `wait_for_actor`/`find_actor` probes
    can trip over (FileExistsError on rebind, or epoll
    register on a half-closed peer-FIN'd fd → see issue
    #454); catching it at the causing test makes blame
    obvious and stops cascade flakiness.

    Cost: 2x `os.listdir` + a few `os.stat`s per test.
    Does nothing when `XDG_RUNTIME_DIR` isn't set.

    '''
    uds_dir: str|None = get_uds_dir()

    def _sock_names() -> set[str]:
        # current `*.sock` entries; may raise `OSError`
        return {
            entry for entry in os.listdir(uds_dir)
            if entry.endswith('.sock')
        }

    # pre-test population: only files ADDED by this test
    # are blamed on it (others may stem from
    # session-scoped fixtures or earlier sessions).
    before: set[str] = set()
    if uds_dir:
        try:
            before = _sock_names()
        except OSError:
            # dir may not exist yet — treat as empty
            pass

    yield

    if not uds_dir:
        return
    try:
        after: set[str] = _sock_names()
    except OSError:
        return

    added: set[str] = after - before
    if not added:
        return

    # only dead-binder (or sentinel) socks count; one that
    # is still legitimately held is left for the
    # session-end sweep to judge.
    orphaned: set[str] = set(
        find_orphaned_uds(uds_dir=uds_dir)
    )
    new_orphans: list[str] = [
        path
        for path in (
            os.path.join(uds_dir, name) for name in added
        )
        if path in orphaned
    ]
    if not new_orphans:
        return

    import warnings
    warnings.warn(
        'UDS sock-file LEAK detected from test '
        '(reaping):\n  '
        + '\n  '.join(new_orphans),
        stacklevel=1,
    )
    reap_uds(new_orphans)


@pytest.fixture(
    scope='function',
)
def detect_runaway_subactors_per_test():
    '''
    Function-scoped, **opt-in** (never autouse) detector
    for subactors that are left alive and spinning the CPU
    around a test.

    Enable it for a whole (cancellation-cascade-heavy)
    test module with:

        pytestmark = pytest.mark.usefixtures(
            'detect_runaway_subactors_per_test',
        )

    Two detection passes are run, both delegating the
    actual "is it a runaway?" decision to
    `find_runaway_subactors()`:

    - SETUP (pre-yield): every descendant pid of the
      pytest process that already exists when the test
      starts is checked; hits are reported as leftovers
      of an earlier test.
    - TEARDOWN (post-yield): only descendant pids that
      appeared during the test are checked.

    Each hit produces ONE `warnings.warn()` call listing
    `pid`, sampled `cpu%`, the `cmdline` and copy-pastable
    diagnosis commands (`strace`, `readlink` on the fd
    table, `ss`, `lsof`) plus manual `kill` lines.

    **The runaway is deliberately NOT killed** — this
    fixture only warns, so the pid stays around for
    hands-on diagnosis of tight-loop bugs (see
    `ai/conc-anal/trio_wakeup_socketpair_busy_loop_under_fork_issue.md`).
    Actual reaping is left to other fixtures in this
    module (presumably the session-end
    `_reap_orphaned_subactors` one).

    The CPU threshold quoted in the warning text (>95%),
    the sampling cost and any "skip when `psutil` is
    missing" behavior all live inside
    `find_runaway_subactors()`, not here.

    '''
    root_pid: int = os.getpid()

    def _render_entry(
        pid: int,
        cpu: float,
        cmdline: str,
    ) -> str:
        '''
        Build the multi-line report for a single runaway
        pid, ending in a blank line.

        '''
        return (
            f'  pid={pid} cpu={cpu:.1f}% cmdline={cmdline!r}\n'
            '  diagnose live (pid stays alive — NOT killed):\n'
            f'    sudo strace -p {pid} -f -tt -e trace=recvfrom,epoll_wait,read,write\n'
            f'    sudo readlink /proc/{pid}/fd/* 2>/dev/null | head -20\n'
            f'    sudo ss -tnp | grep {pid}\n'
            f'    sudo lsof -p {pid}\n'
            '  manual kill when done:\n'
            f'    kill -SIGINT {pid}    # graceful first\n'
            f'    kill -SIGKILL {pid}   # if SIGINT ignored (busy in C)\n'
            '\n'
        )

    def _warn_on_runaways(
        candidates: set[int],
        when: str,
    ) -> None:
        '''
        Look for runaways among `candidates` and, when any
        are found, emit one combined warning tagged with
        `when`. Used by both the SETUP and TEARDOWN
        passes so the report format stays in one place.

        '''
        hits: list[tuple[int, float, str]] = (
            find_runaway_subactors(
                root_pid,
                only_pids=candidates,
            )
        )
        if not hits:
            return

        import warnings

        header: str = (
            f'RUNAWAY subactor(s) detected at {when} — '
            f'burning CPU (>95%):'
        )
        entries: list[str] = [
            _render_entry(pid, cpu, cmdline)
            for pid, cpu, cmdline in hits
        ]
        warnings.warn(
            '\n'.join([header, *entries]),
            stacklevel=1,
        )

    # SETUP pass: a previous test may have hung (user
    # Ctrl-C, timeout plugin, ..) so that its own
    # teardown pass never ran while its subactor keeps
    # burning CPU. Reporting it here is one test late,
    # but beats staying silent.
    #
    # In the common clean case `baseline` is empty;
    # `find_runaway_subactors()` is then expected to
    # return early (NOTE(review): confirm it bails out
    # before touching `psutil`).
    baseline: set[int] = set(find_descendants(root_pid))
    _warn_on_runaways(
        baseline,
        when='test SETUP (leftover from prior hung test)',
    )

    yield

    # TEARDOWN pass: only blame pids that showed up during
    # THIS test and outlived its normal teardown, e.g. a
    # child stuck in a C-level tight loop (see
    # `ai/conc-anal/trio_wakeup_socketpair_busy_loop_under_fork_issue.md`).
    # A test that tore its actor tree down cleanly yields
    # an empty set here.
    spawned: set[int] = (
        set(find_descendants(root_pid)) - baseline
    )
    _warn_on_runaways(
        spawned,
        when='test teardown',
    )


@pytest.fixture
def reap_subactors_per_test() -> int:
    '''
    Function-scoped, **opt-in** (never autouse) reaper for
    subactor processes a test leaves behind.

    Yields the pytest process' own pid to the test. Once
    the test is done, any descendants of that pid reported
    by `find_descendants()` are handed to `reap()` with a
    3 second grace window; nothing is done when there are
    none.

    Motivation: a test whose teardown does not fully
    cancel its actor tree can leave a subactor lingering
    under `pytest`, still holding whatever registrar port
    / UDS path / shm segment it bound. Later tests that
    need the same resource then fail, and with a
    session-shared `reg_addr` the failure cascades through
    the rest of the suite. A session-end sweep (the
    `_reap_orphaned_subactors` sibling) comes too late to
    stop that cascade.

    Enable for a whole module with:

    ```python
    pytestmark = pytest.mark.usefixtures(
        'reap_subactors_per_test',
    )
    ```

    or put the same `usefixtures` mark on a single test
    function. Keeping this opt-in is intentional: seeing
    the fixture on a module flags that module's teardown
    as known-leaky, which keeps the root cause visible
    instead of hiding it behind blanket cleanup.

    '''
    # NOTE(review): this is a generator fixture, so the
    # `-> int` annotation describes the yielded value
    # rather than the function's actual return type.
    pytest_pid: int = os.getpid()
    yield pytest_pid

    # post-test: collect whatever is still hanging off the
    # pytest process and reap it, if anything.
    leftovers: list[int] = find_descendants(pytest_pid)
    if not leftovers:
        return
    reap(leftovers, grace=3.0)