tractor/tests/discovery/test_multi_program.py

351 lines
11 KiB
Python
Raw Permalink Normal View History

"""
2021-02-24 20:02:02 +00:00
Multiple python programs invoking the runtime.
"""
from __future__ import annotations
import platform
import subprocess
import time
from typing import (
TYPE_CHECKING,
)
import pytest
2021-02-24 20:02:02 +00:00
import trio
import tractor
Add (back) a `tractor._testing` sub-pkg Since importing from our top level `conftest.py` is not scaleable or as "future forward thinking" in terms of: - LoC-wise (it's only one file), - prevents "external" (aka non-test) example scripts from importing content easily, - seemingly(?) can't be used via abs-import if using a `[tool.pytest.ini_options]` in a `pyproject.toml` vs. a `pytest.ini`, see: https://docs.pytest.org/en/8.0.x/reference/customize.html#pyproject-toml) => Go back to having an internal "testing" pkg like `trio` (kinda) does. Deats: - move generic top level helpers into pkg-mod including the new `expect_ctxc()` (which i needed in the advanced faults testing script. - move `@tractor_test` into `._testing.pytest` sub-mod. - adjust all the helper imports to be a `from tractor._testing import <..>` Rework `test_ipc_channel_break_during_stream()` and backing script: - make test(s) pull `debug_mode` from new fixture (which is now controlled manually from `--tpdb` flag) and drop the previous parametrized input. - update logic in ^ test for "which-side-fails" cases to better match recently updated/stricter cancel/failure semantics in terms of `ClosedResouruceError` vs. `EndOfChannel` expectations. - handle `ExceptionGroup`s with expected embedded errors in test. - better pendantics around whether to expect a user simulated KBI. - for `examples/advanced_faults/ipc_failure_during_stream.py` script: - generalize ipc breakage in new `break_ipc()` with support for diff internal `trio` methods and a #TODO for future disti frameworks - only make one sub-actor task break and the other just stream. - use new `._testing.expect_ctxc()` around ctx block. - add a bit of exception handling with `print()`s around ctxc (unused except if 'msg' break method is set) and eoc cases. - don't break parent side ipc in loop any more then once after first break, checked via flag var. 
- add a `pre_close: bool` flag to control whether `MsgStreama.aclose()` is called *before* any ipc breakage method. Still TODO: - drop `pytest.ini` and add the alt section to `pyproject.py`. -> currently can't get `--rootdir=` opt to work.. not showing in console header. -> ^ also breaks on 'tests' `enable_modules` imports in subactors during discovery tests?
2024-03-12 19:48:20 +00:00
from tractor._testing import (
2020-08-03 18:49:46 +00:00
tractor_test,
Add (back) a `tractor._testing` sub-pkg Since importing from our top level `conftest.py` is not scaleable or as "future forward thinking" in terms of: - LoC-wise (it's only one file), - prevents "external" (aka non-test) example scripts from importing content easily, - seemingly(?) can't be used via abs-import if using a `[tool.pytest.ini_options]` in a `pyproject.toml` vs. a `pytest.ini`, see: https://docs.pytest.org/en/8.0.x/reference/customize.html#pyproject-toml) => Go back to having an internal "testing" pkg like `trio` (kinda) does. Deats: - move generic top level helpers into pkg-mod including the new `expect_ctxc()` (which i needed in the advanced faults testing script. - move `@tractor_test` into `._testing.pytest` sub-mod. - adjust all the helper imports to be a `from tractor._testing import <..>` Rework `test_ipc_channel_break_during_stream()` and backing script: - make test(s) pull `debug_mode` from new fixture (which is now controlled manually from `--tpdb` flag) and drop the previous parametrized input. - update logic in ^ test for "which-side-fails" cases to better match recently updated/stricter cancel/failure semantics in terms of `ClosedResouruceError` vs. `EndOfChannel` expectations. - handle `ExceptionGroup`s with expected embedded errors in test. - better pendantics around whether to expect a user simulated KBI. - for `examples/advanced_faults/ipc_failure_during_stream.py` script: - generalize ipc breakage in new `break_ipc()` with support for diff internal `trio` methods and a #TODO for future disti frameworks - only make one sub-actor task break and the other just stream. - use new `._testing.expect_ctxc()` around ctx block. - add a bit of exception handling with `print()`s around ctxc (unused except if 'msg' break method is set) and eoc cases. - don't break parent side ipc in loop any more then once after first break, checked via flag var. 
- add a `pre_close: bool` flag to control whether `MsgStreama.aclose()` is called *before* any ipc breakage method. Still TODO: - drop `pytest.ini` and add the alt section to `pyproject.py`. -> currently can't get `--rootdir=` opt to work.. not showing in console header. -> ^ also breaks on 'tests' `enable_modules` imports in subactors during discovery tests?
2024-03-12 19:48:20 +00:00
)
from tractor import (
current_actor,
Actor,
Context,
Portal,
)
from tractor.runtime import _state
from ..conftest import (
2020-08-03 18:49:46 +00:00
sig_prog,
_INT_SIGNAL,
_INT_RETURN_CODE,
)
if TYPE_CHECKING:
from tractor.msg import Aid
from tractor.discovery._addr import (
UnwrappedAddress,
)
_non_linux: bool = platform.system() != 'Linux'
# NOTE, multi-program tests historically triggered both
# UDS sock-file leaks (daemon-subproc SIGKILL paths) AND
# trio `WakeupSocketpair.drain()` busy-loops
# (`test_register_duplicate_name`). Track + detect
# per-test as a regression net.
pytestmark = pytest.mark.usefixtures(
'track_orphaned_uds_per_test',
'detect_runaway_subactors_per_test',
)
def test_abort_on_sigint(
    daemon: subprocess.Popen,
):
    '''
    Sending `SIGINT` (or the platform equivalent via `sig_prog()`)
    to the long-lived "daemon" program must terminate it with the
    expected interrupt return code.

    '''
    # daemon must still be alive before we signal it
    assert daemon.returncode is None
    time.sleep(0.1)
    sig_prog(daemon, _INT_SIGNAL)
    assert daemon.returncode == _INT_RETURN_CODE

    # XXX: oddly, couldn't get capfd.readouterr() to work here?
    if platform.system() != 'Windows':
        # don't check stderr on windows as its empty when sending CTRL_C_EVENT
        assert "KeyboardInterrupt" in str(daemon.stderr.read())
@tractor_test
async def test_cancel_remote_registrar(
    daemon: subprocess.Popen,
    reg_addr: UnwrappedAddress,
):
    '''
    Remotely cancelling the (out-of-process) registrar's channel
    server must tear down its listening socket while leaving the
    daemon program itself running.

    '''
    assert not current_actor().is_registrar

    async with tractor.get_registry(reg_addr) as portal:
        await portal.cancel_actor()

    time.sleep(0.1)
    # the registrar channel server is cancelled but not its main task
    assert daemon.returncode is None

    # no registrar socket should exist
    with pytest.raises(OSError):
        async with tractor.get_registry(reg_addr) as portal:
            pass
def test_register_duplicate_name(
    daemon: subprocess.Popen,
    reg_addr: UnwrappedAddress,
):
    '''
    Two sub-actors registered under the same name against an
    out-of-process registrar: `wait_for_actor()` must resolve to
    one of them and nursery cancellation must complete cleanly.

    '''
    # bug-class-3 breadcrumbs: the *last* `[CANCEL]` line that
    # appears under `--ll cancel`/`TRACTOR_LOG_FILE=...` names the
    # cancel-cascade boundary that's parked. Pair with
    # `_trio_main` entry/exit breadcrumbs in
    # `tractor/spawn/_entry.py` to triangulate the swallow point.
    log = tractor.log.get_logger('tractor.tests.test_multi_program')

    async def main():
        log.cancel('test_register_duplicate_name: enter `main()`')
        try:
            async with tractor.open_nursery(
                registry_addrs=[reg_addr],
            ) as an:
                log.cancel(
                    'test_register_duplicate_name: '
                    'actor nursery opened'
                )
                assert not current_actor().is_registrar

                p1 = await an.start_actor('doggy')
                log.cancel(
                    'test_register_duplicate_name: '
                    'spawned doggy #1'
                )
                p2 = await an.start_actor('doggy')
                log.cancel(
                    'test_register_duplicate_name: '
                    'spawned doggy #2'
                )

                async with tractor.wait_for_actor('doggy') as portal:
                    log.cancel(
                        'test_register_duplicate_name: '
                        '`wait_for_actor` returned'
                    )
                    # either doggy may "win" the name registration
                    assert portal.channel.uid in (p2.channel.uid, p1.channel.uid)

                log.cancel(
                    'test_register_duplicate_name: '
                    'ABOUT TO CALL `an.cancel()`'
                )
                await an.cancel()
                log.cancel(
                    'test_register_duplicate_name: '
                    '`an.cancel()` returned'
                )
        finally:
            log.cancel(
                'test_register_duplicate_name: '
                '`open_nursery.__aexit__` returned, leaving `main()`'
            )

    # XXX, run manually since we want to start this root **after**
    # the other "daemon" program with it's own root.
    trio.run(main)
@pytest.mark.parametrize(
    'n_dups',
    [
        2,
        # `n_dups=4` exposes a SEPARATE pre-existing race: under
        # rapid same-name spawning against a forkserver +
        # registrar, ONE of the spawned doggies (typically the
        # 3rd) `sys.exit(2)`s during boot before completing
        # parent-handshake. Surfaces now (post the spawn-time
        # `wait_for_peer_or_proc_death` fix) as `ActorFailure
        # rc=2`; previously it was silently masked by the
        # handshake-wait parking forever.
        #
        # Tracked separately in,
        # https://github.com/goodboy/tractor/issues/456
        pytest.param(
            4,
            marks=pytest.mark.xfail(
                strict=False,
                reason=(
                    'doggy boot-race rc=2 under rapid same-name '
                    'spawn — separate bug from cancel-cascade'
                ),
            ),
        ),
        8,
    ],
    ids=lambda n: f'n_dups={n}',
)
def test_dup_name_cancel_cascade_escalates_to_hard_kill(
    daemon: subprocess.Popen,
    reg_addr: UnwrappedAddress,
    n_dups: int,
):
    '''
    Regression for the duplicate-name cancel-cascade hang under
    `tcp+main_thread_forkserver`.

    When N actors share a single name and the parent calls
    `an.cancel()`, the daemon registrar gets N `register_actor` RPCs
    in tight succession. Under TCP+MTF, kernel-level socket-buffer
    contention can push at least one sub-actor's cancel-RPC ack past
    `Portal.cancel_timeout` (default 0.5s).

    Pre-fix, `Portal.cancel_actor()` silently returned `False` on
    that timeout, the supervisor's outer `move_on_after(3)` never
    fired (each per-portal task always returned 0.5s, never
    exceeded 3s), and `soft_kill()`'s `await wait_func(proc)` parked
    forever deadlocking nursery `__aexit__`.

    Post-fix, `Portal.cancel_actor()` raises `ActorTooSlowError` on
    the bounded-wait timeout, and `ActorNursery.cancel()`'s
    per-child wrapper escalates to `proc.terminate()` (hard-kill).
    The full nursery teardown therefore stays bounded even under
    pathological timing.

    `n_dups` is parametrized to widen the race window more
    same-name siblings = more concurrent register-RPCs at the
    daemon = higher probability of hitting the contention path.

    '''
    log = tractor.log.get_logger(
        'tractor.tests.test_multi_program'
    )

    # outer hard ceiling: a regression should fail-fast, NOT hang
    # the test session for minutes. Budget scales with `n_dups`
    # since each extra same-name sibling adds ~spawn-cost +
    # potential cancel-ack-timeout escalation latency under
    # TCP+forkserver. ~5s/sibling + 15s baseline gives plenty of
    # headroom while still failing-loud on a real hang.
    fail_after_s: int = 15 + (5 * n_dups)

    async def main():
        log.cancel(
            f'enter `main()` n_dups={n_dups}'
        )
        with trio.fail_after(fail_after_s):
            async with tractor.open_nursery(
                registry_addrs=[reg_addr],
            ) as an:
                portals: list[Portal] = []
                for i in range(n_dups):
                    p: Portal = await an.start_actor('doggy')
                    portals.append(p)
                    log.cancel(
                        f'spawned doggy #{i + 1}/{n_dups}'
                    )

                # at least one of the N must be discoverable by
                # name; doesn't matter which one (registrar will
                # have last-wins semantics under same-name).
                async with tractor.wait_for_actor('doggy') as portal:
                    expected_uids = {p.channel.uid for p in portals}
                    assert portal.channel.uid in expected_uids

                # critical section: this MUST return within
                # `fail_after_s` even when one or more cancel-RPC
                # acks time out. Pre-fix, this hangs forever.
                log.cancel('about to call `an.cancel()`')
                await an.cancel()
                log.cancel('`an.cancel()` returned')

        # post-teardown sanity: every child proc must be reaped.
        # If escalation worked, even timed-out cancel-RPCs would
        # have triggered `proc.terminate()` and the procs are dead.
        for p in portals:
            # `Portal.channel.connected()` -> False once the
            # underlying chan disconnected (clean exit OR
            # hard-killed proc both produce disconnect).
            assert not p.channel.connected(), (
                f'Portal chan still connected post-teardown?\n'
                f'{p.channel}'
            )

    trio.run(main)
@tractor.context
async def get_root_portal(
    ctx: Context,
):
    '''
    Connect back to the root actor manually (using `._discovery` API)
    and ensure it's contact info is the same as our immediate parent.

    '''
    sub: Actor = current_actor()
    rtvs: dict = _state._runtime_vars
    raddrs: list[UnwrappedAddress] = rtvs['_root_addrs']

    # await tractor.pause()
    # XXX, in case the sub->root discovery breaks you might need
    # this (i know i did Xp)!!
    # from tractor.devx import mk_pdb
    # mk_pdb().set_trace()
    assert (
        len(raddrs) == 1
        and
        list(sub._parent_chan.raddr.unwrap()) in raddrs
    )

    # connect back to our immediate parent which should also
    # be the actor-tree's root.
    from tractor.discovery._api import get_root
    ptl: Portal
    async with get_root() as ptl:
        root_aid: Aid = ptl.chan.aid
        parent_ptl: Portal = current_actor().get_parent()
        assert (
            root_aid.name == 'root'
            and
            parent_ptl.chan.aid == root_aid
        )
        # signal the opener that sub->root verification passed
        # NOTE(review): placed inside the `get_root()` block per
        # original line order — confirm against upstream history.
        await ctx.started()
def test_non_registrar_spawns_child(
    daemon: subprocess.Popen,
    reg_addr: UnwrappedAddress,
    loglevel: str,
    debug_mode: bool,
    ci_env: bool,
):
    '''
    Ensure a non-registrar (serving) root actor can spawn a sub and
    that sub can connect back (manually) to its parent that is the
    root without issue.

    More or less this audits the global contact info in
    `._state._runtime_vars`.

    '''
    async def main():
        # XXX, since apparently on macos in GH's CI it can be a race
        # with the `daemon` registrar on grabbing the socket-addr..
        if ci_env and _non_linux:
            await trio.sleep(.5)

        async with tractor.open_nursery(
            registry_addrs=[reg_addr],
            loglevel=loglevel,
            debug_mode=debug_mode,
        ) as an:
            actor: Actor = tractor.current_actor()
            assert not actor.is_registrar

            sub_ptl: Portal = await an.start_actor(
                name='sub',
                enable_modules=[__name__],
            )
            # `open_context` blocks until the sub's `ctx.started()`
            # i.e. until its sub->root checks have all passed.
            async with sub_ptl.open_context(
                get_root_portal,
            ) as (ctx, _):
                print('Waiting for `sub` to connect back to us..')
                await an.cancel()

    # XXX, run manually since we want to start this root **after**
    # the other "daemon" program with it's own root.
    trio.run(main)