# tractor/tractor/_supervise.py

# tractor: structured concurrent "actors".
# Copyright 2018-eternity Tyler Goodlet.

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

"""
``trio`` inspired apis and helpers

"""
from contextlib import asynccontextmanager as acm
from functools import partial
import inspect
from typing import (
    Optional,
    TYPE_CHECKING,
)
import typing
import warnings

from exceptiongroup import BaseExceptionGroup
import trio

from ._debug import maybe_wait_for_debugger
from ._state import current_actor, is_main_process
from .log import get_logger, get_loglevel
from ._runtime import Actor
from ._portal import Portal
from ._exceptions import is_multi_cancelled
from ._root import open_root_actor
from . import _state
from . import _spawn


if TYPE_CHECKING:
    import multiprocessing as mp

# module-level logger used by the actor nursery machinery in this file
log = get_logger(__name__)

# default ``bind_addr`` handed to newly spawned subactors by
# ``ActorNursery.start_actor()`` and ``ActorNursery.run_in_actor()``:
# the loopback host with port ``0`` (presumably "let the OS pick a free
# port" - TODO confirm against the transport layer).
_default_bind_addr: tuple[str, int] = ('127.0.0.1', 0)


class ActorNursery:
    '''
    The fundamental actor supervision construct: spawn and manage
    explicit lifetime and capability restricted, bootstrapped,
    ``trio.run()`` scheduled sub-processes.

    Though the concept of a "process nursery" is different in complexity
    and slightly different in semantics than a traditional single
    threaded task nursery, much of the interface is the same. New
    processes each require a top level "parent" or "root" task which is
    itself no different than any task started by a traditional
    ``trio.Nursery``. The main difference is that each "actor" (a
    process + ``trio.run()``) contains a full, parallel executing
    ``trio``-task-tree. The following super powers ensue:

    - starting tasks in a child actor are completely independent of
      tasks started in the current process. They execute in *parallel*
      relative to tasks in the current process and are scheduled by their
      own actor's ``trio`` run loop.
    - tasks scheduled in a remote process still maintain an SC protocol
      across memory boundaries using a so called "structured concurrency
      dialogue protocol" which ensures task-hierarchy-lifetimes are linked.
    - remote tasks (in another actor) can fail and relay failure back to
      the caller task (in some other actor) via a serialized
      ``RemoteActorError`` which means no zombie process or RPC
      initiated task can ever go off on its own.

    '''
    def __init__(
        self,
        actor: Actor,
        ria_nursery: trio.Nursery,
        da_nursery: trio.Nursery,
        errors: dict[tuple[str, str], BaseException],
    ) -> None:
        '''
        Bind this nursery to its owning ``actor`` and to the two
        ``trio`` task nurseries used to run process-spawning tasks:

        - ``ria_nursery``: hosts tasks for ``.run_in_actor()`` children,
        - ``da_nursery``: hosts tasks for ``.start_actor()`` ("daemon")
          children.

        ``errors`` is a caller-owned mapping (keyed by actor uid) which
        is handed through to the spawning machinery for error
        collection.

        '''
        # self.supervisor = supervisor  # TODO
        self._actor: Actor = actor
        self._ria_nursery = ria_nursery
        self._da_nursery = da_nursery

        # uid -> (subactor, process handle, portal); the portal slot is
        # ``None`` until the child has finished spawning (see
        # ``.cancel()``).
        self._children: dict[
            tuple[str, str],
            tuple[
                Actor,
                trio.Process | mp.Process,
                Optional[Portal],
            ]
        ] = {}
        # portals spawned with ``run_in_actor()`` are
        # cancelled when their "main" result arrives
        self._cancel_after_result_on_exit: set = set()
        self.cancelled: bool = False
        # set to signal that child processes should now be joined
        self._join_procs = trio.Event()
        self._at_least_one_child_in_debug: bool = False
        self.errors = errors
        self.exited = trio.Event()

    async def start_actor(
        self,
        name: str,
        *,
        bind_addr: tuple[str, int] = _default_bind_addr,
        rpc_module_paths: list[str] | None = None,
        enable_modules: list[str] | None = None,
        loglevel: str | None = None,  # set log level per subactor
        nursery: trio.Nursery | None = None,
        debug_mode: bool | None = None,
        infect_asyncio: bool = False,
    ) -> Portal:
        '''
        Start a (daemon) actor: a process that has no designated
        "main task" besides the runtime.

        Parameters:
        - ``name``: name given to the new subactor.
        - ``bind_addr``: address passed to the spawn backend for the
          child to bind to.
        - ``enable_modules``: module paths the child is allowed to
          invoke functions from; the passed list is never mutated.
        - ``rpc_module_paths``: deprecated alias, merged into
          ``enable_modules`` with a ``DeprecationWarning``.
        - ``loglevel``: per-subactor log level; falls back to the
          parent actor's level and then ``get_loglevel()``.
        - ``nursery``: the ``trio`` nursery in which to run the spawn
          task; defaults to the daemon-actor nursery.
        - ``debug_mode``: when not ``None``, overrides the inherited
          ``_debug_mode`` runtime var for this child.
        - ``infect_asyncio``: forwarded as-is to the spawn backend.

        Returns the value produced by ``nursery.start()`` for the spawn
        task, annotated as the child's ``Portal``.

        '''
        loglevel = loglevel or self._actor.loglevel or get_loglevel()

        # configure and pass runtime state
        _rtv = _state._runtime_vars.copy()
        _rtv['_is_root'] = False

        # allow setting debug policy per actor
        if debug_mode is not None:
            _rtv['_debug_mode'] = debug_mode

            # only flag the nursery when the child is *actually* put in
            # debug mode; an explicit ``debug_mode=False`` must not
            # make teardown wait on a debugger that can never engage.
            if debug_mode:
                self._at_least_one_child_in_debug = True

        # always work on a private copy so that the ``.extend()`` below
        # never mutates the caller's list.
        enable_modules = list(enable_modules or [])

        if rpc_module_paths:
            warnings.warn(
                "`rpc_module_paths` is now deprecated, use "
                " `enable_modules` instead.",
                DeprecationWarning,
                stacklevel=2,
            )
            enable_modules.extend(rpc_module_paths)

        subactor = Actor(
            name,
            # modules allowed to invoked funcs from
            enable_modules=enable_modules,
            loglevel=loglevel,
            arbiter_addr=current_actor()._arb_addr,
        )
        parent_addr = self._actor.accept_addr
        assert parent_addr

        # start a task to spawn a process
        # blocks until process has been started and a portal setup
        nursery = nursery or self._da_nursery

        # XXX: the type ignore is actually due to a `mypy` bug
        return await nursery.start(  # type: ignore
            partial(
                _spawn.new_proc,
                name,
                self,
                subactor,
                self.errors,
                bind_addr,
                parent_addr,
                _rtv,  # run time vars
                infect_asyncio=infect_asyncio,
            )
        )

    async def run_in_actor(
        self,

        fn: typing.Callable,
        *,

        name: Optional[str] = None,
        bind_addr: tuple[str, int] = _default_bind_addr,
        rpc_module_paths: list[str] | None = None,
        enable_modules: list[str] | None = None,
        loglevel: str | None = None,  # set log level per subactor
        infect_asyncio: bool = False,

        **kwargs,  # explicit args to ``fn``

    ) -> Portal:
        '''
        Spawn a new actor, submit ``fn`` to it as its lone task and
        return the ``Portal`` through which the result can be
        retrieved.

        Actors spawned using this method are kept alive at nursery
        teardown until the task spawned by executing ``fn`` completes at
        which point the actor is terminated.

        ``fn``'s module is always added to the child's enabled modules.
        When ``name`` is omitted the function's ``__name__`` is used.

        Raises ``TypeError`` (before any process is spawned) when ``fn``
        is not a plain async function, i.e. it is not a coroutine
        function or it is marked as a ``tractor`` stream function.

        '''
        # XXX: don't allow stream funcs. Validate up front so that we
        # never spawn a subprocess only to immediately error out.
        if (
            not inspect.iscoroutinefunction(fn)
            or getattr(fn, '_tractor_stream_function', False)
        ):
            raise TypeError(f'{fn} must be an async function!')

        mod_path = fn.__module__

        if name is None:
            # use the explicit function name if not provided
            name = fn.__name__

        portal = await self.start_actor(
            name,
            enable_modules=[mod_path] + (
                enable_modules or rpc_module_paths or []
            ),
            bind_addr=bind_addr,
            loglevel=loglevel,
            # use the run_in_actor nursery
            nursery=self._ria_nursery,
            infect_asyncio=infect_asyncio,
        )

        # this marks the actor to be cancelled after its portal result
        # is retrieved.
        self._cancel_after_result_on_exit.add(portal)
        await portal._submit_for_result(
            mod_path,
            fn.__name__,
            **kwargs
        )
        return portal

    async def cancel(self, hard_kill: bool = False) -> None:
        '''
        Cancel this nursery by instructing each subactor to cancel
        itself and wait for all subactors to terminate.

        If ``hard_kill`` is set to ``True`` then kill the processes
        directly without any far end graceful ``trio`` cancellation.

        The graceful phase is bounded by a 3 second timeout; if it
        expires every known child process is terminated directly.
        ``_join_procs`` is always set before returning.

        '''
        self.cancelled = True

        log.cancel(f"Cancelling nursery in {self._actor.uid}")
        with trio.move_on_after(3) as cs:

            async with trio.open_nursery() as nursery:

                # iterate over a snapshot: we may ``await`` inside the
                # loop and other tasks are free to add/remove entries
                # of ``_children`` in the meantime.
                for subactor, proc, portal in tuple(self._children.values()):

                    # TODO: are we ever even going to use this or
                    # is the spawning backend responsible for such
                    # things? I'm thinking latter.
                    if hard_kill:
                        proc.terminate()
                        continue

                    if portal is None:  # actor hasn't fully spawned yet
                        event = self._actor._peer_connected[subactor.uid]
                        log.warning(
                            f"{subactor.uid} wasn't finished spawning?")

                        await event.wait()

                        # channel/portal should now be up
                        _, _, portal = self._children[subactor.uid]

                        # XXX should be impossible to get here
                        # unless method was called from within
                        # shielded cancel scope.
                        if portal is None:
                            # cancelled while waiting on the event
                            # to arrive
                            chan = self._actor._peers[subactor.uid][-1]
                            if chan:
                                portal = Portal(chan)
                            else:  # there's no other choice left
                                proc.terminate()
                                # nothing left to gracefully cancel for
                                # this child; move on instead of
                                # tripping the portal assert below.
                                continue

                    # spawn cancel tasks for each sub-actor
                    assert portal
                    if portal.channel.connected():
                        nursery.start_soon(portal.cancel_actor)

        # if we cancelled the cancel (we hung cancelling remote actors)
        # then hard kill all sub-processes
        if cs.cancelled_caught:
            log.error(
                f"Failed to cancel {self}\nHard killing process tree!")
            for _, proc, _ in self._children.values():
                log.warning(f"Hard killing process {proc}")
                proc.terminate()

        # mark ourselves as having (tried to have) cancelled all subactors
        self._join_procs.set()


@acm
async def _open_and_supervise_one_cancels_all_nursery(
    actor: Actor,

) -> typing.AsyncGenerator[ActorNursery, None]:
    '''
    Open, yield and supervise an ``ActorNursery`` for ``actor`` using
    a "one-cancels-all" strategy.

    Two nested ``trio`` nurseries are opened: an outer "daemon actor"
    nursery and an inner "run in actor" nursery; both are handed to the
    yielded ``ActorNursery`` together with a shared ``errors`` dict.

    Teardown behaviour:

    - if the caller's scope exits cleanly, ``_join_procs`` is set so
      that child processes can be joined;
    - if the caller's scope raises, the error is recorded in ``errors``
      under ``actor.uid``, all subactors are cancelled (shielded) and
      the error is re-raised from the ``finally`` block below;
    - if the inner nursery itself raises, remaining children are
      cancelled and the error propagates;
    - in all cases, when ``errors`` is non-empty on exit a single error
      is raised as-is and multiple errors are raised wrapped in
      a ``BaseExceptionGroup``.

    '''

    # TODO: yay or nay?
    __tracebackhide__ = True

    # the collection of errors retrieved from spawned sub-actors
    errors: dict[tuple[str, str], BaseException] = {}

    # This is the outermost level "daemon actor" nursery. It is awaited
    # **after** the below inner "run in actor nursery". This allows for
    # handling errors that are generated by the inner nursery in
    # a supervisor strategy **before** blocking indefinitely to wait for
    # actors spawned in "daemon mode" (aka started using
    # ``ActorNursery.start_actor()``).

    # errors from this daemon actor nursery bubble up to caller
    async with trio.open_nursery() as da_nursery:
        try:
            # This is the inner level "run in actor" nursery. It is
            # awaited first since actors spawned in this way (using
            # ``ActorNursery.run_in_actor()``) are expected to only
            # return a single result and then complete (i.e. be cancelled
            # gracefully). Errors collected from these actors are
            # immediately raised for handling by a supervisor strategy.
            # As such if the strategy propagates any error(s) upwards
            # the above "daemon actor" nursery will be notified.
            async with trio.open_nursery() as ria_nursery:

                anursery = ActorNursery(
                    actor,
                    ria_nursery,
                    da_nursery,
                    errors
                )
                try:
                    # spawning of actors happens in the caller's scope
                    # after we yield upwards
                    yield anursery

                    # When we didn't error in the caller's scope,
                    # signal all process-monitor-tasks to conduct
                    # the "hard join phase".
                    log.runtime(
                        f"Waiting on subactors {anursery._children} "
                        "to complete"
                    )
                    anursery._join_procs.set()

                except BaseException as inner_err:
                    # NOTE: the error is *not* re-raised from this
                    # handler; it is stashed here and raised from the
                    # outer ``finally:`` block once children have been
                    # cancelled.
                    errors[actor.uid] = inner_err

                    # If we error in the root but the debugger is
                    # engaged we don't want to prematurely kill (and
                    # thus clobber access to) the local tty since it
                    # will make the pdb repl unusable.
                    # Instead try to wait for pdb to be released before
                    # tearing down.
                    await maybe_wait_for_debugger(
                        child_in_debug=anursery._at_least_one_child_in_debug
                    )

                    # if the caller's scope errored then we activate our
                    # one-cancels-all supervisor strategy (don't
                    # worry more are coming).
                    anursery._join_procs.set()

                    # XXX: hypothetically an error could be
                    # raised and then a cancel signal shows up
                    # slightly after in which case the `else:`
                    # block here might not complete?  For now,
                    # shield both.
                    with trio.CancelScope(shield=True):
                        # cancellation-like errors are logged at the
                        # "cancel" level, anything else with a full
                        # traceback.
                        etype = type(inner_err)
                        if etype in (
                            trio.Cancelled,
                            KeyboardInterrupt
                        ) or (
                            is_multi_cancelled(inner_err)
                        ):
                            log.cancel(
                                f"Nursery for {current_actor().uid} "
                                f"was cancelled with {etype}")
                        else:
                            log.exception(
                                f"Nursery for {current_actor().uid} "
                                f"errored with")

                        # cancel all subactors
                        await anursery.cancel()

            # ria_nursery scope end

        # TODO: this is the handler around the ``.run_in_actor()``
        # nursery. Ideally we can drop this entirely in the future as
        # the whole ``.run_in_actor()`` API should be built "on top of"
        # this lower level spawn-request-cancel "daemon actor" API where
        # a local in-actor task nursery is used with one-to-one task
        # + `await Portal.run()` calls and the results/errors are
        # handled directly (inline) and errors by the local nursery.
        except (
            Exception,
            BaseExceptionGroup,
            trio.Cancelled

        ) as err:

            # XXX: yet another guard before allowing the cancel
            # sequence in case a (single) child is in debug.
            await maybe_wait_for_debugger(
                child_in_debug=anursery._at_least_one_child_in_debug
            )

            # If actor-local error was raised while waiting on
            # ".run_in_actor()" actors then we also want to cancel all
            # remaining sub-actors (due to our lone strategy:
            # one-cancels-all).
            log.cancel(f"Nursery cancelling due to {err}")
            if anursery._children:
                # shielded so the cancel requests are still delivered
                # even if this task is itself being cancelled.
                with trio.CancelScope(shield=True):
                    await anursery.cancel()
            raise
        finally:
            # No errors were raised while awaiting ".run_in_actor()"
            # actors but those actors may have returned remote errors as
            # results (meaning they errored remotely and have relayed
            # those errors back to this parent actor). The errors are
            # collected in ``errors`` so cancel all actors, summarize
            # all errors and re-raise.
            if errors:
                if anursery._children:
                    with trio.CancelScope(shield=True):
                        await anursery.cancel()

                # use `BaseExceptionGroup` as needed
                if len(errors) > 1:
                    raise BaseExceptionGroup(
                        'tractor.ActorNursery errored with',
                        tuple(errors.values()),
                    )
                else:
                    raise list(errors.values())[0]

        # da_nursery scope end - nursery checkpoint
    # final exit


@acm
async def open_nursery(
    **kwargs,

) -> typing.AsyncGenerator[ActorNursery, None]:
    '''
    Async context manager yielding a fresh ``ActorNursery`` with which
    structured concurrent subactors can be spawned.

    If no actor runtime is active yet and we are running in the main
    process, the root actor runtime is opened implicitly (``kwargs``
    are forwarded to ``open_root_actor()``) and wraps the nursery for
    its whole lifetime. Otherwise the nursery is opened directly on top
    of the currently running actor ("sub-nursery" case).

    In both cases the nursery's ``exited`` event is set once the
    supervision block has been left.

    '''
    started_runtime = False
    actor = current_actor(err_on_no_runtime=False)

    try:
        if actor is not None or not is_main_process():
            # sub-nursery case: a runtime is already up (or we are not
            # the main process), so just supervise on top of ``actor``.
            try:
                async with _open_and_supervise_one_cancels_all_nursery(
                    actor
                ) as an:
                    yield an
            finally:
                an.exited.set()

        else:
            # we are the parent process and nothing is running yet:
            # bring up the actor runtime on the caller's behalf.
            log.info("Starting actor runtime!")

            # remember that teardown of the runtime is on us
            started_runtime = True

            async with open_root_actor(**kwargs) as actor:
                assert actor is current_actor()

                try:
                    async with _open_and_supervise_one_cancels_all_nursery(
                        actor
                    ) as an:
                        yield an
                finally:
                    an.exited.set()

    finally:
        log.debug("Nursery teardown complete")

        # report runtime shutdown only when we were the ones starting it
        if started_runtime:
            log.info("Shutting down actor tree")