# piker/piker/service/_daemon.py

# piker: trading gear for hackers
# Copyright (C) Tyler Goodlet (in stewardship for pikers)

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

"""
Daemon-actor spawning "endpoint-hooks".

"""
from __future__ import annotations
from typing import (
    Callable,
    Any,
)
from contextlib import (
    asynccontextmanager as acm,
)
from collections import defaultdict

import tractor
import trio

from ._util import (
    log,  # sub-sys logger
)
from ._mngr import (
    get_service_mngr,
    ServiceMngr,
)
from ._actor_runtime import maybe_open_pikerd
from ._registry import find_service


@acm
async def maybe_spawn_daemon(
    service_name: str,
    service_task_target: Callable,
    spawn_args: dict[str, Any],

    loglevel: str | None = None,
    singleton: bool = False,

    _locks = defaultdict(trio.Lock),
    **pikerd_kwargs,

) -> tractor.Portal:
    '''
    If no ``service_name`` daemon-actor can be found,
    spawn one in a local subactor and return a portal to it.

    If this function is called from a non-pikerd actor, the
    spawned service will persist as long as pikerd does or
    it is requested to be cancelled.

    This can be seen as a service starting api for remote-actor
    clients.

    Parameters:

    - ``service_name``: name used to look up the daemon via
      ``find_service()`` and later ``tractor.wait_for_actor()``.
    - ``service_task_target``: async callable which actually starts
      the service; called directly when we are the root (the
      ``maybe_open_pikerd()`` portal is ``None``), otherwise
      scheduled via ``pikerd_portal.run()``.
    - ``spawn_args``: keyword arguments passed through to
      ``service_task_target``.
    - ``loglevel``: forwarded to both ``maybe_open_pikerd()`` and
      ``service_task_target``.
    - ``singleton``: accepted for interface compatibility; it is not
      read anywhere in this function body.
    - ``_locks``: per-``service_name`` lock table; since it is
      a default argument value the same table is shared by every
      call that doesn't override it.
    - ``pikerd_kwargs``: forwarded verbatim to
      ``maybe_open_pikerd()``.

    Yields the portal delivered either by ``find_service()`` (daemon
    already exists) or by ``tractor.wait_for_actor()`` (daemon was
    just spawned).

    The per-service lock is held only during discovery/spawning and
    is always released before the portal is yielded; if anything
    raises (or is cancelled) before that point the lock is released
    on the way out so later callers are not blocked forever.

    '''
    # serialize access to this section to avoid
    # 2 or more tasks racing to create a daemon
    lock = _locks[service_name]
    await lock.acquire()

    # whether *this* call still owns `lock`; flipped exactly once so
    # that the success paths (release before `yield`) and the
    # error/cancel path (release in `finally`) never double-release.
    lock_held: bool = True

    def release_lock() -> None:
        nonlocal lock_held
        if lock_held:
            lock_held = False
            lock.release()

    try:
        # NOTE(review): the registry address is pinned here and is
        # not derived from `pikerd_kwargs` - confirm this is intended.
        async with find_service(
            service_name,
            registry_addrs=[('127.0.0.1', 6116)],
        ) as portal:
            if portal is not None:
                release_lock()
                yield portal
                return

        log.warning(
            f"Couldn't find any existing {service_name}\n"
            'Attempting to spawn new daemon-service..'
        )

        # ask root ``pikerd`` daemon to spawn the daemon we need if
        # pikerd is not live we now become the root of the
        # process tree
        async with maybe_open_pikerd(
            loglevel=loglevel,
            **pikerd_kwargs,

        ) as pikerd_portal:

            # we are the root and thus are `pikerd`
            # so spawn the target service directly by calling
            # the provided target routine.
            # XXX: this assumes that the target is well formed and will
            # do the right things to setup both a sub-actor **and** call
            # the ``_Services`` api from above to start the top level
            # service task for that actor.
            started: bool
            if pikerd_portal is None:
                started = await service_task_target(
                    loglevel=loglevel,
                    **spawn_args,
                )

            else:
                # request a remote `pikerd` (service manager) to start
                # the target daemon-task, the target can't return
                # a non-serializable value since it is expected that
                # service starting is non-blocking and the target task
                # will persist running "under" or "within" the `pikerd`
                # actor tree after the requesting client disconnects.
                # in other words this spawns a persistent daemon actor
                # that continues to live for the lifespan of whatever
                # the service manager inside `pikerd` says it should.
                started = await pikerd_portal.run(
                    service_task_target,
                    loglevel=loglevel,
                    **spawn_args,
                )

            if started:
                log.info(f'Service {service_name} started!')

            # block until we can discover (by IPC connection) to the
            # newly spawned daemon-actor and then deliver the portal to
            # the caller.
            async with tractor.wait_for_actor(service_name) as portal:
                release_lock()
                yield portal
                # --- ---- ---
                # XXX NOTE XXX
                # --- ---- ---
                # DO NOT PUT A `portal.cancel_actor()` here (as was
                # prior)!
                #
                # Doing so will cause an "out-of-band" ctxc
                # (`tractor.ContextCancelled`) to be raised inside the
                # `ServiceMngr.open_context_in_task()`'s call to
                # `ctx.wait_for_result()` AND the internal self-ctxc
                # "graceful capture" WILL NOT CATCH IT!
                #
                # This can cause certain types of operations to raise
                # that ctxc BEFORE THEY `return`, resulting in
                # a "false-negative" ctxc being raised when really
                # nothing actually failed, other than our semantic
                # "failure" to suppress an expected, graceful,
                # self-cancel scenario..
                #
                # But why does it work like this?
                # -------------------------------
                # from the perspective of the `tractor.Context` this
                # cancel request was conducted "out of band" since
                # `Context.cancel()` was never called and thus the
                # `._cancel_called: bool` was never set. Despite the
                # remote `.canceller` being set to `pikerd` (i.e. the
                # same `Actor.uid` of the raising service-mngr task)
                # the service-task's ctx itself was never marked as
                # having requested cancellation and thus still raises
                # the ctxc bc it was unaware of any such request.
                #
                # How to make grokking these cases easier tho?
                # --------------------------------------------
                # Because `Portal.cancel_actor()` was called it
                # requests "full-`Actor`-runtime-cancellation" of its
                # peer process which IS NOT THE SAME as a single
                # inter-actor RPC task cancelling its local context
                # with a remote peer `Task` in that same peer process.
                #
                # ?TODO? It might be better if we do one (or all) of
                # the following:
                #
                # -[ ] at least set a special message for the
                #    `ContextCancelled` when raised locally by the
                #    unaware ctx task such that we check for the
                #    `.canceller` being *our `Actor`* and in the case
                #    where `Context._cancel_called == False` we
                #    specially note that this is likely an
                #    "out-of-band" runtime-cancel request triggered by
                #    some call to `Portal.cancel_actor()`, possibly
                #    even reporting the exact LOC of that caller by
                #    tracking it inside our portal-type?
                # -[ ] possibly add another field `ContextCancelled`
                #    like maybe a,
                #    `.request_type: Literal['os', 'proc', 'actor',
                #    'ctx']` type thing which would allow immediately
                #    being able to tell what kind of cancellation
                #    caused the unexpected ctxc?
                # -[ ] REMOVE THIS COMMENT, once we've settled on how
                #     to better augment `tractor` to be more explicit
                #     on this!

    finally:
        # only does work when we error/cancel out BEFORE reaching
        # either `yield` above, in which case we are necessarily
        # still inside the task which acquired the lock; without this
        # the lock would stay held forever and every later caller for
        # this `service_name` would hang on `lock.acquire()`.
        release_lock()


async def spawn_emsd(

    loglevel: str | None = None,
    **extra_tractor_kwargs

) -> bool:
    '''
    Boot the clearing engine (the 'emsd' actor) under ``pikerd``.

    Two steps are run in sequence using the service manager
    delivered by ``get_service_mngr()``:

    - an actor named 'emsd' is started through
      ``actor_n.start_actor()`` with the clearing modules passed as
      ``enable_modules`` and any ``extra_tractor_kwargs`` forwarded,
    - the ``_setup_persistent_emsd`` endpoint is handed, along with
      the new actor's portal, to ``start_service_task()``.

    Returns ``True`` once both calls have completed.

    '''
    log.info('Spawning emsd')

    mngr: ServiceMngr = get_service_mngr()

    # modules handed to the new actor as its `enable_modules`
    clearing_mods: list[str] = [
        'piker.clearing._ems',
        'piker.clearing._client',
    ]
    emsd_portal = await mngr.actor_n.start_actor(
        'emsd',
        enable_modules=clearing_mods,
        loglevel=loglevel,
        debug_mode=mngr.debug_mode,  # set by pikerd flag
        **extra_tractor_kwargs
    )

    # function-local import of the root-task endpoint which does
    # the (non-blocking) setup of the clearing service
    from ..clearing._ems import _setup_persistent_emsd

    await mngr.start_service_task(
        'emsd',
        emsd_portal,
        _setup_persistent_emsd,  # target root-task endpoint
        loglevel=loglevel,
    )
    return True


@acm
async def maybe_open_emsd(

    brokername: str,
    loglevel: str | None = None,

    **pikerd_kwargs,

) -> tractor.Portal:  # noqa
    '''
    Deliver a portal to the 'emsd' daemon-actor.

    Thin wrapper around ``maybe_spawn_daemon()`` which uses
    ``spawn_emsd()`` as the spawn target with no extra spawn
    arguments; ``loglevel`` and all ``pikerd_kwargs`` are passed
    straight through.

    ``brokername`` is accepted but is not read by this wrapper.

    '''
    # build the (not yet entered) daemon-spawning ctx mngr..
    emsd_cm = maybe_spawn_daemon(
        'emsd',
        service_task_target=spawn_emsd,
        spawn_args={},
        loglevel=loglevel,
        **pikerd_kwargs,
    )

    # ..then enter it and relay the resulting portal to our caller.
    async with emsd_cm as emsd_portal:
        yield emsd_portal