Ensure all sub-services cancel on `pikerd` exit

Previously we were relying on implicit actor termination in
`maybe_spawn_daemon()`, but on `pikerd` teardown we should be sure to
tear down not only all service tasks in each actor but also the actor
runtimes themselves. This adjusts `Services.cancel_service()` to only
cancel the service task scope and wait on the `complete` event, and
reworks the `open_context_in_task()` inner closure body to (a generic
sketch of the pattern follows this list),

- always cancel the service actor at exit.
- no longer call `.cancel_service()` (which could otherwise cause
  recursion issues on cancellation).
- allocate a `complete: trio.Event` to signal full task + actor
  termination.
- pop the service task from the `.service_tasks` registry.
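
For reference, a minimal sketch of the cancel-scope + completion-event
teardown pattern (plain `trio`, not piker's actual `Services` API; the
`service_task`/`cancel_service` names here are illustrative):

    import trio

    service_tasks: dict[str, tuple[trio.CancelScope, trio.Event]] = {}

    async def service_task(
        name: str,
        task_status=trio.TASK_STATUS_IGNORED,
    ) -> None:
        # stand-in for the `open_context_in_task()` closure reworked
        # in this commit.
        with trio.CancelScope() as cs:
            complete = trio.Event()
            task_status.started((cs, complete))
            try:
                await trio.sleep_forever()  # the service "runs" here
            finally:
                # always signal full termination and drop the
                # registry entry, as in the new `finally:` block.
                complete.set()
                service_tasks.pop(name)

    async def cancel_service(name: str) -> None:
        # only cancel the task scope, then wait on the `complete`
        # event to confirm full teardown.
        cs, complete = service_tasks[name]
        cs.cancel()
        await complete.wait()
        assert name not in service_tasks

    async def main():
        async with trio.open_nursery() as nursery:
            cs, complete = await nursery.start(service_task, 'samplerd')
            service_tasks['samplerd'] = (cs, complete)
            await cancel_service('samplerd')

    trio.run(main)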

Further, add a `maybe_set_global_registry_sockaddr()` helper-cm which
does the work of checking whether a registry socket address needs to
be (or already has been) set, and use it for discovery calls to the
`pikerd` service tree; a short usage sketch follows.
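
Usage sketch of the helper-cm's set-once/clear-on-exit semantics
(the address value here is illustrative, not piker's actual default):

    # the outermost caller sets the global registry sockaddr
    # (falling back to `_default_reg_addr`) and clears it on exit ...
    with maybe_set_global_registry_sockaddr(('127.0.0.1', 6000)) as addr:

        # ... while nested calls simply re-use the already-set value.
        with maybe_set_global_registry_sockaddr() as inner_addr:
            assert inner_addr == addr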
samplerd_service
Tyler Goodlet 2023-01-10 15:25:25 -05:00
parent 6a1bb13feb
commit c8c641a038
1 changed file with 118 additions and 72 deletions


@@ -19,7 +19,10 @@ Structured, daemon tree service management.
 """
 from typing import Optional, Union, Callable, Any
-from contextlib import asynccontextmanager as acm
+from contextlib import (
+    asynccontextmanager as acm,
+    contextmanager as cm,
+)
 from collections import defaultdict

 import tractor
@@ -57,12 +60,21 @@ _root_modules = [
 ]


+# TODO: factor this into a ``tractor.highlevel`` extension
+# pack for the library.
 class Services:

     actor_n: tractor._supervise.ActorNursery
     service_n: trio.Nursery
     debug_mode: bool  # tractor sub-actor debug mode flag
-    service_tasks: dict[str, tuple[trio.CancelScope, tractor.Portal]] = {}
+    service_tasks: dict[
+        str,
+        tuple[
+            trio.CancelScope,
+            tractor.Portal,
+            trio.Event,
+        ]
+    ] = {}
     locks = defaultdict(trio.Lock)

     @classmethod
@@ -84,7 +96,12 @@ class Services:
         '''
         async def open_context_in_task(
             task_status: TaskStatus[
-                trio.CancelScope] = trio.TASK_STATUS_IGNORED,
+                tuple[
+                    trio.CancelScope,
+                    trio.Event,
+                    Any,
+                ]
+            ] = trio.TASK_STATUS_IGNORED,

         ) -> Any:
@@ -96,28 +113,33 @@ class Services:
             ) as (ctx, first):

                 # unblock once the remote context has started
-                task_status.started((cs, first))
+                complete = trio.Event()
+                task_status.started((cs, complete, first))
                 log.info(
                     f'`pikerd` service {name} started with value {first}'
                 )
                 try:
                     # wait on any context's return value
+                    # and any final portal result from the
+                    # sub-actor.
                     ctx_res = await ctx.result()
-                except tractor.ContextCancelled:
-                    return await self.cancel_service(name)
-                else:
-                    # wait on any error from the sub-actor
-                    # NOTE: this will block indefinitely until
-                    # cancelled either by error from the target
-                    # context function or by being cancelled here by
-                    # the surrounding cancel scope
+
+                    # NOTE: blocks indefinitely until cancelled
+                    # either by error from the target context
+                    # function or by being cancelled here by the
+                    # surrounding cancel scope.
                     return (await portal.result(), ctx_res)

-        cs, first = await self.service_n.start(open_context_in_task)
+                finally:
+                    await portal.cancel_actor()
+                    complete.set()
+                    self.service_tasks.pop(name)
+
+        cs, complete, first = await self.service_n.start(open_context_in_task)

         # store the cancel scope and portal for later cancellation or
         # restart if needed.
-        self.service_tasks[name] = (cs, portal)
+        self.service_tasks[name] = (cs, portal, complete)

         return cs, first
@@ -127,13 +149,38 @@ class Services:
         name: str,

     ) -> Any:
+        '''
+        Cancel the service task and actor for the given ``name``.
+
+        '''
         log.info(f'Cancelling `pikerd` service {name}')
-        cs, portal = self.service_tasks[name]
-        # XXX: not entirely sure why this is required,
-        # and should probably be better fine tuned in
-        # ``tractor``?
+        cs, portal, complete = self.service_tasks[name]
         cs.cancel()
-        return await portal.cancel_actor()
+        await complete.wait()
+        assert name not in self.service_tasks, \
+            f'Service task for {name} not terminated?'
+
+
+@cm
+def maybe_set_global_registry_sockaddr(
+    registry_addr: None | tuple[str, int] = None,
+) -> None:
+
+    global _registry_addr
+    was_set: bool = False
+    if (
+        _registry_addr is None
+        or registry_addr
+    ):
+        _registry_addr = registry_addr or _default_reg_addr
+        was_set = True
+
+    try:
+        yield _registry_addr
+    finally:
+        # XXX: always clear the global addr if we set it so that the
+        # next (set of) calls will apply whatever new one is passed
+        # in.
+        if was_set:
+            _registry_addr = None


 @acm
@@ -155,40 +202,38 @@ async def open_pikerd(
     alive underling services (see below).

     '''
-    global _registry_addr
-
-    if (
-        _registry_addr is None
-        or registry_addr
-    ):
-        _registry_addr = registry_addr or _default_reg_addr
-
-    # XXX: this may open a root actor as well
-    async with (
-        tractor.open_root_actor(
-
-            # passed through to ``open_root_actor``
-            arbiter_addr=_registry_addr,
-            name=_root_dname,
-            loglevel=loglevel,
-            debug_mode=debug_mode,
-            start_method=start_method,
-
-            # TODO: eventually we should be able to avoid
-            # having the root have more than permissions to
-            # spawn other specialized daemons I think?
-            enable_modules=_root_modules,
-        ) as _,
-
-        tractor.open_nursery() as actor_nursery,
-    ):
-        async with trio.open_nursery() as service_nursery:
-
-            # assign globally for future daemon/task creation
-            Services.actor_n = actor_nursery
-            Services.service_n = service_nursery
-            Services.debug_mode = debug_mode
-            yield
+    with maybe_set_global_registry_sockaddr(registry_addr) as reg_addr:
+        async with (
+            tractor.open_root_actor(
+
+                # passed through to ``open_root_actor``
+                arbiter_addr=reg_addr,
+                name=_root_dname,
+                loglevel=loglevel,
+                debug_mode=debug_mode,
+                start_method=start_method,
+
+                # TODO: eventually we should be able to avoid
+                # having the root have more than permissions to
+                # spawn other specialized daemons I think?
+                enable_modules=_root_modules,
+            ) as _,
+
+            tractor.open_nursery() as actor_nursery,
+        ):
+            async with trio.open_nursery() as service_nursery:
+
+                # assign globally for future daemon/task creation
+                Services.actor_n = actor_nursery
+                Services.service_n = service_nursery
+                Services.debug_mode = debug_mode
+
+                try:
+                    yield
+                finally:
+                    # if 'samplerd' in Services.service_tasks:
+                    #     await Services.cancel_service('samplerd')
+                    service_nursery.cancel_scope.cancel()


 @acm
@@ -209,32 +254,24 @@ async def open_piker_runtime(
     existing piker actors on the local link based on configuration.

     '''
-    global _registry_addr
-
-    if (
-        _registry_addr is None
-        or registry_addr
-    ):
-        _registry_addr = registry_addr or _default_reg_addr
-
-    # XXX: this may open a root actor as well
-    async with (
-        tractor.open_root_actor(
-
-            # passed through to ``open_root_actor``
-            arbiter_addr=_registry_addr,
-            name=name,
-            loglevel=loglevel,
-            debug_mode=debug_mode,
-            start_method=start_method,
-
-            # TODO: eventually we should be able to avoid
-            # having the root have more than permissions to
-            # spawn other specialized daemons I think?
-            enable_modules=_root_modules + enable_modules,
-        ) as _,
-    ):
-        yield tractor.current_actor()
+    with maybe_set_global_registry_sockaddr(registry_addr) as reg_addr:
+        async with (
+            tractor.open_root_actor(
+
+                # passed through to ``open_root_actor``
+                arbiter_addr=reg_addr,
+                name=name,
+                loglevel=loglevel,
+                debug_mode=debug_mode,
+                start_method=start_method,
+
+                # TODO: eventually we should be able to avoid
+                # having the root have more than permissions to
+                # spawn other specialized daemons I think?
+                enable_modules=_root_modules + enable_modules,
+            ) as _,
+        ):
+            yield tractor.current_actor()


 @acm
@@ -325,6 +362,11 @@ async def find_service(
     service_name: str,

 ) -> Optional[tractor.Portal]:

+    global _registry_addr
+    if not _registry_addr:
+        yield None
+        return
+
     log.info(f'Scanning for service `{service_name}`')
     # attach to existing daemon by name if possible
     async with tractor.find_actor(
@@ -342,6 +384,10 @@ async def check_for_service(
     Service daemon "liveness" predicate.

     '''
+    global _registry_addr
+    if not _registry_addr:
+        return None
+
     async with tractor.query_actor(
         service_name,
         arbiter_sockaddr=_registry_addr,
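
As a closing usage note, a hedged sketch of the new discovery guard
(assumes the `find_service()`/`check_for_service()` signatures shown
above and that no `pikerd` tree, and thus no registry sockaddr, has
been set up):

    import trio

    async def main():
        # outside any `pikerd` tree `_registry_addr` is never set, so
        # discovery calls now short-circuit instead of erroring:
        async with find_service('samplerd') as portal:
            assert portal is None  # no registry -> no lookup

        assert await check_for_service('samplerd') is None

    trio.run(main)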