Ensure all sub-services cancel on `pikerd` exit
Previously we were relying on implicit actor termination in `maybe_spawn_daemon()`, but really on `pikerd` teardown we should be sure to tear down not only all service tasks in each actor but also the actor runtimes. This adjusts `Services.cancel_service()` to only cancel the service task scope and wait on the `complete` event, and reworks the `open_context_in_task()` inner closure body to: (1) always cancel the service actor at exit; (2) not call `.cancel_service()` (potentially causing recursion issues on cancellation); (3) allocate a `complete: trio.Event` to signal full task + actor termination; and (4) pop the service task from the `.service_tasks` registry. Further, add a `maybe_set_global_registry_sockaddr()` helper-cm to do the work of checking whether a registry socket needs-to/has-been set, and use it for discovery calls to the `pikerd` service tree.
							parent
							
								
									6a1bb13feb
								
							
						
					
					
						commit
						c8c641a038
					
				
							
								
								
									
										190
									
								
								piker/_daemon.py
								
								
								
								
							
							
						
						
									
										190
									
								
								piker/_daemon.py
								
								
								
								
							|  | @ -19,7 +19,10 @@ Structured, daemon tree service management. | ||||||
| 
 | 
 | ||||||
| """ | """ | ||||||
| from typing import Optional, Union, Callable, Any | from typing import Optional, Union, Callable, Any | ||||||
| from contextlib import asynccontextmanager as acm | from contextlib import ( | ||||||
|  |     asynccontextmanager as acm, | ||||||
|  |     contextmanager as cm, | ||||||
|  | ) | ||||||
| from collections import defaultdict | from collections import defaultdict | ||||||
| 
 | 
 | ||||||
| import tractor | import tractor | ||||||
|  | @ -57,12 +60,21 @@ _root_modules = [ | ||||||
| ] | ] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | # TODO: factor this into a ``tractor.highlevel`` extension | ||||||
|  | # pack for the library. | ||||||
| class Services: | class Services: | ||||||
| 
 | 
 | ||||||
|     actor_n: tractor._supervise.ActorNursery |     actor_n: tractor._supervise.ActorNursery | ||||||
|     service_n: trio.Nursery |     service_n: trio.Nursery | ||||||
|     debug_mode: bool  # tractor sub-actor debug mode flag |     debug_mode: bool  # tractor sub-actor debug mode flag | ||||||
|     service_tasks: dict[str, tuple[trio.CancelScope, tractor.Portal]] = {} |     service_tasks: dict[ | ||||||
|  |         str, | ||||||
|  |         tuple[ | ||||||
|  |             trio.CancelScope, | ||||||
|  |             tractor.Portal, | ||||||
|  |             trio.Event, | ||||||
|  |         ] | ||||||
|  |     ] = {} | ||||||
|     locks = defaultdict(trio.Lock) |     locks = defaultdict(trio.Lock) | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|  | @ -84,7 +96,12 @@ class Services: | ||||||
|         ''' |         ''' | ||||||
|         async def open_context_in_task( |         async def open_context_in_task( | ||||||
|             task_status: TaskStatus[ |             task_status: TaskStatus[ | ||||||
|                 trio.CancelScope] = trio.TASK_STATUS_IGNORED, |                 tuple[ | ||||||
|  |                     trio.CancelScope, | ||||||
|  |                     trio.Event, | ||||||
|  |                     Any, | ||||||
|  |                 ] | ||||||
|  |             ] = trio.TASK_STATUS_IGNORED, | ||||||
| 
 | 
 | ||||||
|         ) -> Any: |         ) -> Any: | ||||||
| 
 | 
 | ||||||
|  | @ -96,28 +113,33 @@ class Services: | ||||||
|                 ) as (ctx, first): |                 ) as (ctx, first): | ||||||
| 
 | 
 | ||||||
|                     # unblock once the remote context has started |                     # unblock once the remote context has started | ||||||
|                     task_status.started((cs, first)) |                     complete = trio.Event() | ||||||
|  |                     task_status.started((cs, complete, first)) | ||||||
|                     log.info( |                     log.info( | ||||||
|                         f'`pikerd` service {name} started with value {first}' |                         f'`pikerd` service {name} started with value {first}' | ||||||
|                     ) |                     ) | ||||||
|                     try: |                     try: | ||||||
|                         # wait on any context's return value |                         # wait on any context's return value | ||||||
|  |                         # and any final portal result from the | ||||||
|  |                         # sub-actor. | ||||||
|                         ctx_res = await ctx.result() |                         ctx_res = await ctx.result() | ||||||
|                     except tractor.ContextCancelled: | 
 | ||||||
|                         return await self.cancel_service(name) |                         # NOTE: blocks indefinitely until cancelled | ||||||
|                     else: |                         # either by error from the target context | ||||||
|                         # wait on any error from the sub-actor |                         # function or by being cancelled here by the | ||||||
|                         # NOTE: this will block indefinitely until |                         # surrounding cancel scope. | ||||||
|                         # cancelled either by error from the target |  | ||||||
|                         # context function or by being cancelled here by |  | ||||||
|                         # the surrounding cancel scope |  | ||||||
|                         return (await portal.result(), ctx_res) |                         return (await portal.result(), ctx_res) | ||||||
| 
 | 
 | ||||||
|         cs, first = await self.service_n.start(open_context_in_task) |                     finally: | ||||||
|  |                         await portal.cancel_actor() | ||||||
|  |                         complete.set() | ||||||
|  |                         self.service_tasks.pop(name) | ||||||
|  | 
 | ||||||
|  |         cs, complete, first = await self.service_n.start(open_context_in_task) | ||||||
| 
 | 
 | ||||||
|         # store the cancel scope and portal for later cancellation or |         # store the cancel scope and portal for later cancellation or | ||||||
|         # retstart if needed. |         # retstart if needed. | ||||||
|         self.service_tasks[name] = (cs, portal) |         self.service_tasks[name] = (cs, portal, complete) | ||||||
| 
 | 
 | ||||||
|         return cs, first |         return cs, first | ||||||
| 
 | 
 | ||||||
|  | @ -127,13 +149,38 @@ class Services: | ||||||
|         name: str, |         name: str, | ||||||
| 
 | 
 | ||||||
|     ) -> Any: |     ) -> Any: | ||||||
|  |         ''' | ||||||
|  |         Cancel the service task and actor for the given ``name``. | ||||||
|  | 
 | ||||||
|  |         ''' | ||||||
|         log.info(f'Cancelling `pikerd` service {name}') |         log.info(f'Cancelling `pikerd` service {name}') | ||||||
|         cs, portal = self.service_tasks[name] |         cs, portal, complete = self.service_tasks[name] | ||||||
|         # XXX: not entirely sure why this is required, |  | ||||||
|         # and should probably be better fine tuned in |  | ||||||
|         # ``tractor``? |  | ||||||
|         cs.cancel() |         cs.cancel() | ||||||
|         return await portal.cancel_actor() |         await complete.wait() | ||||||
|  |         assert name not in self.service_tasks, \ | ||||||
|  |             f'Serice task for {name} not terminated?' | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @cm | ||||||
|  | def maybe_set_global_registry_sockaddr( | ||||||
|  |     registry_addr: None | tuple[str, int] = None, | ||||||
|  | ) -> None: | ||||||
|  | 
 | ||||||
|  |     global _registry_addr | ||||||
|  |     was_set: bool = False | ||||||
|  |     if ( | ||||||
|  |         _registry_addr is None | ||||||
|  |         or registry_addr | ||||||
|  |     ): | ||||||
|  |         _registry_addr = registry_addr or _default_reg_addr | ||||||
|  |         try: | ||||||
|  |             yield _registry_addr | ||||||
|  |         finally: | ||||||
|  |             # XXX: always clear the global addr if we set it so that the | ||||||
|  |             # next (set of) calls will apply whatever new one is passed | ||||||
|  |             # in. | ||||||
|  |             if was_set: | ||||||
|  |                 _registry_addr = None | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @acm | @acm | ||||||
|  | @ -155,40 +202,38 @@ async def open_pikerd( | ||||||
|     alive underling services (see below). |     alive underling services (see below). | ||||||
| 
 | 
 | ||||||
|     ''' |     ''' | ||||||
|     global _registry_addr |  | ||||||
| 
 | 
 | ||||||
|     if ( |     with maybe_set_global_registry_sockaddr(registry_addr) as reg_addr: | ||||||
|         _registry_addr is None |         async with ( | ||||||
|         or registry_addr |             tractor.open_root_actor( | ||||||
|     ): |  | ||||||
|         _registry_addr = registry_addr or _default_reg_addr |  | ||||||
| 
 | 
 | ||||||
|     # XXX: this may open a root actor as well |                 # passed through to ``open_root_actor`` | ||||||
|     async with ( |                 arbiter_addr=reg_addr, | ||||||
|         tractor.open_root_actor( |                 name=_root_dname, | ||||||
|  |                 loglevel=loglevel, | ||||||
|  |                 debug_mode=debug_mode, | ||||||
|  |                 start_method=start_method, | ||||||
| 
 | 
 | ||||||
|             # passed through to ``open_root_actor`` |                 # TODO: eventually we should be able to avoid | ||||||
|             arbiter_addr=_registry_addr, |                 # having the root have more then permissions to | ||||||
|             name=_root_dname, |                 # spawn other specialized daemons I think? | ||||||
|             loglevel=loglevel, |                 enable_modules=_root_modules, | ||||||
|             debug_mode=debug_mode, |             ) as _, | ||||||
|             start_method=start_method, |  | ||||||
| 
 | 
 | ||||||
|             # TODO: eventually we should be able to avoid |             tractor.open_nursery() as actor_nursery, | ||||||
|             # having the root have more then permissions to |         ): | ||||||
|             # spawn other specialized daemons I think? |             async with trio.open_nursery() as service_nursery: | ||||||
|             enable_modules=_root_modules, |  | ||||||
|         ) as _, |  | ||||||
| 
 | 
 | ||||||
|         tractor.open_nursery() as actor_nursery, |                 # assign globally for future daemon/task creation | ||||||
|     ): |                 Services.actor_n = actor_nursery | ||||||
|         async with trio.open_nursery() as service_nursery: |                 Services.service_n = service_nursery | ||||||
| 
 |                 Services.debug_mode = debug_mode | ||||||
|             # assign globally for future daemon/task creation |                 try: | ||||||
|             Services.actor_n = actor_nursery |                     yield | ||||||
|             Services.service_n = service_nursery |                 finally: | ||||||
|             Services.debug_mode = debug_mode |                     # if 'samplerd' in Services.service_tasks: | ||||||
|             yield |                     #     await Services.cancel_service('samplerd') | ||||||
|  |                     service_nursery.cancel_scope.cancel() | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @acm | @acm | ||||||
|  | @ -209,32 +254,24 @@ async def open_piker_runtime( | ||||||
|     existing piker actors on the local link based on configuration. |     existing piker actors on the local link based on configuration. | ||||||
| 
 | 
 | ||||||
|     ''' |     ''' | ||||||
|     global _registry_addr |     with maybe_set_global_registry_sockaddr(registry_addr) as reg_addr: | ||||||
|  |         async with ( | ||||||
|  |             tractor.open_root_actor( | ||||||
| 
 | 
 | ||||||
|     if ( |                 # passed through to ``open_root_actor`` | ||||||
|         _registry_addr is None |                 arbiter_addr=reg_addr, | ||||||
|         or registry_addr |                 name=name, | ||||||
|     ): |                 loglevel=loglevel, | ||||||
|         _registry_addr = registry_addr or _default_reg_addr |                 debug_mode=debug_mode, | ||||||
|  |                 start_method=start_method, | ||||||
| 
 | 
 | ||||||
|     # XXX: this may open a root actor as well |                 # TODO: eventually we should be able to avoid | ||||||
|     async with ( |                 # having the root have more then permissions to | ||||||
|         tractor.open_root_actor( |                 # spawn other specialized daemons I think? | ||||||
| 
 |                 enable_modules=_root_modules + enable_modules, | ||||||
|             # passed through to ``open_root_actor`` |             ) as _, | ||||||
|             arbiter_addr=_registry_addr, |         ): | ||||||
|             name=name, |             yield tractor.current_actor() | ||||||
|             loglevel=loglevel, |  | ||||||
|             debug_mode=debug_mode, |  | ||||||
|             start_method=start_method, |  | ||||||
| 
 |  | ||||||
|             # TODO: eventually we should be able to avoid |  | ||||||
|             # having the root have more then permissions to |  | ||||||
|             # spawn other specialized daemons I think? |  | ||||||
|             enable_modules=_root_modules + enable_modules, |  | ||||||
|         ) as _, |  | ||||||
|     ): |  | ||||||
|         yield tractor.current_actor() |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @acm | @acm | ||||||
|  | @ -325,6 +362,11 @@ async def find_service( | ||||||
|     service_name: str, |     service_name: str, | ||||||
| ) -> Optional[tractor.Portal]: | ) -> Optional[tractor.Portal]: | ||||||
| 
 | 
 | ||||||
|  |     global _registry_addr | ||||||
|  |     if not _registry_addr: | ||||||
|  |         yield None | ||||||
|  |         return | ||||||
|  | 
 | ||||||
|     log.info(f'Scanning for service `{service_name}`') |     log.info(f'Scanning for service `{service_name}`') | ||||||
|     # attach to existing daemon by name if possible |     # attach to existing daemon by name if possible | ||||||
|     async with tractor.find_actor( |     async with tractor.find_actor( | ||||||
|  | @ -342,6 +384,10 @@ async def check_for_service( | ||||||
|     Service daemon "liveness" predicate. |     Service daemon "liveness" predicate. | ||||||
| 
 | 
 | ||||||
|     ''' |     ''' | ||||||
|  |     global _registry_addr | ||||||
|  |     if not _registry_addr: | ||||||
|  |         return None | ||||||
|  | 
 | ||||||
|     async with tractor.query_actor( |     async with tractor.query_actor( | ||||||
|         service_name, |         service_name, | ||||||
|         arbiter_sockaddr=_registry_addr, |         arbiter_sockaddr=_registry_addr, | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue