From 4f3479778024c21f4167656fbafc3ec8f616a6b9 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet
Date: Wed, 21 Aug 2024 12:16:17 -0400
Subject: [PATCH] Drop `.cancel_actor()` from `maybe_spawn_daemon()`

Since `tractor`'s new and improved inter-actor cancellation semantics
are much more pedantic, AND bc we use the `ServiceMngr` for spawning
service actors on-demand, the caller of `maybe_spawn_daemon()` should
NEVER conduct a so-called "out of band" `Actor`-runtime cancel request
since this is precisely the job of our `ServiceMngr` XD

Add a super in-depth note explaining the underlying issue and a todo
list of how we should prolly augment `tractor` to make such cases
easier to grok and fix in the future!
---
 piker/service/_daemon.py | 60 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 59 insertions(+), 1 deletion(-)

diff --git a/piker/service/_daemon.py b/piker/service/_daemon.py
index a76918ec..0cb57291 100644
--- a/piker/service/_daemon.py
+++ b/piker/service/_daemon.py
@@ -134,7 +134,65 @@ async def maybe_spawn_daemon(
     async with tractor.wait_for_actor(service_name) as portal:
         lock.release()
         yield portal
-        await portal.cancel_actor()
+        # --- ---- ---
+        # XXX NOTE XXX
+        # --- ---- ---
+        # DO NOT PUT A `portal.cancel_actor()` here (as was prior)!
+        #
+        # Doing so will cause an "out-of-band" ctxc
+        # (`tractor.ContextCancelled`) to be raised inside the
+        # `ServiceMngr.open_context_in_task()`'s call to
+        # `ctx.wait_for_result()` AND the internal self-ctxc
+        # "graceful capture" WILL NOT CATCH IT!
+        #
+        # This can cause certain types of operations to raise
+        # that ctxc BEFORE THEY `return`, resulting in
+        # a "false-negative" ctxc being raised when really
+        # nothing actually failed, other than our semantic
+        # "failure" to suppress an expected, graceful,
+        # self-cancel scenario..
+        #
+        # bUt wHy duZ It WorK lIKe dis..
+        # ------------------------------
+        # from the perspective of the `tractor.Context` this
+        # cancel request was conducted "out of band" since
+        # `Context.cancel()` was never called and thus the
+        # `._cancel_called: bool` was never set. Despite the
+        # remote `.canceller` being set to `pikerd` (i.e. the
+        # same `Actor.uid` of the raising service-mngr task) the
+        # service-task's ctx itself was never marked as having
+        # requested cancellation and thus still raises the ctxc
+        # bc it was unaware of any such request.
+        #
+        # How to make grokin these cases easier tho?
+        # ------------------------------------------
+        # Because `Portal.cancel_actor()` was called it requests
+        # "full-`Actor`-runtime-cancellation" of its peer
+        # process which IS NOT THE SAME as a single inter-actor
+        # RPC task cancelling its local context with a remote
+        # peer `Task` in that same peer process.
+        #
+        # ?TODO? It might be better if we do one (or all) of the
+        # following:
+        #
+        # -[ ] at least set a special message for the
+        #    `ContextCancelled` when raised locally by the
+        #    unaware ctx task such that we check for the
+        #    `.canceller` being *our `Actor`* and in the case
+        #    where `Context._cancel_called == False` we specially
+        #    note that this is likely an "out-of-band"
+        #    runtime-cancel request triggered by some call to
+        #    `Portal.cancel_actor()`, possibly even reporting the
+        #    exact LOC of that caller by tracking it inside our
+        #    portal-type?
+        # -[ ] possibly add another field to `ContextCancelled`,
+        #    like maybe a,
+        #    `.request_type: Literal['os', 'proc', 'actor',
+        #    'ctx']` type thing which would allow immediately
+        #    being able to tell what kind of cancellation caused
+        #    the unexpected ctxc?
+        # -[ ] REMOVE THIS COMMENT, once we've settled on how to
+        #    better augment `tractor` to be more explicit on this!


 async def spawn_emsd(
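
For the record, here's a rough sketch of the two cancellation flows the
note above is distinguishing. This is illustrative only (not part of the
applied diff): `echo_service()`, `open_and_wait()` and `naive_caller()`
are hypothetical stand-ins, and the exact `tractor` call spellings
(`ctx.wait_for_result()`, `._cancel_called`, etc.) follow the comment
text and may differ slightly across `tractor` versions:

    import trio
    import tractor


    @tractor.context
    async def echo_service(ctx: tractor.Context) -> None:
        # hypothetical long-lived service task running in the daemon actor
        await ctx.started()
        await trio.sleep_forever()


    async def open_and_wait(portal: tractor.Portal) -> None:
        # roughly the `ServiceMngr` pattern: it owns the ctx, so when
        # *it* calls `ctx.cancel()` the resulting `ContextCancelled` is
        # an expected, graceful self-cancel and can be suppressed.
        async with portal.open_context(echo_service) as (ctx, _first):
            try:
                await ctx.wait_for_result()
            except tractor.ContextCancelled:
                if ctx._cancel_called:
                    # graceful: this side requested the cancel itself
                    return
                # out-of-band: some other code (e.g. a stray
                # `portal.cancel_actor()` by the daemon's caller) tore
                # down the peer actor without this ctx ever asking.
                raise


    async def naive_caller(portal: tractor.Portal) -> None:
        # the pattern this patch removes: a full-`Actor`-runtime cancel
        # the service-task's ctx never requested, which surfaces in
        # `open_and_wait()` as an unexpected `ContextCancelled`.
        await portal.cancel_actor()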