From de6189da4d34c4663c0fd317000caca456e19712 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Wed, 21 Aug 2024 12:16:17 -0400
Subject: [PATCH] Drop `.cancel_actor()` from `maybe_spawn_daemon()`

Since `tractor`'s new and improved inter-actor cancellation semantics
are much more pedantic, AND bc we use the `ServiceMngr` for spawning
service actors on-demand, the caller of `maybe_spawn_daemon()` should
NEVER conduct a so called "out of band" `Actor`-runtime cancel request
since this is precisely the job of our `ServiceMngr` XD

Add a super in depth note explaining the underlying issue and adding
a todo list of how we should prolly augment `tractor` to make such cases
easier to grok and fix in the future!
---
 piker/service/_daemon.py | 60 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 59 insertions(+), 1 deletion(-)

diff --git a/piker/service/_daemon.py b/piker/service/_daemon.py
index a76918ec..0cb57291 100644
--- a/piker/service/_daemon.py
+++ b/piker/service/_daemon.py
@@ -134,7 +134,65 @@ async def maybe_spawn_daemon(
         async with tractor.wait_for_actor(service_name) as portal:
             lock.release()
             yield portal
-            await portal.cancel_actor()
+            # --- ---- ---
+            # XXX NOTE XXX
+            # --- ---- ---
+            # DO NOT PUT A `portal.cancel_actor()` here (as was prior)!
+            #
+            # Doing so will cause an "out-of-band" ctxc
+            # (`tractor.ContextCancelled`) to be raised inside the
+            # `ServiceMngr.open_context_in_task()`'s call to
+            # `ctx.wait_for_result()` AND the internal self-ctxc
+            # "graceful capture" WILL NOT CATCH IT!
+            #
+            # This can cause certain types of operations to raise
+            # that ctxc BEFORE THEY `return`, resulting in
+            # a "false-negative" ctxc being raised when really
+            # nothing actually failed, other then our semantic
+            # "failure" to suppress an expected, graceful,
+            # self-cancel scenario..
+            #
+            # bUt wHy duZ It WorK lIKe dis..
+            # ------------------------------
+            # from the perspective of the `tractor.Context` this
+            # cancel request was conducted "out of band" since
+            # `Context.cancel()` was never called and thus the
+            # `._cancel_called: bool` was never set. Despite the
+            # remote `.canceller` being set to `pikerd` (i.e. the
+            # same `Actor.uid` of the raising service-mngr task) the
+            # service-task's ctx itself was never marked as having
+            # requested cancellation and thus still raises the ctxc
+            # bc it was unaware of any such request.
+            #
+            # How to make grokin these cases easier tho?
+            # ------------------------------------------
+            # Because `Portal.cancel_actor()` was called it requests
+            # "full-`Actor`-runtime-cancellation" of it's peer
+            # process which IS NOT THE SAME as a single inter-actor
+            # RPC task cancelling its local context with a remote
+            # peer `Task` in that same peer process.
+            #
+            # ?TODO? It might be better if we do one (or all) of the
+            # following:
+            #
+            # -[ ] at least set a special message for the
+            #    `ContextCancelled` when raised locally by the
+            #    unaware ctx task such that we check for the
+            #    `.canceller` being *our `Actor`* and in the case
+            #    where `Context._cancel_called == False` we specially
+            #    note that this is likely an "out-of-band"
+            #    runtime-cancel request triggered by some call to
+            #    `Portal.cancel_actor()`, possibly even reporting the
+            #    exact LOC of that caller by tracking it inside our
+            #    portal-type?
+            # -[ ] possibly add another field `ContextCancelled` like
+            #    maybe a,
+            #    `.request_type: Literal['os', 'proc', 'actor',
+            #    'ctx']` type thing which would allow immediately
+            #    being able to tell what kind of cancellation caused
+            #    the unexpected ctxc?
+            # -[ ] REMOVE THIS COMMENT, once we've settled on how to
+            #     better augment `tractor` to be more explicit on this!
 
 
 async def spawn_emsd(