From 534e5d150d24fa965319193ca52415a9c2bd505a Mon Sep 17 00:00:00 2001
From: Tyler Goodlet
Date: Tue, 17 Oct 2023 15:30:16 -0400
Subject: [PATCH] Drop `msg` kwarg from `Context.cancel()`

Well first off, turns out it's never used and generally speaking
doesn't seem to help much with "runtime hacking/debugging"; why would
we need to "fabricate" a msg when `.cancel()` is called to self-cancel?

Also (and since `._maybe_cancel_and_set_remote_error()` now takes an
`error: BaseException` as input and thus expects error-msg unpacking
prior to being called), we now manually set `Context._cancel_msg: dict`
just prior to any remote error assignment - so any case where we would
have fabbed a "cancel msg" near calling `.cancel()`, just do the manual
assign.

In this vein some other subtle changes:
- obviously don't set `._cancel_msg` in `.cancel()` since it's no
  longer an input.
- generally do walrus-style `error := unpack_error()` before applying
  and setting remote error-msg state.
- always raise any `._remote_error` in `.result()` instead of returning
  the exception instance, and check both before AND after the
  underlying mem chan read.
- add notes/todos around the `raise self._remote_error from None`
  masking of (runtime) errors in `._maybe_raise_remote_err()` and use
  it inside `.result()` since we had the inverse duplicate logic there
  anyway.

Further, this adds and extends a ton of (internal) interface docs and
detailed comments around the `Context` API including many subtleties
pertaining to calling `._maybe_cancel_and_set_remote_error()`.
---
 tractor/_context.py | 288 ++++++++++++++++++++++++++++----------------
 1 file changed, 183 insertions(+), 105 deletions(-)

diff --git a/tractor/_context.py b/tractor/_context.py
index 0df1e80..c14f16b 100644
--- a/tractor/_context.py
+++ b/tractor/_context.py
@@ -86,30 +86,51 @@ class Context:

     '''
     chan: Channel
-    cid: str
+    cid: str  # "context id", more or less a unique linked-task-pair id

-    # these are the "feeder" channels for delivering
-    # message values to the local task from the runtime
-    # msg processing loop.
+    # the "feeder" channels for delivering message values to the
+    # local task from the runtime's msg processing loop.
     _recv_chan: trio.MemoryReceiveChannel
     _send_chan: trio.MemorySendChannel

+    # the "invocation type" of the far end task-entry-point
+    # function, normally matching a logic block inside
+    # `._runtime.invoke()`.
     _remote_func_type: str | None = None

-    # only set on the caller side
-    _portal: Portal | None = None  # type: ignore # noqa
+    # NOTE: (for now) only set (a portal) on the caller side since
+    # the callee doesn't generally need a ref to one and should
+    # normally have to explicitly ask for a handle to its peer if
+    # more than the `Context` is needed?
+    _portal: Portal | None = None
+
+    # NOTE: each side of the context has its own cancel scope
+    # which is exactly the primitive that allows for
+    # cross-actor-task-supervision and thus SC.
+    _scope: trio.CancelScope | None = None
     _result: Any | int = None
     _remote_error: BaseException | None = None

     # cancellation state
     _cancel_called: bool = False  # did WE cancel the far end?
     _cancelled_remote: tuple[str, str] | None = None
-    _cancel_msg: str | None = None
-    _scope: trio.CancelScope | None = None
-
-    # NOTE: this is set by the `.devx._debug` machinery
-    # to indicate whether code in `._runtime` should handle
-    # cancelled context crashes in the pdbp REPL.
+    # NOTE: we try to ensure assignment of a "cancel msg" since
+    # there's always going to be an "underlying reason" that any
+    # context was closed due to either a remote side error or
+    # a call to `.cancel()` which triggers `ContextCancelled`.
+    _cancel_msg: str | dict | None = None
+
+    # NOTE: this state var is used by the runtime to determine if the
+    # `pdbp` REPL is allowed to engage on contexts terminated via
+    # a `ContextCancelled` due to a call to `.cancel()` triggering
+    # "graceful closure" on either side:
+    # - `._runtime._invoke()` will check this flag before engaging
+    #   the crash handler REPL in such cases where the "callee"
+    #   raises the cancellation,
+    # - `.devx._debug.lock_tty_for_child()` will set it to `False` if
+    #   the global tty-lock has been configured to filter out some
+    #   actors from being able to acquire the debugger lock.
     _enter_debugger_on_cancel: bool = True

     @property
@@ -177,36 +198,71 @@ class Context:

     async def _maybe_cancel_and_set_remote_error(
         self,
-        error_msg: dict[str, Any],
+        error: BaseException,

     ) -> None:
         '''
-        (Maybe) unpack and raise a msg error into the local scope
-        nursery for this context.
+        (Maybe) cancel this local scope due to a received remote
+        error (normally via an IPC msg) which the actor runtime
+        routes to this context.

-        Acts as a form of "relay" for a remote error raised
-        in the corresponding remote callee task.
+        Acts as a form of "relay" for a remote error raised in the
+        corresponding remote task's `Context` wherein the next time
+        the local task executes a checkpoint, a `trio.Cancelled`
+        will be raised and, depending on the type and source of the
+        original remote error and whether or not the local task
+        called `.cancel()` itself prior, an equivalent
+        `ContextCancelled` or `RemoteActorError` wrapping the
+        remote error may be raised here by any of,
+
+        - `Portal.open_context()`
+        - `Portal.result()`
+        - `Context.open_stream()`
+        - `Context.result()`
+
+        when called/closed by actor local task(s).
+
+        NOTEs & TODOs:
+        - It is expected that the caller has previously unwrapped
+          the remote error using a call to `unpack_error()` and
+          provides that output exception value as the input
+          `error` argument here.
+        - If this is an error message from a context opened by
+          `Portal.open_context()` we want to interrupt any
+          ongoing local tasks operating within that `Context`'s
+          cancel-scope so as to be notified ASAP of the remote
+          error and engage any caller handling (eg. for
+          cross-process task supervision).
+        - In some cases we may want to raise the remote error
+          immediately since there is no guarantee the locally
+          operating task(s) will attempt to execute a checkpoint
+          any time soon; in such cases there are 2 possible
+          approaches depending on the current task's work and
+          wrapping "thread" type:
+
+          - `trio`-native-and-graceful: only ever wait for tasks
+            to exec a next `trio.lowlevel.checkpoint()` assuming
+            that any such task must do so to interact with the
+            actor runtime and IPC interfaces.
+
+          - (NOT IMPLEMENTED) system-level-aggressive: maybe we
+            could eventually interrupt sync code (invoked using
+            `trio.to_thread` or some other adapter layer) with
+            a signal (a custom unix one for example?
+            https://stackoverflow.com/a/5744185) depending on the
+            task's wrapping thread-type such that long running
+            sync code should never cause the delay of actor
+            supervision tasks such as cancellation and respawn
+            logic.
        '''
-        # If this is an error message from a context opened by
-        # ``Portal.open_context()`` we want to interrupt any ongoing
-        # (child) tasks within that context to be notified of the remote
-        # error relayed here.
-        #
-        # The reason we may want to raise the remote error immediately
-        # is that there is no guarantee the associated local task(s)
-        # will attempt to read from any locally opened stream any time
-        # soon.
-        #
-        # NOTE: this only applies when
-        # ``Portal.open_context()`` has been called since it is assumed
-        # (currently) that other portal APIs (``Portal.run()``,
-        # ``.run_in_actor()``) do their own error checking at the point
-        # of the call and result processing.
-        error = unpack_error(
-            error_msg,
-            self.chan,
-        )
+        # XXX: currently this should only be used when
+        # `Portal.open_context()` has been opened since it's
+        # assumed that other portal APIs like,
+        # - `Portal.run()`,
+        # - `ActorNursery.run_in_actor()`
+        # do their own error checking at their own call points and
+        # result processing.

         # XXX: set the remote side's error so that after we cancel
         # whatever task is the opener of this context it can raise
@@ -236,35 +292,25 @@ class Context:
         else:
             log.error(
                 f'Remote context error for {self.chan.uid}:{self.cid}:\n'
-                f'{error_msg["error"]["tb_str"]}'
+                f'{error}'
             )

         # TODO: tempted to **not** do this by-reraising in a
         # nursery and instead cancel a surrounding scope, detect
         # the cancellation, then lookup the error that was set?
         # YES! this is way better and simpler!
-        if (
-            self._scope
-        ):
+        if self._scope:
             # from trio.testing import wait_all_tasks_blocked
             # await wait_all_tasks_blocked()
             # self._cancelled_remote = self.chan.uid
             self._scope.cancel()

-            # NOTE: this usage actually works here B)
-            # from .devx._debug import breakpoint
-            # await breakpoint()
-
-            # XXX: this will break early callee results sending
-            # since when `.result()` is finally called, this
-            # chan will be closed..
-            # if self._recv_chan:
-            #     await self._recv_chan.aclose()
+            # this REPL usage actually works here BD
+            # from .devx._debug import pause
+            # await pause()

     async def cancel(
         self,
-        msg: str | None = None,
         timeout: float = 0.616,
-        # timeout: float = 1000,

     ) -> None:
         '''
@@ -274,15 +320,12 @@
         Timeout quickly in an attempt to sidestep 2-generals...

         '''
-        side = 'caller' if self._portal else 'callee'
-        if msg:
-            assert side == 'callee', 'Only callee side can provide cancel msg'
+        side: str = 'caller' if self._portal else 'callee'
+        log.cancel(
+            f'Cancelling {side} side of context to {self.chan.uid}'
+        )

-        log.cancel(f'Cancelling {side} side of context to {self.chan.uid}')
-
-        self._cancel_called = True
-        # await devx._debug.breakpoint()
-        # breakpoint()
+        self._cancel_called: bool = True

         if side == 'caller':
             if not self._portal:
                 raise RuntimeError(
                     "No portal found, this is likely a callee side context"
                 )

-            cid = self.cid
+            cid: str = self.cid
             with trio.move_on_after(timeout) as cs:
                 cs.shield = True
                 log.cancel(
-                    f"Cancelling stream {cid} to "
-                    f"{self._portal.channel.uid}")
+                    f'Cancelling stream {cid} to '
+                    f'{self._portal.channel.uid}'
+                )

                 # NOTE: we're telling the far end actor to cancel a task
                 # corresponding to *this actor*. The far end local channel
@@ -314,17 +358,17 @@ class Context:
                 # if not self._portal.channel.connected():
                 if not self.chan.connected():
                     log.cancel(
-                        "May have failed to cancel remote task "
-                        f"{cid} for {self._portal.channel.uid}")
+                        'May have failed to cancel remote task '
+                        f'{cid} for {self._portal.channel.uid}'
+                    )
                 else:
                     log.cancel(
-                        "Timed out on cancelling remote task "
-                        f"{cid} for {self._portal.channel.uid}")
+                        'Timed out on cancel request of remote task '
+                        f'{cid} for {self._portal.channel.uid}'
+                    )

         # callee side remote task
         else:
-            self._cancel_msg = msg
-
             # TODO: should we have an explicit cancel message
             # or is relaying the local `trio.Cancelled` as an
             # {'error': trio.Cancelled, cid: "blah"} enough?
@@ -335,7 +379,6 @@

     @acm
     async def open_stream(
-
         self,
         allow_overruns: bool | None = False,
         msg_buffer_size: int | None = None,
@@ -354,10 +397,10 @@
        ``Portal.open_context()``.  In the future this may change but
        currently there seems to be no obvious reason to support
        "re-opening":
-          - pausing a stream can be done with a message.
-          - task errors will normally require a restart of the entire
-            scope of the inter-actor task context due to the nature of
-            ``trio``'s cancellation system.
+        - pausing a stream can be done with a message.
+        - task errors will normally require a restart of the entire
+          scope of the inter-actor task context due to the nature of
+          ``trio``'s cancellation system.

         '''
         actor = current_actor()
@@ -439,18 +482,19 @@
         self,
         err: Exception,
     ) -> None:
+        '''
+        Maybe raise a remote error depending on who (which task from
+        which actor) requested a cancellation (if any).
+
+        '''
         # NOTE: whenever the context's "opener" side (task) **is**
         # the side which requested the cancellation (likekly via
         # ``Context.cancel()``), we don't want to re-raise that
         # cancellation signal locally (would be akin to
         # a ``trio.Nursery`` nursery raising ``trio.Cancelled``
-        # whenever ``CancelScope.cancel()`` was called) and instead
-        # silently reap the expected cancellation "error"-msg.
-        # if 'pikerd' in err.msgdata['tb_str']:
-        #     # from . import _debug
-        #     # await _debug.breakpoint()
-        #     breakpoint()
-
+        # whenever ``CancelScope.cancel()`` was called) and
+        # instead silently reap the expected cancellation
+        # "error"-msg.
         if (
             isinstance(err, ContextCancelled)
             and (
@@ -461,7 +505,18 @@
         ):
             return err

-        raise err  # from None
+        # NOTE: currently we are masking underlying runtime errors
+        # which are often superfluous to user handler code. not
+        # sure if this is still needed / desired for all operations?
+        # TODO: maybe we can only NOT mask if:
+        # - [ ] debug mode is enabled or,
+        # - [ ] a certain log level is set?
+        # - [ ] consider using `.with_traceback()` to filter out
+        #       runtime frames from the tb explicitly?
+        # https://docs.python.org/3/reference/simple_stmts.html#the-raise-statement
+        # https://stackoverflow.com/a/24752607
+        __tracebackhide__: bool = True
+        raise err from None

     async def result(self) -> Any | Exception:
         '''
@@ -489,12 +544,12 @@
         of the remote cancellation.

         '''
+        __tracebackhide__: bool = True
         assert self._portal, "Context.result() can not be called from callee!"
         assert self._recv_chan

         if re := self._remote_error:
-            self._maybe_raise_remote_err(re)
-            return re
+            return self._maybe_raise_remote_err(re)

         if (
             self._result == id(self)
@@ -505,8 +560,8 @@
             # and discarding any bi dir stream msgs still
             # in transit from the far end.
            while True:
-                msg = await self._recv_chan.receive()
                 try:
+                    msg = await self._recv_chan.receive()
                     self._result: Any = msg['return']

                     # NOTE: we don't need to do this right?
@@ -519,17 +574,22 @@

                 # NOTE: we get here if the far end was
                 # `ContextCancelled` in 2 cases:
-                # - we requested the cancellation and thus
-                #   SHOULD NOT raise that far end error,
-                # - WE DID NOT REQUEST that cancel and thus
-                #   SHOULD RAISE HERE!
+                # 1. we requested the cancellation and thus
+                #    SHOULD NOT raise that far end error,
+                # 2. WE DID NOT REQUEST that cancel and thus
+                #    SHOULD RAISE HERE!
                 except trio.Cancelled:
-                    if not self._cancel_called:
-                        raise self._remote_error
-                    else:
-                        # if we DID request the cancel we simply
-                        # continue as normal.
-                        raise
+
+                    # CASE 2: mask the local cancelled-error(s)
+                    # only when we are sure the remote error is the
+                    # (likely) source cause of this local runtime
+                    # task's cancellation.
+                    if re := self._remote_error:
+                        self._maybe_raise_remote_err(re)
+
+                    # CASE 1: we DID request the cancel, so we
+                    # simply continue to bubble up as normal.
+                    raise

                 except KeyError:  # as msgerr:
@@ -544,7 +604,8 @@

                     # internal error should never get here
                     assert msg.get('cid'), (
-                        "Received internal error at portal?")
+                        "Received internal error at portal?"
+                    )

                     err = unpack_error(
                         msg,
                         self._portal.channel
                     )  # from msgerr

                     err = self._maybe_raise_remote_err(err)
                     self._remote_error = err

-        return self._remote_error or self._result
+        if re := self._remote_error:
+            return self._maybe_raise_remote_err(re)
+
+        return self._result

     async def started(
         self,
         value: Any | None = None

     ) -> None:
         '''
         Indicate to calling actor's task that this linked context
-        has started and send ``value`` to the other side.
+        has started and send ``value`` to the other side via IPC.

         On the calling side ``value`` is the second item delivered
         in the tuple returned by ``Portal.open_context()``.

         '''
         if self._portal:
             raise RuntimeError(
-                f"Caller side context {self} can not call started!")
+                f'Caller side context {self} can not call started!'
+            )

         elif self._started_called:
             raise RuntimeError(
-                f"called 'started' twice on context with {self.chan.uid}")
+                f'called `.started()` twice on context with {self.chan.uid}'
+            )

         await self.chan.send({'started': value, 'cid': self.cid})
         self._started_called = True

-    # TODO: do we need a restart api?
-    # async def restart(self) -> None:
-    #     pass
-
     async def _drain_overflows(
         self,
     ) -> None:
@@ -638,10 +700,21 @@
         self,
         msg: dict,
-        draining: bool = False,
+        # draining: bool = False,

     ) -> bool:
+        '''
+        Deliver an IPC msg received from a transport-channel to
+        this context's underlying mem chan for handling by user
+        operating tasks; return a bool indicating whether the msg
+        was immediately sent.
+
+        If `._allow_overruns == True` (maybe) append the msg to an
+        "overflow queue" and start a "drainer task" (inside the
+        `._scope_nursery: trio.Nursery`) which ensures that such
+        messages are eventually sent if possible.
+
+        '''
         cid = self.cid
         chan = self.chan
         uid = chan.uid
@@ -652,8 +725,12 @@
         )

         error = msg.get('error')
-        if error:
-            await self._maybe_cancel_and_set_remote_error(msg)
+        if error := unpack_error(
+            msg,
+            self.chan,
+        ):
+            self._cancel_msg = msg
+            await self._maybe_cancel_and_set_remote_error(error)

         if (
             self._in_overrun
@@ -685,6 +762,7 @@
             # the sender; the main motivation is that using bp can block the
             # msg handling loop which calls into this method!
         except trio.WouldBlock:
+
             # XXX: always push an error even if the local
             # receiver is in overrun state.
             # await self._maybe_cancel_and_set_remote_error(msg)
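
For reference, a boiled-down sketch (not part of the patch) of the
"unpack-then-set" flow described above: `error := unpack_error()` runs
first and the raw "cancel msg" is recorded just prior to the
remote-error assignment. The `unpack_error()` below is a simplified
stand-in for tractor's real helper (which also takes the `Channel`)
and `Context` is stripped down to only the two state vars involved:

    from __future__ import annotations


    def unpack_error(msg: dict) -> Exception | None:
        # stand-in: rebuild an exception from an IPC 'error' msg,
        # if one is present, else return `None`.
        if err_dict := msg.get('error'):
            return RuntimeError(err_dict['tb_str'])
        return None


    class Context:
        _cancel_msg: dict | None = None
        _remote_error: BaseException | None = None

        def deliver(self, msg: dict) -> None:
            # walrus-style: unpack first, then manually record the
            # "cancel msg" just prior to remote-error assignment,
            # mirroring how the real setter now takes the already
            # unpacked `BaseException` as input.
            if error := unpack_error(msg):
                self._cancel_msg = msg
                self._remote_error = error


    ctx = Context()
    ctx.deliver({'error': {'tb_str': 'Traceback ...'}, 'cid': '1'})
    assert isinstance(ctx._remote_error, RuntimeError)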
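Similarly, a tiny standalone demo of the `raise err from None` plus
`__tracebackhide__` combo now used in `._maybe_raise_remote_err()` (and
`.result()`): `from None` suppresses the implicit "during handling of
the above exception..." chaining while `__tracebackhide__` is the
pytest/pdbp frame-hiding convention; the `reraise_masked()` name is
made up purely for illustration:

    def reraise_masked(err: BaseException) -> None:
        # hide this frame from pytest/pdbp style tracebacks and
        # drop the implicit exception-chain display.
        __tracebackhide__: bool = True
        raise err from None


    try:
        try:
            raise ValueError('superfluous runtime error')
        except ValueError:
            reraise_masked(RuntimeError('user-facing error'))
    except RuntimeError as rte:
        # `from None` sets `__suppress_context__` so the inner
        # `ValueError` is not rendered in the traceback output.
        assert rte.__suppress_context__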