From ed10632d97cab4aeb1cbe4cff83354fb4722e7a3 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet
Date: Sat, 2 Mar 2024 16:27:46 -0500
Subject: [PATCH] Avoid `ctx.cancel()` after ctxc rxed in `.open_context()`

In the case where the callee side delivers us a ctxc with `.canceller`
set, we can presume that remote cancellation has already taken place and
thus we don't need to do the normal call-`Context.cancel()`-on-error
step. Further, in the case where we do call it, also handle any
`trio.ClosedResourceError` gracefully with a `.warning()`.

Also, originally I had added a post-`yield`-maybe-raise to attempt
handling any remote ctxc the same as for the local case (i.e. raised
from the `yield` line), wherein if we get a remote ctxc the same handler
branch-path would trigger, thus avoiding different behaviour in that
case. I ended up masking it out (but can't remember why..) as it seems
the normal `.result()` call and its internal handling gets the same
behaviour? I've left in the heavily commented code in case it ends up
being the better way to go; likely moving to a single code path for both
cases is better, even if it is just a matter of deciding whether to
swallow the ctxc or not in the `.cancel_acked` case.

Further teensy improvements:
- obvi improve/simplify log msg contents as in prior patches.
- use the new `maybe_wait_for_debugger(header_msg: str)` if/when waiting
  to exit in debug mode.
- another `hide_tb: bool` frame hider flag.
- rando type-annot updates of course :)
---
 tractor/_portal.py | 248 +++++++++++++++++++++++++--------------------
 1 file changed, 140 insertions(+), 108 deletions(-)

diff --git a/tractor/_portal.py b/tractor/_portal.py
index a4f2f61..04f613e 100644
--- a/tractor/_portal.py
+++ b/tractor/_portal.py
@@ -27,8 +27,9 @@ from __future__ import annotations
 import importlib
 import inspect
 from typing import (
-    Any, Optional,
-    Callable, AsyncGenerator,
+    Any,
+    Callable,
+    AsyncGenerator,
     Type,
 )
 from functools import partial
@@ -52,6 +53,7 @@ from ._ipc import Channel
 from .log import get_logger
 from .msg import NamespacePath
 from ._exceptions import (
+    InternalError,
     _raise_from_no_key_in_msg,
     unpack_error,
     NoResult,
@@ -126,7 +128,7 @@ class Portal:
     def __init__(self, channel: Channel) -> None:
         self.chan = channel
         # during the portal's lifetime
-        self._result_msg: Optional[dict] = None
+        self._result_msg: dict|None = None

         # When set to a ``Context`` (when _submit_for_result is called)
         # it is expected that ``result()`` will be awaited at some
@@ -171,7 +173,7 @@
     ) -> dict[str, Any]:

         assert ctx._remote_func_type == 'asyncfunc'  # single response
-        msg = await ctx._recv_chan.receive()
+        msg: dict = await ctx._recv_chan.receive()
         return msg

     async def result(self) -> Any:
@@ -255,11 +257,11 @@
             return False

         reminfo: str = (
-            f'uid: {self.channel.uid}\n'
-            f' |_{chan}\n'
+            f'{self.channel.uid}\n'
+            f' |_{chan}\n'
         )
         log.cancel(
-            f'Sending actor cancel request to peer\n'
+            f'Sending runtime `.cancel()` request to peer\n\n'
             f'{reminfo}'
         )
@@ -281,7 +283,9 @@
             return True

         if cs.cancelled_caught:
-            log.cancel(
+            # may timeout and we never get an ack (obvi racy)
+            # but that doesn't mean it wasn't cancelled.
+            log.debug(
                 'May have failed to cancel peer?\n'
                 f'{reminfo}'
             )

@@ -293,9 +297,11 @@
             trio.ClosedResourceError,
             trio.BrokenResourceError,
         ):
-            log.cancel(
-                f"{self.channel} for {self.channel.uid} was already "
-                "closed or broken?")
+            log.debug(
+                'IPC chan for actor already closed or broken?\n\n'
+                f'{self.channel.uid}\n'
+                f' |_{self.channel}\n'
+            )
             return False

     async def run_from_ns(
@@ -416,7 +422,8 @@
         try:
             # deliver receive only stream
             async with MsgStream(
-                ctx, ctx._recv_chan,
+                ctx=ctx,
+                rx_chan=ctx._recv_chan,
             ) as rchan:
                 self._streams.add(rchan)
                 yield rchan
@@ -443,6 +450,11 @@
            # await recv_chan.aclose()
            self._streams.remove(rchan)

+    # TODO: move this impl to `._context` mod and
+    # instead just bind it here as a method so that the logic
+    # for ctx stuff stays all in one place (instead of frickin
+    # having to open this file in tandem every gd time!!! XD)
+    #
     @asynccontextmanager
     async def open_context(
@@ -451,6 +463,11 @@

         allow_overruns: bool = False,

+        # TODO: if we set this the wrapping `@acm` body will
+        # still be shown (awkwardly) on pdb REPL entry. Ideally
+        # we can similarly annotate that frame to NOT show?
+        hide_tb: bool = False,
+
         # proxied to RPC
         **kwargs,
@@ -484,6 +501,8 @@
         collection. See ``tractor.Context`` for more details.

         '''
+        __tracebackhide__: bool = hide_tb
+
         # conduct target func method structural checks
         if not inspect.iscoroutinefunction(func) and (
             getattr(func, '_tractor_contex_function', False)
@@ -536,9 +555,12 @@
         # placeholder for any exception raised in the runtime
         # or by user tasks which cause this context's closure.
-        scope_err: BaseException | None = None
+        scope_err: BaseException|None = None
+        ctxc_from_callee: ContextCancelled|None = None
         try:
             async with trio.open_nursery() as nurse:
+
+                # NOTE: used to start overrun queuing tasks
                 ctx._scope_nursery: trio.Nursery = nurse
                 ctx._scope: trio.CancelScope = nurse.cancel_scope
@@ -546,14 +568,26 @@
                 # in enter tuple.
                 yield ctx, first

-                # between the caller exiting and arriving here the
-                # far end may have sent a ctxc-msg or other error,
-                # so check for it here immediately and maybe raise
-                # so as to engage the ctxc handling block below!
+                # ??TODO??: do we still want to consider this or is
+                # the `else:` block handling via a `.result()`
+                # call below enough??
+                # -[ ] pretty sure `.result()` internals do the
+                #   same as our ctxc handler below so it ended up
+                #   being same (repeated?) behaviour, but ideally we
+                #   wouldn't have that duplication either by somehow
+                #   factoring the `.result()` handler impl in a way
+                #   that we can re-use it around the `yield` ^ here
+                #   or vice versa?
+                #
+                # NOTE: between the caller exiting and arriving
+                # here the far end may have sent a ctxc-msg or
+                # other error, so check for it here immediately
+                # and maybe raise so as to engage the ctxc
+                # handling block below!
+                #
                 # if re := ctx._remote_error:
                 #     maybe_ctxc: ContextCancelled|None = ctx._maybe_raise_remote_err(
                 #         re,
-
                 #         # TODO: do we want this to always raise?
                 #         # - means that on self-ctxc, if/when the
                 #         # block is exited before the msg arrives
@@ -571,7 +605,7 @@
                 #         # block?
                 #         raise_ctxc_from_self_call=True,
                 #     )
-                #     assert maybe_ctxc
+                #     ctxc_from_callee = maybe_ctxc

                 # when in allow_overruns mode there may be
                 # lingering overflow sender tasks remaining?
@@ -583,13 +617,18 @@
                     not ctx._allow_overruns
                     or len(nurse.child_tasks) > 1
                 ):
-                    raise RuntimeError(
+                    raise InternalError(
                         'Context has sub-tasks but is '
                         'not in `allow_overruns=True` mode!?'
                     )

-                # ensure cancel of all overflow sender tasks
-                # started in the ctx nursery.
+                # ensure we cancel all overflow sender
+                # tasks started in the nursery when
+                # `._allow_overruns == True`.
+                #
+                # NOTE: this means `._scope.cancelled_caught`
+                # will prolly be set! not sure if that's
+                # non-ideal or not ???
                 ctx._scope.cancel()

             # XXX NOTE XXX: maybe shield against
@@ -602,14 +641,15 @@
         # of a `Context`. In both cases any `ContextCancelled`
         # raised in this scope-block came from a transport msg
         # relayed from some remote-actor-task which our runtime set
-        # as a `Context._remote_error`
+        # as to `Context._remote_error`
         #
         # the CASES:
         #
         # - if that context IS THE SAME ONE that called
         #   `Context.cancel()`, we want to absorb the error
         #   silently and let this `.open_context()` block to exit
-        #   without raising.
+        #   without raising, ideally eventually receiving the ctxc
+        #   ack msg thus resulting in `ctx.cancel_acked == True`.
         #
         # - if it is from some OTHER context (we did NOT call
         #   `.cancel()`), we want to re-RAISE IT whilst also
@@ -633,6 +673,7 @@
         # `Nursery.cancel_scope.cancel()`)
         except ContextCancelled as ctxc:
             scope_err = ctxc
+            ctxc_from_callee = ctxc

             # XXX TODO XXX: FIX THIS debug_mode BUGGGG!!!
             # using this code and then resuming the REPL will
@@ -642,6 +683,7 @@
             # documenting it as a definittive example of
             # debugging the tractor-runtime itself using it's
             # own `.devx.` tooling!
+            #
             # await pause()

             # CASE 2: context was cancelled by local task calling
@@ -649,15 +691,10 @@
             # exit silently.
             if (
                 ctx._cancel_called
-                and (
-                    ctxc is ctx._remote_error
-                    # ctxc.msgdata == ctx._remote_error.msgdata
-
-                    # TODO: uhh `Portal.canceller` ain't a thangg
-                    # dawg? (was `self.canceller` before?!?)
-                    and
-                    ctxc.canceller == self.actor.uid
-                )
+                and
+                ctxc is ctx._remote_error
+                and
+                ctxc.canceller == self.actor.uid
             ):
                 log.cancel(
                     f'Context (cid=[{ctx.cid[-6:]}..] cancelled gracefully with:\n'
@@ -665,9 +702,9 @@
                 )
             # CASE 1: this context was never cancelled via a local
             # task (tree) having called `Context.cancel()`, raise
-            # the error since it was caused by someone else!
+            # the error since it was caused by someone else
+            # -> probably a remote peer!
             else:
-                # await pause()
                 raise

         # the above `._scope` can be cancelled due to:
@@ -680,19 +717,29 @@
             # CASE 3: standard local error in this caller/yieldee
             Exception,

-            # CASES 1 & 2: normally manifested as
-            # a `Context._scope_nursery` raised
+            # CASES 1 & 2: can manifest as a `ctx._scope_nursery`
             # exception-group of,
+            #
             # 1.-`trio.Cancelled`s, since
-            # `._scope.cancel()` will have been called and any
-            # `ContextCancelled` absorbed and thus NOT RAISED in
-            # any `Context._maybe_raise_remote_err()`,
+            # `._scope.cancel()` will have been called
+            # (transitively by the runtime calling
+            # `._deliver_msg()`) and any `ContextCancelled`
+            # eventually absorbed and thus absorbed/supressed in
+            # any `Context._maybe_raise_remote_err()` call.
+            #
             # 2.-`BaseExceptionGroup[ContextCancelled | RemoteActorError]`
-            # from any error raised in the "callee" side with
-            # a group only raised if there was any more then one
-            # task started here in the "caller" in the
-            # `yield`-ed to task.
-            BaseExceptionGroup,  # since overrun handler tasks may have been spawned
+            # from any error delivered from the "callee" side
+            # AND a group-exc is only raised if there was > 1
+            # tasks started *here* in the "caller" / opener
+            # block. If any one of those tasks calls
+            # `.result()` or `MsgStream.receive()`
+            # `._maybe_raise_remote_err()` will be transitively
+            # called and the remote error raised causing all
+            # tasks to be cancelled.
+            # NOTE: ^ this case always can happen if any
+            # overrun handler tasks were spawned!
+            BaseExceptionGroup,
+
             trio.Cancelled,  # NOTE: NOT from inside the ctx._scope
             KeyboardInterrupt,

@@ -702,69 +749,48 @@
             # XXX: ALWAYS request the context to CANCEL ON any ERROR.
             # NOTE: `Context.cancel()` is conversely NEVER CALLED in
             # the `ContextCancelled` "self cancellation absorbed" case
-            # handled in the block above!
+            # handled in the block above ^^^ !!
             log.cancel(
-                'Context cancelled for task due to\n'
+                'Context terminated due to\n\n'
                 f'{caller_err}\n'
-                'Sending cancel request..\n'
-                f'task:{cid}\n'
-                f'actor:{uid}'
             )

             if debug_mode():
-                log.pdb(
-                    'Delaying `ctx.cancel()` until debug lock '
-                    'acquired..'
-                )
                 # async with acquire_debug_lock(self.actor.uid):
                 #     pass
                 # TODO: factor ^ into below for non-root cases?
-                await maybe_wait_for_debugger()
-                log.pdb(
-                    'Acquired debug lock! '
-                    'Calling `ctx.cancel()`!'
+                was_acquired: bool = await maybe_wait_for_debugger(
+                    header_msg=(
+                        'Delaying `ctx.cancel()` until debug lock '
+                        'acquired..\n'
+                    ),
                 )
+                if was_acquired:
+                    log.pdb(
+                        'Acquired debug lock! '
+                        'Calling `ctx.cancel()`!\n'
+                    )
-            try:
-                await ctx.cancel()
-            except trio.BrokenResourceError:
-                log.warning(
-                    'IPC connection for context is broken?\n'
-                    f'task:{cid}\n'
-                    f'actor:{uid}'
-                )
+
+            # we don't need to cancel the callee if it already
+            # told us it's cancelled ;p
+            if ctxc_from_callee is None:
+                try:
+                    await ctx.cancel()
+                except (
+                    trio.BrokenResourceError,
+                    trio.ClosedResourceError,
+                ):
+                    log.warning(
+                        'IPC connection for context is broken?\n'
+                        f'task:{cid}\n'
+                        f'actor:{uid}'
+                    )

             raise  # duh

         # no local scope error, the "clean exit with a result" case.
         else:
-            # between the caller exiting and arriving here the
-            # far end may have sent a ctxc-msg or other error,
-            # so check for it here immediately and maybe raise
-            # so as to engage the ctxc handling block below!
-            # if re := ctx._remote_error:
-            #     maybe_ctxc: ContextCancelled|None = ctx._maybe_raise_remote_err(
-            #         re,
-
-            #         # TODO: do we want this to always raise?
-            #         # - means that on self-ctxc, if/when the
-            #         # block is exited before the msg arrives
-            #         # but then the msg during __exit__
-            #         # calling we may not activate the
-            #         # ctxc-handler block below? should we
-            #         # be?
-            #         # - if there's a remote error that arrives
-            #         # after the child has exited, we won't
-            #         # handle until the `finally:` block
-            #         # where `.result()` is always called,
-            #         # again in which case we handle it
-            #         # differently then in the handler block
-            #         # that would normally engage from THIS
-            #         # block?
-            #         raise_ctxc_from_self_call=True,
-            #     )
-            #     assert maybe_ctxc
-
             if ctx.chan.connected():
                 log.runtime(
                     'Waiting on final context result for\n'
@@ -794,16 +820,18 @@
                         scope_err = berr
                         raise

+                # yes! this worx Bp
+                # from .devx import _debug
+                # await _debug.pause()
+
                 # an exception type boxed in a `RemoteActorError`
-                # is returned (meaning it was obvi not raised).
+                # is returned (meaning it was obvi not raised)
+                # that we want to log-report on.
                msgdata: str|None = getattr(
                    result_or_err,
                    'msgdata',
                    None
                )
-                # yes! this worx Bp
-                # from .devx import _debug
-                # await _debug.pause()
                match (msgdata, result_or_err):
                    case (
                        {'tb_str': tbstr},
@@ -846,9 +874,8 @@
            # operating *in* this scope to have survived
            # we tear down the runtime feeder chan last
            # to avoid premature stream clobbers.
-            rxchan: trio.ReceiveChannel = ctx._recv_chan
            if (
-                rxchan
+                (rxchan := ctx._recv_chan)

                # maybe TODO: yes i know the below check is
                # touching `trio` memchan internals..BUT, there are
@@ -904,22 +931,27 @@
            etype: Type[BaseException] = type(scope_err)

            # CASE 2
-            if ctx._cancel_called:
+            if (
+                ctx._cancel_called
+                and ctx.cancel_acked
+            ):
                log.cancel(
                    'Context cancelled by caller task\n'
                    f'|_{ctx._task}\n\n'
-                    f'{etype}'
+                    f'{repr(scope_err)}\n'
                )

+            # TODO: should we add a `._cancel_req_received`
+            # flag to determine if the callee manually called
+            # `ctx.cancel()`?
+            # -[ ] going to need a cid check no?
+
            # CASE 1
            else:
                log.cancel(
-                    f'Context cancelled by remote callee task\n'
-                    f'peer: {uid}\n'
-                    f'|_ {nsf}()\n\n'
-                    f'{etype}\n'
+                    f'Context terminated due to local scope error:\n'
+                    f'{etype.__name__}\n'
                )

        # FINALLY, remove the context from runtime tracking and
@@ -967,7 +999,7 @@ class LocalPortal:
 async def open_portal(

     channel: Channel,
-    nursery: Optional[trio.Nursery] = None,
+    nursery: trio.Nursery|None = None,
     start_msg_loop: bool = True,
     shield: bool = False,
@@ -992,7 +1024,7 @@
     if channel.uid is None:
         await actor._do_handshake(channel)

-    msg_loop_cs: Optional[trio.CancelScope] = None
+    msg_loop_cs: trio.CancelScope|None = None
     if start_msg_loop:
         from ._runtime import process_messages
         msg_loop_cs = await nursery.start(
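
Usage note (illustrative only, not part of the diff above): a minimal sketch of
how the caller-side "absorbed self-cancel" behaviour described in the commit
message looks from user code. The actor name 'sleeper' and the context function
`sleep_forever()` are made-up example names; the sketch assumes current `trio`
and `tractor` public APIs.

    import trio
    import tractor


    @tractor.context
    async def sleep_forever(ctx: tractor.Context) -> None:
        # callee side: ack the context open then block until cancelled
        await ctx.started()
        await trio.sleep_forever()


    async def main():
        async with tractor.open_nursery() as an:
            portal = await an.start_actor(
                'sleeper',
                enable_modules=[__name__],
            )
            async with portal.open_context(sleep_forever) as (ctx, first):
                assert first is None
                # since *we* requested the cancel, the resulting
                # `ContextCancelled` (with `.canceller` set to our uid) is
                # absorbed by `.open_context()` on exit instead of being
                # re-raised; a ctxc from any other canceller (or a peer
                # error) still propagates.
                await ctx.cancel()

            await portal.cancel_actor()


    if __name__ == '__main__':
        trio.run(main)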