From df31047ecb971b7c3236ac0156ba366635a08401 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet
Date: Mon, 23 Oct 2023 17:34:28 -0400
Subject: [PATCH] Be ultra-correct in `Portal.open_context()`

This took way too long to get right but hopefully will give us grok-able
and correct context exit semantics going forward B)

The main fixes were:
- always shielding the `MsgStream.aclose()` call on teardown to avoid
  bubbling a `Cancelled`.
- properly absorbing any `ContextCancelled` in cases due to "self
  cancellation" using the new `Context.canceller` in the logic.
- capturing any error raised by the `Context.result()` call in the
  "normal exit, result received" case and setting it as the
  `Context._local_error` so that self-cancels can be easily measured
  via `Context.cancelled_caught` in the same way as remote-error-caused
  cancellations.
- extremely detailed comments around all of the cancellation-error cases
  to avoid ever getting confused about the control flow in the future XD
---
 tractor/_portal.py | 170 +++++++++++++++++++++++++++++++--------
 1 file changed, 118 insertions(+), 52 deletions(-)

diff --git a/tractor/_portal.py b/tractor/_portal.py
index d53fc6b..4c0587a 100644
--- a/tractor/_portal.py
+++ b/tractor/_portal.py
@@ -48,6 +48,7 @@ from ._exceptions import (
     unpack_error,
     NoResult,
     ContextCancelled,
+    MessagingError,
 )
 from ._context import Context
 from ._streaming import MsgStream
@@ -71,11 +72,6 @@ def _unwrap_msg(
     raise unpack_error(msg, channel) from None
 
 
-# TODO: maybe move this to ._exceptions?
-class MessagingError(Exception):
-    'Some kind of unexpected SC messaging dialog issue'
-
-
 class Portal:
     '''
     A 'portal' to a memory-domain-separated `Actor`.
@@ -220,14 +216,18 @@ class Portal:
         try:
             # send cancel cmd - might not get response
-            # XXX: sure would be nice to make this work with a proper shield
+            # XXX: sure would be nice to make this work with
+            # a proper shield
             with trio.move_on_after(
                 timeout or self.cancel_timeout
             ) as cs:
                 cs.shield = True
-                await self.run_from_ns('self', 'cancel')
+                await self.run_from_ns(
+                    'self',
+                    'cancel',
+                )
                 return True
 
             if cs.cancelled_caught:
@@ -462,10 +462,14 @@ class Portal:
         try:
             # the "first" value here is delivered by the callee's
             # ``Context.started()`` call.
-            first = msg['started']
+            first: Any = msg['started']
             ctx._started_called: bool = True
 
         except KeyError:
+
+            # TODO: can we maybe factor this into the new raiser
+            # `_streaming._raise_from_no_yield_msg()` and make that
+            # helper more generic, say with a `_no__msg()`?
             if not (cid := msg.get('cid')):
                 raise MessagingError(
                     'Received internal error at context?\n'
@@ -517,54 +521,102 @@ class Portal:
                 # started in the ctx nursery.
                 ctx._scope.cancel()
 
-        # XXX: (maybe) shield/mask context-cancellations that were
-        # initiated by any of the context's 2 tasks. There are
-        # subsequently 2 operating cases for a "graceful cancel"
-        # of a `Context`:
-        #
-        # 1.*this* side's task called `Context.cancel()`, in
-        # which case we mask the `ContextCancelled` from bubbling
-        # to the opener (much like how `trio.Nursery` swallows
-        # any `trio.Cancelled` bubbled by a call to
-        # `Nursery.cancel_scope.cancel()`)
+        # XXX NOTE XXX: maybe shield against
+        # self-context-cancellation (which raises a local
+        # `ContextCancelled`) when requested (via
+        # `Context.cancel()`) by the same task (tree) which entered
+        # THIS `.open_context()`.
         #
-        # 2.*the other* side's (callee/spawned) task cancelled due
-        # to a self or peer cancellation request in which case we
-        # DO let the error bubble to the opener.
+        # NOTE: There are 2 operating cases for a "graceful cancel"
+        # of a `Context`. In both cases any `ContextCancelled`
+        # raised in this scope-block came from a transport msg
+        # relayed from some remote-actor-task which our runtime set
+        # as a `Context._remote_error`.
+        #
+        # the CASES:
+        #
+        # - if that context IS THE SAME ONE that called
+        #   `Context.cancel()`, we want to absorb the error
+        #   silently and let this `.open_context()` block exit
+        #   without raising.
+        #
+        # - if it is from some OTHER context (we did NOT call
+        #   `.cancel()`), we want to re-RAISE IT whilst also
+        #   setting our own ctx's "reason for cancel" to be that
+        #   other context's cancellation condition; we set our
+        #   `.canceller: tuple[str, str]` to be the same value as
+        #   caught here in the `ContextCancelled.canceller`.
+        #
+        # Again, there are 2 cases:
+        #
+        # 1 - some other context opened in this `.open_context()`
+        #     block cancelled due to a self or peer cancellation
+        #     request, in which case we DO let the error bubble to
+        #     the opener.
+        #
+        # 2 - THIS "caller" task somewhere invoked `Context.cancel()`
+        #     and received a `ContextCancelled` from the "callee"
+        #     task, in which case we mask the `ContextCancelled` from
+        #     bubbling to this "caller" (much like how `trio.Nursery`
+        #     swallows any `trio.Cancelled` bubbled by a call to
+        #     `Nursery.cancel_scope.cancel()`)
         except ContextCancelled as ctxc:
             scope_err = ctxc
 
-            # CASE 1: this context was never cancelled
-            # via a local task's call to `Context.cancel()`.
-            if not ctx._cancel_called:
-                raise
-
             # CASE 2: context was cancelled by local task calling
             # `.cancel()`, we don't raise and the exit block should
             # exit silently.
-            else:
+            if (
+                ctx._cancel_called
+                and (
+                    ctxc is ctx._remote_error
+                    or
+                    ctxc.canceller is self.canceller
+                )
+            ):
                 log.debug(
                     f'Context {ctx} cancelled gracefully with:\n'
                     f'{ctxc}'
                 )
+            # CASE 1: this context was never cancelled via a local
+            # task (tree) having called `Context.cancel()`, raise
+            # the error since it was caused by someone else!
+            else:
+                raise
 
+        # the above `._scope` can be cancelled due to:
+        # 1. an explicit self cancel via `Context.cancel()` or
+        #    `Actor.cancel()`,
+        # 2. any "callee"-side remote error, possibly also a cancellation
+        #    request by some peer,
+        # 3. any "caller" (aka THIS scope's) local error raised in the above `yield`
         except (
-            # - a standard error in the caller/yieldee
+            # CASE 3: standard local error in this caller/yieldee
             Exception,
 
-            # - a runtime teardown exception-group and/or
-            # cancellation request from a caller task.
-            BaseExceptionGroup,
-            trio.Cancelled,
+            # CASES 1 & 2: normally manifested as
+            # a `Context._scope_nursery` raised
+            # exception-group of:
+            # 1.-`trio.Cancelled`s, since
+            #    `._scope.cancel()` will have been called and any
+            #    `ContextCancelled` absorbed and thus NOT RAISED in
+            #    any `Context._maybe_raise_remote_err()`,
+            # 2.-`BaseExceptionGroup[ContextCancelled | RemoteActorError]`
+            #    from any error raised in the "callee" side, with
+            #    a group only raised if more than one task was
+            #    started here in the "caller" by the `yield`-ed-to
+            #    task.
+            BaseExceptionGroup,  # since overrun handler tasks may have been spawned
+            trio.Cancelled,  # NOTE: NOT from inside the ctx._scope
             KeyboardInterrupt,
 
         ) as err:
             scope_err = err
 
-            # XXX: request cancel of this context on any error.
-            # NOTE: `Context.cancel()` is conversely NOT called in
-            # the `ContextCancelled` "cancellation requested" case
-            # above.
+            # XXX: ALWAYS request the context to CANCEL ON any ERROR.
+            # NOTE: `Context.cancel()` is conversely NEVER CALLED in
+            # the `ContextCancelled` "self cancellation absorbed" case
+            # handled in the block above!
             log.cancel(
                 'Context cancelled for task due to\n'
                 f'{err}\n'
@@ -583,7 +635,7 @@
                 raise  # duh
 
-        # no scope error case
+        # no local scope error, the "clean exit with a result" case.
         else:
             if ctx.chan.connected():
                 log.info(
@@ -597,15 +649,27 @@
             # `Context._maybe_raise_remote_err()`) IFF
             # a `Context._remote_error` was set by the runtime
             # via a call to
-            # `Context._maybe_cancel_and_set_remote_error()`
-            # which IS SET any time the far end fails and
-            # causes "caller side" cancellation via
-            # a `ContextCancelled` here.
-            result = await ctx.result()
-            log.runtime(
-                f'Context {fn_name} returned value from callee:\n'
-                f'`{result}`'
-            )
+            # `Context._maybe_cancel_and_set_remote_error()`.
+            # As per `Context._deliver_msg()`, that error IS
+            # ALWAYS SET any time the "callee" side fails and causes
+            # "caller side" cancellation via a `ContextCancelled` here.
+            try:
+                result = await ctx.result()
+                log.runtime(
+                    f'Context {fn_name} returned value from callee:\n'
+                    f'`{result}`'
+                )
+            except BaseException as berr:
+                # on normal teardown, if we get some error
+                # raised in `Context.result()` we still want to
+                # save that error on the ctx's state to
+                # determine things like `.cancelled_caught` for
+                # cases where there was remote cancellation but
+                # this task didn't know until final teardown
+                # / value collection.
+                scope_err = berr
+                raise
 
         finally:
             # though it should be impossible for any tasks
@@ -655,12 +719,14 @@
             with trio.CancelScope(shield=True):
                 await ctx._recv_chan.aclose()
 
-            # XXX: since we always (maybe) re-raise (and thus also
-            # mask runtime machinery related
-            # multi-`trio.Cancelled`s) any scope error which was
-            # the underlying cause of this context's exit, add
-            # different log msgs for each of the (2) cases.
+            # XXX: we always raise remote errors locally and
+            # generally speaking mask runtime-machinery related
+            # multi-`trio.Cancelled`s. As such, any `scope_err`
+            # which was the underlying cause of this context's exit
+            # should be stored as the `Context._local_error` and
+            # used in determining `Context.cancelled_caught: bool`.
             if scope_err is not None:
+                ctx._local_error: BaseException = scope_err
                 etype: Type[BaseException] = type(scope_err)
 
                 # CASE 2
@@ -690,7 +756,7 @@
             await maybe_wait_for_debugger()
 
         # FINALLY, remove the context from runtime tracking and
-        # exit Bo
+        # exit!
         self.actor._contexts.pop(
             (self.channel.uid, ctx.cid),
             None,
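
Two of the fixes above are instances of more general trio patterns worth
spelling out. First, the shielded, deadline-bounded cancel request from the
`@@ -220` hunk: the sketch below captures just that pattern in plain `trio`,
where `send_cancel_cmd` is a hypothetical stand-in for the
`self.run_from_ns('self', 'cancel')` call, not tractor's actual API.

import trio

async def request_cancel(
    send_cancel_cmd,   # hypothetical async callable, stands in for
                       # `Portal.run_from_ns('self', 'cancel')`
    timeout: float,
) -> bool:
    # bound the wait for the remote ack with a deadline, but shield
    # the scope so an *outer* cancellation can't interrupt the cancel
    # request itself mid-flight.
    with trio.move_on_after(timeout) as cs:
        cs.shield = True
        await send_cancel_cmd()
        return True

    # only reached if the deadline fired and cancelled the `await`.
    assert cs.cancelled_caught
    return False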
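
Second, the "absorb self-cancellation" rule from the `@@ -517` hunk reduces
to one decision: swallow a `ContextCancelled` only when *this* side both
requested the cancel and is the named canceller, mirroring how `trio.Nursery`
swallows `trio.Cancelled` after its own `cancel_scope.cancel()`. A minimal
sketch with toy stand-ins; the `CtxCancelled` class and parameters below
model, but are not, tractor's `ContextCancelled` and `Context` state.

class CtxCancelled(Exception):
    # toy stand-in for `tractor._exceptions.ContextCancelled`
    def __init__(self, msg: str, canceller: tuple[str, str]):
        super().__init__(msg)
        self.canceller = canceller


def absorb_or_reraise(
    ctxc: CtxCancelled,
    cancel_called: bool,                  # did *this* side call `.cancel()`?
    remote_error: BaseException | None,   # models `Context._remote_error`
    our_uid: tuple[str, str],
) -> None:
    # CASE 2: this task (tree) requested the cancel, so absorb the
    # `CtxCancelled` silently (`==` here; the patch compares identity
    # on runtime-shared values).
    if (
        cancel_called
        and (
            ctxc is remote_error
            or ctxc.canceller == our_uid
        )
    ):
        return

    # CASE 1: some *other* context/actor cancelled us; bubble the
    # error to the opener since it was caused by someone else.
    raise ctxc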
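
Last, the two teardown rules: always shield `aclose()` calls so a pending
cancellation can't abort cleanup, and always stash the scope error so flags
like `Context.cancelled_caught` can be derived afterwards. Again a hedged
sketch in plain `trio`; `collect_result`, `stream`, and `ctx_state` are
assumed placeholders rather than tractor's real API.

import trio

async def finalize(collect_result, stream, ctx_state) -> None:
    scope_err: BaseException | None = None
    try:
        # normal exit path: collecting the final result can *itself*
        # raise (e.g. a late-arriving remote cancellation) ...
        ctx_state.result = await collect_result()
    except BaseException as berr:
        # ... so record even teardown-time errors before re-raising,
        # allowing `cancelled_caught`-style flags to be computed.
        scope_err = berr
        raise
    finally:
        # shield the close: without it, an already-pending cancellation
        # would re-raise inside `aclose()` and leak a half-closed stream.
        with trio.CancelScope(shield=True):
            await stream.aclose()

        if scope_err is not None:
            # mirrors setting `Context._local_error` in the patch
            ctx_state.local_error = scope_err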