From 3e1d033708bdc101d331c809366d5db06b37818c Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 19 Feb 2024 17:00:46 -0500 Subject: [PATCH] WIP: solved the modden client hang.. --- tractor/_context.py | 602 ++++++++++++++++++++++++++++++++++-------- tractor/_portal.py | 178 +++++++++++-- tractor/_streaming.py | 188 ++++++++++--- 3 files changed, 793 insertions(+), 175 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index 4d56fb3..54e309e 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -43,12 +43,17 @@ import warnings import trio +# from .devx import ( +# maybe_wait_for_debugger, +# pause, +# ) from ._exceptions import ( # _raise_from_no_key_in_msg, unpack_error, pack_error, ContextCancelled, # MessagingError, + RemoteActorError, StreamOverrun, ) from .log import get_logger @@ -64,6 +69,164 @@ if TYPE_CHECKING: log = get_logger(__name__) +async def _drain_to_final_msg( + ctx: Context, +) -> list[dict]: + +# ) -> tuple[ +# Any|Exception, +# list[dict], +# ]: + raise_overrun: bool = not ctx._allow_overruns + + # wait for a final context result by collecting (but + # basically ignoring) any bi-dir-stream msgs still in transit + # from the far end. + pre_result_drained: list[dict] = [] + while not ctx._remote_error: + try: + # NOTE: this REPL usage actually works here dawg! Bo + # from .devx._debug import pause + # await pause() + # if re := ctx._remote_error: + # ctx._maybe_raise_remote_err( + # re, + # # NOTE: obvi we don't care if we + # # overran the far end if we're already + # # waiting on a final result (msg). + # raise_overrun_from_self=raise_overrun, + # ) + + # TODO: bad idea? + # with trio.CancelScope() as res_cs: + # ctx._res_scope = res_cs + # msg: dict = await ctx._recv_chan.receive() + # if res_cs.cancelled_caught: + + # from .devx._debug import pause + # await pause() + msg: dict = await ctx._recv_chan.receive() + ctx._result: Any = msg['return'] + log.runtime( + 'Context delivered final result msg:\n' + f'{pformat(msg)}' + ) + pre_result_drained.append(msg) + # NOTE: we don't need to do this right? + # XXX: only close the rx mem chan AFTER + # a final result is retreived. + # if ctx._recv_chan: + # await ctx._recv_chan.aclose() + break + + # NOTE: we get here if the far end was + # `ContextCancelled` in 2 cases: + # 1. we requested the cancellation and thus + # SHOULD NOT raise that far end error, + # 2. WE DID NOT REQUEST that cancel and thus + # SHOULD RAISE HERE! + except trio.Cancelled: + + # CASE 2: mask the local cancelled-error(s) + # only when we are sure the remote error is + # the source cause of this local task's + # cancellation. + if re := ctx._remote_error: + ctx._maybe_raise_remote_err(re) + + # CASE 1: we DID request the cancel we simply + # continue to bubble up as normal. + raise + + except KeyError: + + if 'yield' in msg: + # far end task is still streaming to us so discard + log.warning(f'Discarding std "yield"\n{msg}') + pre_result_drained.append(msg) + continue + + # TODO: work out edge cases here where + # a stream is open but the task also calls + # this? + # -[ ] should be a runtime error if a stream is open + # right? + elif 'stop' in msg: + log.cancel( + 'Remote stream terminated due to "stop" msg:\n' + f'{msg}' + ) + pre_result_drained.append(msg) + continue + + # internal error should never get here + assert msg.get('cid'), ( + "Received internal error at portal?" 
+ ) + + # XXX fallthrough to handle expected error XXX + re: Exception|None = ctx._remote_error + if re: + log.critical( + 'Remote ctx terminated due to "error" msg:\n' + f'{re}' + ) + assert msg is ctx._cancel_msg + # NOTE: this solved a super dupe edge case XD + # this was THE super duper edge case of: + # - local task opens a remote task, + # - requests remote cancellation of far end + # ctx/tasks, + # - needs to wait for the cancel ack msg + # (ctxc) or some result in the race case + # where the other side's task returns + # before the cancel request msg is ever + # rxed and processed, + # - here this surrounding drain loop (which + # iterates all ipc msgs until the ack or + # an early result arrives) was NOT exiting + # since we are the edge case: local task + # does not re-raise any ctxc it receives + # IFF **it** was the cancellation + # requester.. + # will raise if necessary, ow break from + # loop presuming any error terminates the + # context! + ctx._maybe_raise_remote_err( + re, + # NOTE: obvi we don't care if we + # overran the far end if we're already + # waiting on a final result (msg). + # raise_overrun_from_self=False, + raise_overrun_from_self=raise_overrun, + ) + + break # OOOOOF, yeah obvi we need this.. + + # XXX we should never really get here + # right! since `._deliver_msg()` should + # always have detected an {'error': ..} + # msg and already called this right!?! + elif error := unpack_error( + msg=msg, + chan=ctx._portal.channel, + hide_tb=False, + ): + log.critical('SHOULD NEVER GET HERE!?') + assert msg is ctx._cancel_msg + assert error.msgdata == ctx._remote_error.msgdata + from .devx._debug import pause + await pause() + ctx._maybe_cancel_and_set_remote_error(error) + ctx._maybe_raise_remote_err(error) + + else: + # bubble the original src key error + raise + + return pre_result_drained + + # TODO: make this a msgspec.Struct! @dataclass class Context: @@ -118,6 +281,7 @@ class Context: # which is exactly the primitive that allows for # cross-actor-task-supervision and thus SC. _scope: trio.CancelScope | None = None + # _res_scope: trio.CancelScope|None = None # on a clean exit there should be a final value # delivered from the far end "callee" task, so @@ -205,6 +369,10 @@ class Context: ) ) + # @property + # def is_waiting_result(self) -> bool: + # return bool(self._res_scope) + @property def side(self) -> str: ''' @@ -247,7 +415,11 @@ class Context: await self.chan.send({'yield': data, 'cid': self.cid}) async def send_stop(self) -> None: - await self.chan.send({'stop': True, 'cid': self.cid}) + # await pause() + await self.chan.send({ + 'stop': True, + 'cid': self.cid + }) def _maybe_cancel_and_set_remote_error( self, @@ -320,27 +492,37 @@ class Context: # XXX: set the remote side's error so that after we cancel # whatever task is the opener of this context it can raise # that error as the reason. + # if self._remote_error: + # return + + # breakpoint() + log.cancel( + 'Setting remote error for ctx \n' + f'<= remote ctx uid: {self.chan.uid}\n' + f'=>\n{error}' + ) self._remote_error: BaseException = error if ( isinstance(error, ContextCancelled) ): - # always record the cancelling actor's uid since its cancellation - # state is linked and we want to know which process was - # the cause / requester of the cancellation. 
- self._canceller = error.canceller - log.cancel( 'Remote task-context was cancelled for ' f'actor: {self.chan.uid}\n' f'task: {self.cid}\n' f'canceller: {error.canceller}\n' ) + # always record the cancelling actor's uid since its cancellation + # state is linked and we want to know which process was + # the cause / requester of the cancellation. + # if error.canceller is None: + # import pdbp; pdbp.set_trace() + + # breakpoint() + self._canceller = error.canceller + if self._cancel_called: - # from .devx._debug import breakpoint - # await breakpoint() - # this is an expected cancel request response message # and we **don't need to raise it** in local cancel # scope since it will potentially override a real error. @@ -348,10 +530,11 @@ class Context: else: log.error( - f'Remote context error,\n' - f'remote actor: {self.chan.uid}\n' - f'task: {self.cid}\n' - f'{error}' + f'Remote context error:\n' + f'{error}\n' + f'{pformat(self)}\n' + # f'remote actor: {self.chan.uid}\n' + # f'cid: {self.cid}\n' ) self._canceller = self.chan.uid @@ -376,9 +559,11 @@ class Context: self._scope.cancel() # NOTE: this REPL usage actually works here dawg! Bo - # from .devx._debug import pause # await pause() + # TODO: maybe we have to use `._res_scope.cancel()` if it + # exists? + async def cancel( self, timeout: float = 0.616, @@ -395,6 +580,8 @@ class Context: log.cancel( f'Cancelling {side} side of context to {self.chan.uid}' ) + + # await pause() self._cancel_called: bool = True # caller side who entered `Portal.open_context()` @@ -484,13 +671,11 @@ class Context: ''' actor: Actor = current_actor() - # here we create a mem chan that corresponds to the - # far end caller / callee. - - # Likewise if the surrounding context has been cancelled we error here - # since it likely means the surrounding block was exited or - # killed - + # If the surrounding context has been cancelled by some + # task with a handle to THIS, we error here immediately + # since it likely means the surrounding lexical-scope has + # errored, been `trio.Cancelled` or at the least + # `Context.cancel()` was called by some task. if self._cancel_called: # XXX NOTE: ALWAYS RAISE any remote error here even if @@ -503,6 +688,11 @@ class Context: # actually try to stream - a cancel msg was already # sent to the other side! if self._remote_error: + # NOTE: this is diff then calling + # `._maybe_raise_from_remote_msg()` specifically + # because any task entering this `.open_stream()` + # AFTER cancellation has already been requested, + # we DO NOT want to absorb any ctxc ACK silently! raise self._remote_error # XXX NOTE: if no `ContextCancelled` has been responded @@ -529,7 +719,7 @@ class Context: # to send a stop from the caller to the callee in the # single-direction-stream case you'll get a lookup error # currently. - ctx = actor.get_context( + ctx: Context = actor.get_context( self.chan, self.cid, msg_buffer_size=msg_buffer_size, @@ -548,6 +738,19 @@ class Context: 'The underlying channel for this stream was already closed!?' ) + # NOTE: implicitly this will call `MsgStream.aclose()` on + # `.__aexit__()` due to stream's parent `Channel` type! + # + # XXX NOTE XXX: ensures the stream is "one-shot use", + # which specifically means that on exit, + # - signal ``trio.EndOfChannel``/``StopAsyncIteration`` to + # the far end indicating that the caller exited + # the streaming context purposefully by letting + # the exit block exec. 
+ # - this is diff from the cancel/error case where + # a cancel request from this side or an error + # should be sent to the far end indicating the + # stream WAS NOT just closed normally/gracefully. async with MsgStream( ctx=self, rx_chan=ctx._recv_chan, @@ -567,11 +770,37 @@ class Context: # await trio.lowlevel.checkpoint() yield stream - # NOTE: Make the stream "one-shot use". On exit, - # signal - # ``trio.EndOfChannel``/``StopAsyncIteration`` to - # the far end. - await stream.aclose() + + # XXX: (MEGA IMPORTANT) if this is a root opened process we + # wait for any immediate child in debug before popping the + # context from the runtime msg loop otherwise inside + # ``Actor._push_result()`` the msg will be discarded and in + # the case where that msg is global debugger unlock (via + # a "stop" msg for a stream), this can result in a deadlock + # where the root is waiting on the lock to clear but the + # child has already cleared it and clobbered IPC. + # + # await maybe_wait_for_debugger() + + # XXX TODO: pretty sure this isn't needed (see + # note above this block) AND will result in + # a double `.send_stop()` call. The only reason to + # put it here would be to due with "order" in + # terms of raising any remote error (as per + # directly below) or bc the stream's + # `.__aexit__()` block might not get run + # (doubtful)? Either way if we did put this back + # in we also need a state var to avoid the double + # stop-msg send.. + # + # await stream.aclose() + + # if re := ctx._remote_error: + # ctx._maybe_raise_remote_err( + # re, + # raise_ctxc_from_self_call=True, + # ) + # await trio.lowlevel.checkpoint() finally: if self._portal: @@ -587,7 +816,10 @@ class Context: def _maybe_raise_remote_err( self, err: Exception, - ) -> None: + raise_ctxc_from_self_call: bool = False, + raise_overrun_from_self: bool = True, + + ) -> ContextCancelled|None: ''' Maybe raise a remote error depending on who (which task from which actor) requested a cancellation (if any). @@ -603,13 +835,21 @@ class Context: # "error"-msg. our_uid: tuple[str, str] = current_actor().uid if ( - isinstance(err, ContextCancelled) - and ( + (not raise_ctxc_from_self_call + and isinstance(err, ContextCancelled) + and ( self._cancel_called or self.chan._cancel_called or self.canceller == our_uid - or tuple(err.canceller) == our_uid + or tuple(err.canceller) == our_uid) ) + or + (not raise_overrun_from_self + and isinstance(err, RemoteActorError) + and err.msgdata['type_str'] == 'StreamOverrun' + and tuple(err.msgdata['sender']) == our_uid + ) + ): # NOTE: we set the local scope error to any "self # cancellation" error-response thus "absorbing" @@ -661,77 +901,196 @@ class Context: assert self._portal, "Context.result() can not be called from callee!" assert self._recv_chan - if re := self._remote_error: - return self._maybe_raise_remote_err(re) + raise_overrun: bool = not self._allow_overruns + # if re := self._remote_error: + # return self._maybe_raise_remote_err( + # re, + # # NOTE: obvi we don't care if we + # # overran the far end if we're already + # # waiting on a final result (msg). + # raise_overrun_from_self=raise_overrun, + # ) + res_placeholder: int = id(self) if ( - self._result == id(self) + self._result == res_placeholder and not self._remote_error and not self._recv_chan._closed # type: ignore ): - # wait for a final context result consuming - # and discarding any bi dir stream msgs still - # in transit from the far end. 
- while True: - try: - msg = await self._recv_chan.receive() - self._result: Any = msg['return'] - # NOTE: we don't need to do this right? - # XXX: only close the rx mem chan AFTER - # a final result is retreived. - # if self._recv_chan: - # await self._recv_chan.aclose() + # wait for a final context result by collecting (but + # basically ignoring) any bi-dir-stream msgs still in transit + # from the far end. + drained_msgs: list[dict] = await _drain_to_final_msg(ctx=self) + log.runtime( + 'Ctx drained pre-result msgs:\n' + f'{drained_msgs}' + ) - break + # TODO: implement via helper func ^^^^ + # pre_result_drained: list[dict] = [] + # while not self._remote_error: + # try: + # # NOTE: this REPL usage actually works here dawg! Bo + # # from .devx._debug import pause + # # await pause() + # # if re := self._remote_error: + # # self._maybe_raise_remote_err( + # # re, + # # # NOTE: obvi we don't care if we + # # # overran the far end if we're already + # # # waiting on a final result (msg). + # # raise_overrun_from_self=raise_overrun, + # # ) - # NOTE: we get here if the far end was - # `ContextCancelled` in 2 cases: - # 1. we requested the cancellation and thus - # SHOULD NOT raise that far end error, - # 2. WE DID NOT REQUEST that cancel and thus - # SHOULD RAISE HERE! - except trio.Cancelled: + # # TODO: bad idea? + # # with trio.CancelScope() as res_cs: + # # self._res_scope = res_cs + # # msg: dict = await self._recv_chan.receive() + # # if res_cs.cancelled_caught: - # CASE 2: mask the local cancelled-error(s) - # only when we are sure the remote error is the - # (likely) source cause of this local runtime - # task's cancellation. - if re := self._remote_error: - self._maybe_raise_remote_err(re) + # # from .devx._debug import pause + # # await pause() + # msg: dict = await self._recv_chan.receive() + # self._result: Any = msg['return'] + # log.runtime( + # 'Context delivered final result msg:\n' + # f'{pformat(msg)}' + # ) + # # NOTE: we don't need to do this right? + # # XXX: only close the rx mem chan AFTER + # # a final result is retreived. + # # if self._recv_chan: + # # await self._recv_chan.aclose() + # break - # CASE 1: we DID request the cancel we simply - # continue to bubble up as normal. - raise + # # NOTE: we get here if the far end was + # # `ContextCancelled` in 2 cases: + # # 1. we requested the cancellation and thus + # # SHOULD NOT raise that far end error, + # # 2. WE DID NOT REQUEST that cancel and thus + # # SHOULD RAISE HERE! + # except trio.Cancelled: - except KeyError: # as msgerr: + # # CASE 2: mask the local cancelled-error(s) + # # only when we are sure the remote error is + # # the source cause of this local task's + # # cancellation. + # if re := self._remote_error: + # self._maybe_raise_remote_err(re) - if 'yield' in msg: - # far end task is still streaming to us so discard - log.warning(f'Discarding stream delivered {msg}') - continue + # # CASE 1: we DID request the cancel we simply + # # continue to bubble up as normal. + # raise - elif 'stop' in msg: - log.debug('Remote stream terminated') - continue + # except KeyError: - # internal error should never get here - assert msg.get('cid'), ( - "Received internal error at portal?" 
- ) + # if 'yield' in msg: + # # far end task is still streaming to us so discard + # log.warning(f'Discarding std "yield"\n{msg}') + # pre_result_drained.append(msg) + # continue - if err:= unpack_error( - msg, - self._portal.channel - ): # from msgerr - self._maybe_cancel_and_set_remote_error(err) - self._maybe_raise_remote_err(err) + # # TODO: work out edge cases here where + # # a stream is open but the task also calls + # # this? + # # -[ ] should be a runtime error if a stream is open + # # right? + # elif 'stop' in msg: + # log.cancel( + # 'Remote stream terminated due to "stop" msg:\n' + # f'{msg}' + # ) + # pre_result_drained.append(msg) + # continue - else: - raise + # # internal error should never get here + # assert msg.get('cid'), ( + # "Received internal error at portal?" + # ) - if re := self._remote_error: - return self._maybe_raise_remote_err(re) + # # XXX fallthrough to handle expected error XXX + # re: Exception|None = self._remote_error + # if re: + # log.critical( + # 'Remote ctx terminated due to "error" msg:\n' + # f'{re}' + # ) + # assert msg is self._cancel_msg + # # NOTE: this solved a super dupe edge case XD + # # this was THE super duper edge case of: + # # - local task opens a remote task, + # # - requests remote cancellation of far end + # # ctx/tasks, + # # - needs to wait for the cancel ack msg + # # (ctxc) or some result in the race case + # # where the other side's task returns + # # before the cancel request msg is ever + # # rxed and processed, + # # - here this surrounding drain loop (which + # # iterates all ipc msgs until the ack or + # # an early result arrives) was NOT exiting + # # since we are the edge case: local task + # # does not re-raise any ctxc it receives + # # IFF **it** was the cancellation + # # requester.. + # # will raise if necessary, ow break from + # # loop presuming any error terminates the + # # context! + # self._maybe_raise_remote_err( + # re, + # # NOTE: obvi we don't care if we + # # overran the far end if we're already + # # waiting on a final result (msg). + # # raise_overrun_from_self=False, + # raise_overrun_from_self=raise_overrun, + # ) + + # break # OOOOOF, yeah obvi we need this.. + + # # XXX we should never really get here + # # right! since `._deliver_msg()` should + # # always have detected an {'error': ..} + # # msg and already called this right!?! + # elif error := unpack_error( + # msg=msg, + # chan=self._portal.channel, + # hide_tb=False, + # ): + # log.critical('SHOULD NEVER GET HERE!?') + # assert msg is self._cancel_msg + # assert error.msgdata == self._remote_error.msgdata + # from .devx._debug import pause + # await pause() + # self._maybe_cancel_and_set_remote_error(error) + # self._maybe_raise_remote_err(error) + + # else: + # # bubble the original src key error + # raise + + if ( + (re := self._remote_error) + and self._result == res_placeholder + ): + maybe_err: Exception|None = self._maybe_raise_remote_err( + re, + # NOTE: obvi we don't care if we + # overran the far end if we're already + # waiting on a final result (msg). 
+ # raise_overrun_from_self=False, + raise_overrun_from_self=( + raise_overrun + and + # only when we ARE NOT the canceller + # should we raise overruns, bc ow we're + # raising something we know might happen + # during cancellation ;) + (not self._cancel_called) + ), + ) + if maybe_err: + self._result = maybe_err return self._result @@ -779,7 +1138,7 @@ class Context: while self._overflow_q: # NOTE: these msgs should never be errors since we always do # the check prior to checking if we're in an overrun state - # inside ``.deliver_msg()``. + # inside ``._deliver_msg()``. msg = self._overflow_q.popleft() try: await self._send_chan.send(msg) @@ -830,34 +1189,50 @@ class Context: messages are eventually sent if possible. ''' - cid = self.cid - chan = self.chan - uid = chan.uid + cid: str = self.cid + chan: Channel = self.chan + from_uid: tuple[str, str] = chan.uid send_chan: trio.MemorySendChannel = self._send_chan - log.runtime( - f"Delivering {msg} from {uid} to caller {cid}" - ) - - if ( - msg.get('error') # check for field - and ( - error := unpack_error( - msg, - self.chan, - ) + if re := unpack_error( + msg, + self.chan, + ): + log.error( + f'Delivering error-msg from {from_uid} to caller {cid}' + f'{re}' ) - ): self._cancel_msg = msg - self._maybe_cancel_and_set_remote_error(error) + self._maybe_cancel_and_set_remote_error(re) - if ( - self._in_overrun - ): + # XXX NEVER do this XXX..!! + # bc if the error is a ctxc and there is a task + # waiting on `.result()` we need the msg to be sent + # over the `send_chan`/`._recv_chan` so that the error + # is relayed to that waiter task.. + # return True + # + # XXX ALSO NO!! XXX + # if self._remote_error: + # self._maybe_raise_remote_err(error) + + if self._in_overrun: + log.warning( + f'Capturing overrun-msg from {from_uid} to caller {cid}' + f'{msg}' + ) self._overflow_q.append(msg) return False try: + log.runtime( + f'Delivering IPC `Context` msg:\n' + f'<= {from_uid}\n' + f'=> caller: {cid}\n' + f'{msg}' + ) + # from .devx._debug import pause + # await pause() send_chan.send_nowait(msg) return True # if an error is deteced we should always @@ -890,7 +1265,8 @@ class Context: lines = [ f'OVERRUN on actor-task context {cid}@{local_uid}!\n' # TODO: put remote task name here if possible? - f'remote sender actor: {uid}', + f'sender: {from_uid}', + f'msg: {msg}', # TODO: put task func name here and maybe an arrow # from sender to overrunner? # f'local task {self.func_name}' @@ -926,11 +1302,19 @@ class Context: # anything different. return False else: + # raise local overrun and immediately pack as IPC + # msg for far end. 
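+                # the exc is raised (and immediately caught) so that
+                # a traceback gets attached when it's boxed via
+                # `pack_error()`; the boxed msg is tagged with this
+                # ctx's `cid` and the overrunning peer's uid
+                # (`sender=from_uid`) then sent back over the channel
+                # so that peer's `._maybe_raise_remote_err()` can
+                # decide whether to raise or absorb it.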
try: - raise StreamOverrun(text) + raise StreamOverrun( + text, + sender=from_uid, + ) except StreamOverrun as err: - err_msg = pack_error(err) - err_msg['cid'] = cid + err_msg: dict[str, dict] = pack_error( + err, + cid=cid, + ) + # err_msg['cid']: str = cid try: await chan.send(err_msg) except trio.BrokenResourceError: diff --git a/tractor/_portal.py b/tractor/_portal.py index 378f6a2..14f6fbf 100644 --- a/tractor/_portal.py +++ b/tractor/_portal.py @@ -39,7 +39,15 @@ import trio from async_generator import asynccontextmanager from .trionics import maybe_open_nursery -from ._state import current_actor +from .devx import ( + # acquire_debug_lock, + # pause, + maybe_wait_for_debugger, +) +from ._state import ( + current_actor, + debug_mode, +) from ._ipc import Channel from .log import get_logger from .msg import NamespacePath @@ -48,6 +56,7 @@ from ._exceptions import ( unpack_error, NoResult, ContextCancelled, + RemoteActorError, ) from ._context import ( Context, @@ -55,7 +64,6 @@ from ._context import ( from ._streaming import ( MsgStream, ) -from .devx._debug import maybe_wait_for_debugger log = get_logger(__name__) @@ -469,7 +477,6 @@ class Portal: ctx._started_called: bool = True except KeyError as src_error: - _raise_from_no_key_in_msg( ctx=ctx, msg=msg, @@ -494,6 +501,33 @@ class Portal: # in enter tuple. yield ctx, first + # between the caller exiting and arriving here the + # far end may have sent a ctxc-msg or other error, + # so check for it here immediately and maybe raise + # so as to engage the ctxc handling block below! + # if re := ctx._remote_error: + # maybe_ctxc: ContextCancelled|None = ctx._maybe_raise_remote_err( + # re, + + # # TODO: do we want this to always raise? + # # - means that on self-ctxc, if/when the + # # block is exited before the msg arrives + # # but then the msg during __exit__ + # # calling we may not activate the + # # ctxc-handler block below? should we + # # be? + # # - if there's a remote error that arrives + # # after the child has exited, we won't + # # handle until the `finally:` block + # # where `.result()` is always called, + # # again in which case we handle it + # # differently then in the handler block + # # that would normally engage from THIS + # # block? + # raise_ctxc_from_self_call=True, + # ) + # assert maybe_ctxc + # when in allow_overruns mode there may be # lingering overflow sender tasks remaining? if nurse.child_tasks: @@ -539,7 +573,7 @@ class Portal: # `.canceller: tuple[str, str]` to be same value as # caught here in a `ContextCancelled.canceller`. # - # Again, there are 2 cases: + # AGAIN to restate the above, there are 2 cases: # # 1-some other context opened in this `.open_context()` # block cancelled due to a self or peer cancellation @@ -555,6 +589,16 @@ class Portal: except ContextCancelled as ctxc: scope_err = ctxc + # XXX TODO XXX: FIX THIS debug_mode BUGGGG!!! + # using this code and then resuming the REPL will + # cause a SIGINT-ignoring HANG! + # -> prolly due to a stale debug lock entry.. + # -[ ] USE `.stackscope` to demonstrate that (possibly + # documenting it as a definittive example of + # debugging the tractor-runtime itself using it's + # own `.devx.` tooling! + # await pause() + # CASE 2: context was cancelled by local task calling # `.cancel()`, we don't raise and the exit block should # exit silently. 
@@ -562,18 +606,23 @@ class Portal: ctx._cancel_called and ( ctxc is ctx._remote_error - or - ctxc.canceller is self.canceller + # ctxc.msgdata == ctx._remote_error.msgdata + + # TODO: uhh `Portal.canceller` ain't a thangg + # dawg? (was `self.canceller` before?!?) + and + ctxc.canceller == self.actor.uid ) ): - log.debug( - f'Context {ctx} cancelled gracefully with:\n' + log.cancel( + f'Context (cid=[{ctx.cid[-6:]}..] cancelled gracefully with:\n' f'{ctxc}' ) # CASE 1: this context was never cancelled via a local # task (tree) having called `Context.cancel()`, raise # the error since it was caused by someone else! else: + # await pause() raise # the above `._scope` can be cancelled due to: @@ -602,8 +651,8 @@ class Portal: trio.Cancelled, # NOTE: NOT from inside the ctx._scope KeyboardInterrupt, - ) as err: - scope_err = err + ) as caller_err: + scope_err = caller_err # XXX: ALWAYS request the context to CANCEL ON any ERROR. # NOTE: `Context.cancel()` is conversely NEVER CALLED in @@ -611,11 +660,26 @@ class Portal: # handled in the block above! log.cancel( 'Context cancelled for task due to\n' - f'{err}\n' + f'{caller_err}\n' 'Sending cancel request..\n' f'task:{cid}\n' f'actor:{uid}' ) + + if debug_mode(): + log.pdb( + 'Delaying `ctx.cancel()` until debug lock ' + 'acquired..' + ) + # async with acquire_debug_lock(self.actor.uid): + # pass + # TODO: factor ^ into below for non-root cases? + await maybe_wait_for_debugger() + log.pdb( + 'Acquired debug lock! ' + 'Calling `ctx.cancel()`!' + ) + try: await ctx.cancel() except trio.BrokenResourceError: @@ -629,6 +693,33 @@ class Portal: # no local scope error, the "clean exit with a result" case. else: + # between the caller exiting and arriving here the + # far end may have sent a ctxc-msg or other error, + # so check for it here immediately and maybe raise + # so as to engage the ctxc handling block below! + # if re := ctx._remote_error: + # maybe_ctxc: ContextCancelled|None = ctx._maybe_raise_remote_err( + # re, + + # # TODO: do we want this to always raise? + # # - means that on self-ctxc, if/when the + # # block is exited before the msg arrives + # # but then the msg during __exit__ + # # calling we may not activate the + # # ctxc-handler block below? should we + # # be? + # # - if there's a remote error that arrives + # # after the child has exited, we won't + # # handle until the `finally:` block + # # where `.result()` is always called, + # # again in which case we handle it + # # differently then in the handler block + # # that would normally engage from THIS + # # block? + # raise_ctxc_from_self_call=True, + # ) + # assert maybe_ctxc + if ctx.chan.connected(): log.info( 'Waiting on final context-task result for\n' @@ -645,13 +736,8 @@ class Portal: # As per `Context._deliver_msg()`, that error IS # ALWAYS SET any time "callee" side fails and causes "caller # side" cancellation via a `ContextCancelled` here. - # result = await ctx.result() try: - result = await ctx.result() - log.runtime( - f'Context {fn_name} returned value from callee:\n' - f'`{result}`' - ) + result_or_err: Exception|Any = await ctx.result() except BaseException as berr: # on normal teardown, if we get some error # raised in `Context.result()` we still want to @@ -663,7 +749,48 @@ class Portal: scope_err = berr raise + # an exception type boxed in a `RemoteActorError` + # is returned (meaning it was obvi not raised). + msgdata: str|None = getattr( + result_or_err, + 'msgdata', + None + ) + # yes! 
this worx Bp + # from .devx import _debug + # await _debug.pause() + match (msgdata, result_or_err): + case ( + {'tb_str': tbstr}, + ContextCancelled(), + ): + log.cancel(tbstr) + + case ( + {'tb_str': tbstr}, + RemoteActorError(), + ): + log.exception( + f'Context `{fn_name}` remotely errored:\n' + f'`{tbstr}`' + ) + case (None, _): + log.runtime( + f'Context {fn_name} returned value from callee:\n' + f'`{result_or_err}`' + ) + finally: + # XXX: (MEGA IMPORTANT) if this is a root opened process we + # wait for any immediate child in debug before popping the + # context from the runtime msg loop otherwise inside + # ``Actor._push_result()`` the msg will be discarded and in + # the case where that msg is global debugger unlock (via + # a "stop" msg for a stream), this can result in a deadlock + # where the root is waiting on the lock to clear but the + # child has already cleared it and clobbered IPC. + await maybe_wait_for_debugger() + # though it should be impossible for any tasks # operating *in* this scope to have survived # we tear down the runtime feeder chan last @@ -708,6 +835,10 @@ class Portal: # out any exception group or legit (remote) ctx # error that sourced from the remote task or its # runtime. + # + # NOTE: further, this should be the only place the + # underlying feeder channel is + # once-and-only-CLOSED! with trio.CancelScope(shield=True): await ctx._recv_chan.aclose() @@ -737,18 +868,11 @@ class Portal: f'actor:{uid}' ) - # XXX: (MEGA IMPORTANT) if this is a root opened process we - # wait for any immediate child in debug before popping the - # context from the runtime msg loop otherwise inside - # ``Actor._push_result()`` the msg will be discarded and in - # the case where that msg is global debugger unlock (via - # a "stop" msg for a stream), this can result in a deadlock - # where the root is waiting on the lock to clear but the - # child has already cleared it and clobbered IPC. - await maybe_wait_for_debugger() - # FINALLY, remove the context from runtime tracking and # exit! 
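+        # (the `._contexts` entry dropped below is the counterpart to
+        # this ctx's registration when it was first opened/allocated
+        # by the runtime)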
+ log.runtime( + f'Exiting context opened with {ctx.chan.uid}' + ) self.actor._contexts.pop( (self.channel.uid, ctx.cid), None, diff --git a/tractor/_streaming.py b/tractor/_streaming.py index 4530e14..e8f735e 100644 --- a/tractor/_streaming.py +++ b/tractor/_streaming.py @@ -21,8 +21,9 @@ The machinery and types behind ``Context.open_stream()`` ''' from __future__ import annotations -import inspect from contextlib import asynccontextmanager as acm +import inspect +from pprint import pformat from typing import ( Any, Callable, @@ -35,6 +36,7 @@ import trio from ._exceptions import ( _raise_from_no_key_in_msg, + ContextCancelled, ) from .log import get_logger from .trionics import ( @@ -84,8 +86,8 @@ class MsgStream(trio.abc.Channel): self._broadcaster = _broadcaster # flag to denote end of stream - self._eoc: bool = False - self._closed: bool = False + self._eoc: bool|trio.EndOfChannel = False + self._closed: bool|trio.ClosedResourceError = False # delegate directly to underlying mem channel def receive_nowait(self): @@ -93,6 +95,9 @@ class MsgStream(trio.abc.Channel): try: return msg['yield'] except KeyError as kerr: + # if 'return' in msg: + # return msg + _raise_from_no_key_in_msg( ctx=self._ctx, msg=msg, @@ -122,30 +127,43 @@ class MsgStream(trio.abc.Channel): # see ``.aclose()`` for notes on the old behaviour prior to # introducing this if self._eoc: - raise trio.EndOfChannel + raise self._eoc + # raise trio.EndOfChannel if self._closed: - raise trio.ClosedResourceError('This stream was closed') + raise self._closed + # raise trio.ClosedResourceError( + # 'This stream was already closed' + # ) + src_err: Exception|None = None try: - msg = await self._rx_chan.receive() - return msg['yield'] + try: + msg = await self._rx_chan.receive() + return msg['yield'] - except KeyError as kerr: - _raise_from_no_key_in_msg( - ctx=self._ctx, - msg=msg, - src_err=kerr, - log=log, - expect_key='yield', - stream=self, - ) + except KeyError as kerr: + src_err = kerr + # NOTE: may raise any of the below error types + # includg EoC when a 'stop' msg is found. + _raise_from_no_key_in_msg( + ctx=self._ctx, + msg=msg, + src_err=kerr, + log=log, + expect_key='yield', + stream=self, + ) + + # XXX: we close the stream on any of these error conditions: except ( - trio.ClosedResourceError, # by self._rx_chan + # trio.ClosedResourceError, # by self._rx_chan trio.EndOfChannel, # by self._rx_chan or `stop` msg from far end - ): - # XXX: we close the stream on any of these error conditions: + ) as eoc: + src_err = eoc + self._eoc = eoc + # await trio.sleep(1) # a ``ClosedResourceError`` indicates that the internal # feeder memory receive channel was closed likely by the @@ -168,14 +186,53 @@ class MsgStream(trio.abc.Channel): # closing this stream and not flushing a final value to # remaining (clone) consumers who may not have been # scheduled to receive it yet. + # try: + # maybe_err_msg_or_res: dict = self._rx_chan.receive_nowait() + # if maybe_err_msg_or_res: + # log.warning( + # 'Discarding un-processed msg:\n' + # f'{maybe_err_msg_or_res}' + # ) + # except trio.WouldBlock: + # # no queued msgs that might be another remote + # # error, so just raise the original EoC + # pass - # when the send is closed we assume the stream has - # terminated and signal this local iterator to stop - await self.aclose() + # raise eoc - raise # propagate + except trio.ClosedResourceError as cre: # by self._rx_chan + src_err = cre + log.warning( + '`Context._rx_chan` was already closed?' 
+ ) + self._closed = cre - async def aclose(self): + # when the send is closed we assume the stream has + # terminated and signal this local iterator to stop + drained: list[Exception|dict] = await self.aclose() + if drained: + log.warning( + 'Drained context msgs during closure:\n' + f'{drained}' + ) + # TODO: pass these to the `._ctx._drained_msgs: deque` + # and then iterate them as part of any `.result()` call? + + # NOTE XXX: if the context was cancelled or remote-errored + # but we received the stream close msg first, we + # probably want to instead raise the remote error + # over the end-of-stream connection error since likely + # the remote error was the source cause? + ctx: Context = self._ctx + if re := ctx._remote_error: + ctx._maybe_raise_remote_err( + re, + raise_ctxc_from_self_call=True, + ) + + raise src_err # propagate + + async def aclose(self) -> list[Exception|dict]: ''' Cancel associated remote actor task and local memory channel on close. @@ -185,15 +242,55 @@ class MsgStream(trio.abc.Channel): # https://trio.readthedocs.io/en/stable/reference-io.html#trio.abc.AsyncResource.aclose rx_chan = self._rx_chan - if rx_chan._closed: - log.cancel(f"{self} is already closed") + if ( + rx_chan._closed + or + self._closed + ): + log.cancel( + f'`MsgStream` is already closed\n' + f'.cid: {self._ctx.cid}\n' + f'._rx_chan`: {rx_chan}\n' + f'._eoc: {self._eoc}\n' + f'._closed: {self._eoc}\n' + ) # this stream has already been closed so silently succeed as # per ``trio.AsyncResource`` semantics. # https://trio.readthedocs.io/en/stable/reference-io.html#trio.abc.AsyncResource.aclose - return + return [] - self._eoc = True + ctx: Context = self._ctx + # caught_eoc: bool = False + drained: list[Exception|dict] = [] + while not drained: + try: + maybe_final_msg = self.receive_nowait() + if maybe_final_msg: + log.cancel( + 'Drained un-processed stream msg:\n' + f'{pformat(maybe_final_msg)}' + ) + # TODO: inject into parent `Context` buf? + drained.append(maybe_final_msg) + + except trio.WouldBlock as be: + drained.append(be) + break + + except trio.EndOfChannel as eoc: + drained.append(eoc) + # caught_eoc = True + self._eoc: bool = eoc + break + + except ContextCancelled as ctxc: + log.cancel( + 'Context was cancelled during stream closure:\n' + f'canceller: {ctxc.canceller}\n' + f'{pformat(ctxc.msgdata)}' + ) + break # NOTE: this is super subtle IPC messaging stuff: # Relay stop iteration to far end **iff** we're @@ -224,26 +321,33 @@ class MsgStream(trio.abc.Channel): except ( trio.BrokenResourceError, trio.ClosedResourceError - ): + ) as re: # the underlying channel may already have been pulled # in which case our stop message is meaningless since # it can't traverse the transport. - ctx = self._ctx log.warning( f'Stream was already destroyed?\n' f'actor: {ctx.chan.uid}\n' f'ctx id: {ctx.cid}' ) + drained.append(re) + self._closed = re - self._closed = True + # if caught_eoc: + # # from .devx import _debug + # # await _debug.pause() + # with trio.CancelScope(shield=True): + # await rx_chan.aclose() - # Do we close the local mem chan ``self._rx_chan`` ??!? + # self._eoc: bool = caught_eoc - # NO, DEFINITELY NOT if we're a bi-dir ``MsgStream``! - # BECAUSE this same core-msg-loop mem recv-chan is used to deliver - # the potential final result from the surrounding inter-actor - # `Context` so we don't want to close it until that context has - # run to completion. + # ?XXX WAIT, why do we not close the local mem chan `._rx_chan` XXX? + # => NO, DEFINITELY NOT! 
<= + # if we're a bi-dir ``MsgStream`` BECAUSE this same + # core-msg-loop mem recv-chan is used to deliver the + # potential final result from the surrounding inter-actor + # `Context` so we don't want to close it until that + # context has run to completion. # XXX: Notes on old behaviour: # await rx_chan.aclose() @@ -272,6 +376,8 @@ class MsgStream(trio.abc.Channel): # runtime's closure of ``rx_chan`` in the case where we may # still need to consume msgs that are "in transit" from the far # end (eg. for ``Context.result()``). + # self._closed = True + return drained @acm async def subscribe( @@ -337,9 +443,13 @@ class MsgStream(trio.abc.Channel): raise self._ctx._remote_error # from None if self._closed: - raise trio.ClosedResourceError('This stream was already closed') + raise self._closed + # raise trio.ClosedResourceError('This stream was already closed') - await self._ctx.chan.send({'yield': data, 'cid': self._ctx.cid}) + await self._ctx.chan.send({ + 'yield': data, + 'cid': self._ctx.cid, + }) def stream(func: Callable) -> Callable:
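
# Illustrative usage sketches (NOT part of this patch; fn/actor names
# are made up) showing the two caller-side flows the changes above are
# concerned with.
#
# 1) the "graceful self-cancel" flow: the opener task requests
#    cancellation itself, so the `ContextCancelled` ack coming back
#    from the child should be absorbed on exit (and must not hang the
#    drain loop in `_drain_to_final_msg()`).

import trio
import tractor


@tractor.context
async def sleep_forever(
    ctx: tractor.Context,
) -> None:
    # child side: ack the context open then block until cancelled.
    await ctx.started()
    await trio.sleep_forever()


async def cancel_child() -> None:
    async with tractor.open_nursery() as an:
        portal = await an.start_actor(
            'sleeper',
            enable_modules=[__name__],
        )
        async with portal.open_context(sleep_forever) as (ctx, first):
            assert first is None
            # request cancellation of the far end task; the ctxc-ack
            # delivered on exit is expected and swallowed since this
            # task is the canceller.
            await ctx.cancel()

        await portal.cancel_actor()


if __name__ == '__main__':
    trio.run(cancel_child)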
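
# 2) a bi-dir stream where the parent stops consuming early, leaving
#    'yield'/'stop' msgs in transit; `Context.result()` (via the new
#    `_drain_to_final_msg()`) discards them and still delivers the
#    child's final return value instead of hanging.
#    (reuses the `trio`/`tractor` imports from the sketch above; drive
#    it the same way with `trio.run(consume_a_few)` from a module the
#    child actor can import)

@tractor.context
async def streamer(
    ctx: tractor.Context,
) -> str:
    await ctx.started()
    async with ctx.open_stream() as stream:
        for i in range(5):
            await stream.send(i)
    return 'final'


async def consume_a_few() -> None:
    async with tractor.open_nursery() as an:
        portal = await an.start_actor(
            'streamer',
            enable_modules=[__name__],
        )
        async with portal.open_context(streamer) as (ctx, _):
            async with ctx.open_stream() as stream:
                async for msg in stream:
                    if msg >= 2:
                        break  # leave the remaining msgs un-consumed

            # any still-queued 'yield'/'stop' msgs are drained here
            # before the final 'return' msg is found.
            assert await ctx.result() == 'final'

        await portal.cancel_actor()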