From 185dbc7e3ff7939f42610bd1fbfcd1c85f9dc394 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet
Date: Sun, 5 Dec 2021 19:31:41 -0500
Subject: [PATCH] Disable msg stream backpressure by default

Half of portal API usage requires a single message response (`.run()`,
`.run_in_actor()`) and the streaming APIs should require the user to
explicitly enable backpressure if they want it. This makes more sense
in (pseudo) realtime systems where it's better to be notified of a
blocked stream than to freeze without notice. Make this the default
behaviour and add a new error, `tractor._exceptions.StreamOverrun`,
raised when a sender overruns a stream's default-sized buffer (2**6
msgs for now). The old behaviour can be re-enabled with
`Context.open_stream(backpressure=True)`, though warning log messages
are now emitted on overruns (a usage sketch follows in the notes
below).

Add task-linked-context error propagation using a "nursery raising"
technique (sketched in the notes below) such that if either end of a
context-linked pair of tasks errors, that error can be relayed to the
other side and raised as a form of interrupt at the receiving task's
next `trio` checkpoint. This enables reliable error relay without
expecting the (error-)receiving task to call an API which would raise
the remote exception (which it might otherwise never do when using the
`tractor.MsgStream` APIs).

Further internal implementation details:
- define the default msg buffer size as `Actor.msg_buffer_size`
- expose a `msg_buffer_size: int` kwarg from `Actor.get_context()`
- maybe raise aforementioned context errors using
  `Context._maybe_error_from_remote_msg()` inside `Actor._push_result()`
- support optional backpressure on a stream when pushing messages in
  `Actor._push_result()`
- in `_invoke()` handle multierrors raised from a `@tractor.context`
  entrypoint as being potentially caused by a relayed error from the
  remote caller task; if `Context._error` has been set, raise that error
  inside the `RemoteActorError` that will be relayed back to the caller,
  more or less proxying the source-side error back to its origin
---
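Note for reviewers: a rough usage sketch of the new knob. The actor and
function names are made up and this assumes the `@tractor.context` /
`Portal.open_context()` API from this series; treat it as illustrative,
not a test:

    import trio
    import tractor


    @tractor.context
    async def fast_sender(
        ctx: tractor.Context,
    ) -> None:
        await ctx.started()
        async with ctx.open_stream() as stream:
            # blast out more msgs than the default 2**6 buffer holds
            for i in range(2**10):
                await stream.send(i)


    async def main():
        async with tractor.open_nursery() as n:
            portal = await n.start_actor(
                'sender',
                enable_modules=[__name__],
            )
            async with portal.open_context(fast_sender) as (ctx, _first):
                # opt back into the old blocking behaviour; without
                # ``backpressure=True`` the deliberately slow loop below
                # should instead see the sender side error out with a
                # relayed ``StreamOverrun``.
                async with ctx.open_stream(backpressure=True) as stream:
                    async for msg in stream:
                        await trio.sleep(0.1)

            await portal.cancel_actor()


    if __name__ == '__main__':
        trio.run(main)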
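Note for reviewers: the "nursery raising" trick distilled to plain
`trio`. This is a minimal sketch only, not tractor's exact internals;
in the patch the equivalent is driven through `Context._scope_nursery`
by `Context._maybe_error_from_remote_msg()`:

    import trio


    def raise_in(nursery: trio.Nursery, err: BaseException) -> None:
        # spawn a task whose only job is to raise: the nursery then
        # cancels all sibling tasks and re-raises the error at the
        # parent's next checkpoint.
        async def _raiser() -> None:
            raise err

        nursery.start_soon(_raiser)


    async def main() -> None:
        try:
            async with trio.open_nursery() as n:
                # stand-in for a user task blocked on stream reads
                n.start_soon(trio.sleep_forever)
                await trio.sleep(0.1)
                # simulate an error msg arriving from the far end
                raise_in(n, RuntimeError('relayed remote error'))
        except RuntimeError as err:
            print(f'task interrupted by: {err}')


    trio.run(main)

Because the interrupt arrives via cancellation, the user task doesn't
need to be blocked on any particular tractor API to see the remote
error.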
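Note for reviewers: distilled, the delivery policy added to
`Actor._push_result()` (see the hunk below) is a non-blocking-first
send. Here it is stand-alone, with a local stand-in for the new
exception type:

    import trio


    class StreamOverrun(Exception):
        # stand-in for ``tractor._exceptions.StreamOverrun``
        ...


    async def deliver(
        send_chan: trio.MemorySendChannel,
        msg: dict,
        backpressure: bool,
    ) -> None:
        try:
            # always try a non-blocking put first
            send_chan.send_nowait(msg)
        except trio.WouldBlock:
            if backpressure:
                # old behaviour: block until the consumer catches up
                await send_chan.send(msg)
            else:
                # new default: overruns are errors relayed to the sender
                raise StreamOverrun(f'{msg} overran the stream!')


    async def demo() -> None:
        send, _recv = trio.open_memory_channel(1)
        await deliver(send, {'n': 1}, backpressure=False)  # fits
        try:
            await deliver(send, {'n': 2}, backpressure=False)  # full!
        except StreamOverrun as err:
            print(f'sender notified: {err}')


    trio.run(demo)

The ordering matters: `send_nowait()` never yields to the scheduler, so
a full buffer is detected synchronously and the sender can be notified
about the exact message that overran.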
 tractor/_actor.py | 99 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 76 insertions(+), 23 deletions(-)

diff --git a/tractor/_actor.py b/tractor/_actor.py
index 36f0092..b78b872 100644
--- a/tractor/_actor.py
+++ b/tractor/_actor.py
@@ -32,6 +32,7 @@ from ._exceptions import (
     is_multi_cancelled,
     ContextCancelled,
     TransportClosed,
+    StreamOverrun,
 )
 from . import _debug
 from ._discovery import get_arbiter
@@ -161,16 +162,27 @@
             # context func with support for bi-dir streaming
             await chan.send({'functype': 'context', 'cid': cid})

-            async with trio.open_nursery() as scope_nursery:
-                ctx._scope_nursery = scope_nursery
-                cs = scope_nursery.cancel_scope
-                task_status.started(cs)
-                try:
+            try:
+                async with trio.open_nursery() as scope_nursery:
+                    ctx._scope_nursery = scope_nursery
+                    cs = scope_nursery.cancel_scope
+                    task_status.started(cs)
                     await chan.send({'return': await coro, 'cid': cid})
-                except trio.Cancelled as err:
-                    tb = err.__traceback__
-            if cs.cancelled_caught:
+            except trio.Cancelled as err:
+                tb = err.__traceback__
+                if ctx._error is not None:
+                    tb = ctx._error.__traceback__
+                    raise ctx._error
+
+            except trio.MultiError as err:
+                if ctx._error is not None:
+                    tb = ctx._error.__traceback__
+                    raise ctx._error from err
+                else:
+                    raise
+
+            if cs.cancelled_caught or ctx._error:
                 # TODO: pack in ``trio.Cancelled.__traceback__`` here
                 # so they can be unwrapped and displayed on the caller
@@ -314,6 +326,7 @@ class Actor:
     # ugh, we need to get rid of this and replace with a "registry" sys
     # https://github.com/goodboy/tractor/issues/216
     is_arbiter: bool = False
+    msg_buffer_size: int = 2**6

     # nursery placeholders filled in by `_async_main()` after fork
     _root_n: Optional[trio.Nursery] = None
@@ -548,7 +561,7 @@ class Actor:
             # now in a cancelled condition) when the local runtime here
             # is now cancelled while (presumably) in the middle of msg
             # loop processing.
-            with trio.move_on_after(0.1) as cs:
+            with trio.move_on_after(0.5) as cs:
                 cs.shield = True
                 # Attempt to wait for the far end to close the channel
                 # and bail after timeout (2-generals on closure).
@@ -611,23 +624,54 @@
         cid: str,
         msg: dict[str, Any],

     ) -> None:
-        """Push an RPC result to the local consumer's queue.
-        """
+        '''
+        Push an RPC result to the local consumer's queue.
+
+        '''
         assert chan.uid, f"`chan.uid` can't be {chan.uid}"
         ctx = self._contexts[(chan.uid, cid)]
         send_chan = ctx._send_chan
         assert send_chan

-        # TODO: relaying far end context errors to the local
-        # context through nursery raising?
-        # if 'error' in msg:
-        #     ctx._error_from_remote_msg(msg)
-        #     log.runtime(f"{send_chan} was terminated at remote end")
+        if msg.get('error'):
+            # If this is an error message from a context opened by
+            # ``Portal.open_context()`` we want to interrupt any ongoing
+            # (child) tasks within that context to be notified of the remote
+            # error relayed here.
+            #
+            # The reason we may want to raise the remote error immediately
+            # is that there is no guarantee the associated local task(s)
+            # will attempt to read from any locally opened stream any time
+            # soon.
+            #
+            # NOTE: this only applies when
+            # ``Portal.open_context()`` has been called since it is assumed
+            # (currently) that other portal APIs (``Portal.run()``,
+            # ``.run_in_actor()``) do their own error checking at the point
+            # of the call and result processing.
+            log.warning(f'Remote context for {chan.uid}:{cid} errored')
+            ctx._maybe_error_from_remote_msg(msg)

         try:
             log.runtime(f"Delivering {msg} from {chan.uid} to caller {cid}")
-            # maintain backpressure
-            await send_chan.send(msg)
+
+            # XXX: we do **not** maintain backpressure and instead
+            # opt to relay stream overrun errors to the sender.
+            try:
+                send_chan.send_nowait(msg)
+            except trio.WouldBlock:
+                log.warning(f'Caller task {cid} was overrun!?')
+                if ctx._backpressure:
+                    await send_chan.send(msg)
+                else:
+                    try:
+                        raise StreamOverrun(
+                            f'Context stream {cid} for {chan.uid} was overrun!'
+                        )
+                    except StreamOverrun as err:
+                        err_msg = pack_error(err)
+                        err_msg['cid'] = cid
+                        await chan.send(err_msg)

         except trio.BrokenResourceError:
             # TODO: what is the right way to handle the case where the
@@ -644,7 +688,7 @@ class Actor:
         self,
         chan: Channel,
         cid: str,
-        max_buffer_size: int = 2**6,
+        msg_buffer_size: Optional[int] = None,

     ) -> Context:
         '''
@@ -660,10 +704,17 @@
         assert actor_uid
         try:
             ctx = self._contexts[(actor_uid, cid)]
+
+            # adjust buffer size if specified
+            state = ctx._send_chan._state
+            if msg_buffer_size and state.max_buffer_size != msg_buffer_size:
+                state.max_buffer_size = msg_buffer_size
+
         except KeyError:
             send_chan: trio.MemorySendChannel
             recv_chan: trio.MemoryReceiveChannel
-            send_chan, recv_chan = trio.open_memory_channel(max_buffer_size)
+            send_chan, recv_chan = trio.open_memory_channel(
+                msg_buffer_size or self.msg_buffer_size)
             ctx = Context(
                 chan,
                 cid,
@@ -679,7 +730,8 @@
         chan: Channel,
         ns: str,
         func: str,
-        kwargs: dict
+        kwargs: dict,
+        msg_buffer_size: Optional[int] = None,

     ) -> Context:
         '''
@@ -693,7 +745,7 @@
         '''
         cid = str(uuid.uuid4())
         assert chan.uid
-        ctx = self.get_context(chan, cid)
+        ctx = self.get_context(chan, cid, msg_buffer_size=msg_buffer_size)
         log.runtime(f"Sending cmd to {chan.uid}: {ns}.{func}({kwargs})")
         await chan.send({'cmd': (ns, func, kwargs, self.uid, cid)})
@@ -743,7 +795,8 @@
             if msg is None:
                 # loop terminate sentinel
                 log.cancel(
-                    f"Cancelling all tasks for {chan} from {chan.uid}")
+                    f"Channel to {chan.uid} terminated?\n"
+                    "Cancelling all associated tasks..")

                 for (channel, cid) in self._rpc_tasks.copy():
                     if channel is chan: