Compare commits


10 Commits

Author SHA1 Message Date
Tyler Goodlet 534277daa5 Add back in async gen loop 2022-02-16 13:59:23 -05:00
Tyler Goodlet 8d77fa91b1 Pre-declare disconnected flag 2022-02-16 13:09:24 -05:00
Tyler Goodlet a6884e32cf Avoid attr error XD 2022-02-16 13:07:21 -05:00
Tyler Goodlet 266b0053dc Type annot updates 2022-02-15 09:11:12 -05:00
Tyler Goodlet 81aa12b46d Drop unneeded backframe traceback hide annotation 2022-02-15 09:09:35 -05:00
Tyler Goodlet f4af2b9fda Run first soft wait inside a task
Theoretically we can actually concurrently soft wait the process and the
nursery exit event; the only problem at the moment is some pubsub tests
puking. You're right in guessing this isn't really changing anything but
it is meant to be a reminder. If we can add this, the spawn task can
report when a process dies earlier than expected, and in the longer term
once we remove the `ActorNursery.run_in_actor()` API, we can probably do
away with both the nursery exit event and the portal result fetching.
2022-02-15 09:09:35 -05:00
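
The commit above describes the concurrency pattern only in prose; a minimal sketch (hypothetical helper names, not the actual `_spawn.py` code) of soft-waiting the process in its own task while the parent task waits on the nursery exit event might look like:

import trio

async def soft_wait_and_report(
    proc: trio.Process,
    n_exited: trio.Event,
    uid,
):
    # "soft" (cancellable) wait on the child process itself
    await proc.wait()
    if not n_exited.is_set():
        # the process died before the actor nursery exited, which may
        # indicate an IPC/transport failure worth reporting to the user
        print(f'Process for actor {uid} terminated before nursery exit!')

async def reap(
    proc: trio.Process,
    n_exited: trio.Event,
    uid,
):
    async with trio.open_nursery() as nursery:
        # concurrently soft-wait the process in its own task ...
        nursery.start_soon(soft_wait_and_report, proc, n_exited, uid)
        # ... while this task blocks on the nursery exit event
        await n_exited.wait()
        nursery.cancel_scope.cancel()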
Tyler Goodlet b8117dad2a Make `Actor._process_messages()` report disconnects
The method now returns a `bool` to the caller which flags whether the
transport died, and allows for reporting a disconnect in the
channel-transport handler task. This is something a user will normally
want to know about on the caller side, especially after seeing
a traceback from the peer (if in tree) on the console.
2022-02-15 09:09:33 -05:00
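
A simplified sketch of that calling pattern (not tractor's real signatures; `process_messages()` here is a stand-in for `Actor._process_messages()`):

import trio

async def process_messages(chan) -> bool:
    # drain the channel and dispatch msgs, reporting transport loss
    try:
        async for msg in chan:  # assumes ``chan`` is an async iterable
            ...  # dispatch RPC requests/responses here
    except trio.BrokenResourceError:
        # transport **was** disconnected
        return True
    # loop ended normally (e.g. a terminate sentinel was received)
    return False

async def handle_channel(chan, uid):
    disconnected: bool = False
    try:
        disconnected = await process_messages(chan)
    finally:
        if disconnected:
            # surface the abrupt IPC loss on the caller side
            print(f'IPC connection to {uid} broke unexpectedly!')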
Tyler Goodlet a80591b914 Only cancel/get-result from a ctx if transport is up
There's no point in sending a cancel message to the remote linked task
and especially no reason to block waiting on a result from that task if
the transport layer is detected to be disconnected. We expect that the
transport shouldn't go down at the layer of the message loop
(reconnection logic should be handled in the transport layer itself),
so if we detect the channel is not connected we don't bother requesting
cancels or waiting on a final result message.

Why?

- if the connection goes down in error the caller side won't have a way
  to know "how long" it should block waiting for a cancel ack or result,
  which causes a potential hang that may require an additional ctrl-c from
  the user, especially if using the debugger or if the traceback is not
  seen on the console.
- obviously there's no point in waiting for messages when there's no
  transport to deliver them XD

Further, add more detailed cancel logging which includes the task and
actor ids.
2022-02-15 09:08:50 -05:00
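
Condensed, the guard amounts to something like the following sketch (using names visible in the `_portal.py` hunk further down; in the actual change the cancel request lives in the error handler and the result fetch in the `finally` block):

async def finalize_context(ctx, uid, cid, log):
    # only interact with the remote task if the transport is still up
    if ctx.chan.connected():
        log.cancel(f'Sending cancel request to task {cid} on actor {uid}')
        await ctx.cancel()
        # blocking on the final result only makes sense while the
        # transport can still deliver it
        return await ctx.result()
    # with no transport, neither a cancel ack nor a result can ever
    # arrive, so don't block on either
    log.warning(f'IPC connection for task {cid} on actor {uid} is broken?')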
Tyler Goodlet 52ad597c20 Drop high log level in ctx example 2022-02-15 09:08:50 -05:00
Tyler Goodlet 4973deb55f Typing fixes, simplify `_set_trace()` 2022-02-15 09:08:50 -05:00
6 changed files with 176 additions and 91 deletions

View File

@ -6,6 +6,7 @@ async def gen():
yield 'yo'
await tractor.breakpoint()
yield 'yo'
await tractor.breakpoint()
@tractor.context
@ -13,35 +14,35 @@ async def just_bp(
ctx: tractor.Context,
) -> None:
await ctx.started('yo bpin here')
await ctx.started()
await tractor.breakpoint()
# async for val in gen():
# print(val)
# TODO: bps and errors in this call..
async for val in gen():
print(val)
await trio.sleep(0.5)
# await trio.sleep(0.5)
# THIS CAUSES AN UNRECOVERABLE HANG!?
# prematurely destroy the connection
await ctx.chan.aclose()
# THIS CAUSES AN UNRECOVERABLE HANG
# without latest ``pdbpp``:
assert 0
async def main():
async with tractor.open_nursery(
loglevel='transport',
debug_mode=True,
) as n:
p = await n.start_actor(
'bp_boi',
enable_modules=[__name__],
# debug_mode=True,
)
async with p.open_context(
just_bp,
) as (ctx, first):
# await tractor.breakpoint()
# breakpoint()
await trio.sleep_forever()

View File

@ -26,8 +26,11 @@ import importlib
import importlib.util
import inspect
import uuid
import typing
from typing import Any, Optional, Union
from typing import (
Any, Optional,
Union, TYPE_CHECKING,
Callable,
)
from types import ModuleType
import sys
import os
@ -57,6 +60,10 @@ from . import _state
from . import _mp_fixup_main
if TYPE_CHECKING:
from ._supervise import ActorNursery
log = get_logger('tractor')
@ -65,7 +72,7 @@ async def _invoke(
actor: 'Actor',
cid: str,
chan: Channel,
func: typing.Callable,
func: Callable,
kwargs: dict[str, Any],
is_rpc: bool = True,
@ -200,7 +207,7 @@ async def _invoke(
ctx = actor._contexts.pop((chan.uid, cid))
if ctx:
log.runtime(
f'Context entrypoint for {func} was terminated:\n{ctx}'
f'Context entrypoint {func} was terminated:\n{ctx}'
)
assert cs
@ -316,7 +323,9 @@ async def try_ship_error_to_parent(
trio.ClosedResourceError,
trio.BrokenResourceError,
):
log.error(
# in SC terms this is one of the worst things that can
# happen and creates the 2-general's dilemma.
log.critical(
f"Failed to ship error to parent "
f"{channel.uid}, channel was closed"
)
@ -424,7 +433,7 @@ class Actor:
# (chan, cid) -> (cancel_scope, func)
self._rpc_tasks: dict[
tuple[Channel, str],
tuple[trio.CancelScope, typing.Callable, trio.Event]
tuple[trio.CancelScope, Callable, trio.Event]
] = {}
# map {actor uids -> Context}
@ -513,6 +522,7 @@ class Actor:
self._no_more_peers = trio.Event() # unset
chan = Channel.from_stream(stream)
uid: Optional[tuple[str, str]] = chan.uid
log.runtime(f"New connection to us {chan}")
# send/receive initial handshake response
@ -560,33 +570,51 @@ class Actor:
# append new channel
self._peers[uid].append(chan)
local_nursery: Optional[ActorNursery] = None # noqa
disconnected: bool = False
# Begin channel management - respond to remote requests and
# process received reponses.
try:
await self._process_messages(chan)
disconnected = await self._process_messages(chan)
except trio.Cancelled:
except (
trio.Cancelled,
):
log.cancel(f"Msg loop was cancelled for {chan}")
raise
finally:
local_nursery = self._actoruid2nursery.get(uid, local_nursery)
# This is set in ``Portal.cancel_actor()``. So if
# the peer was cancelled we try to wait for them
# to tear down their side of the connection before
# moving on with closing our own side.
local_nursery = self._actoruid2nursery.get(chan.uid)
if (
local_nursery
):
if disconnected:
# if the transport died and this actor is still
# registered within a local nursery, we report that the
# IPC layer may have failed unexpectedly since it may be
# the cause of other downstream errors.
entry = local_nursery._children.get(uid)
if entry:
_, proc, _ = entry
log.error(f'Actor {uid}@{proc} IPC connection broke!?')
# if proc.poll() is not None:
# log.error('Actor {uid} proc died and IPC broke?')
log.cancel(f"Waiting on cancel request to peer {chan.uid}")
# XXX: this is a soft wait on the channel (and its
# underlying transport protocol) to close from the remote
# peer side since we presume that any channel which
# is mapped to a sub-actor (i.e. it's managed by
# one of our local nurseries)
# message is sent to the peer likely by this actor which is
# now in a cancelled condition) when the local runtime here
# is now cancelled while (presumably) in the middle of msg
# underlying transport protocol) to close from the
# remote peer side since we presume that any channel
# which is mapped to a sub-actor (i.e. it's managed by
# one of our local nurseries) has a message is sent to
# the peer likely by this actor (which is now in
# a cancelled condition) when the local runtime here is
# now cancelled while (presumably) in the middle of msg
# loop processing.
with trio.move_on_after(0.5) as cs:
cs.shield = True
@ -609,6 +637,8 @@ class Actor:
await local_nursery.exited.wait()
# if local_nursery._children
# ``Channel`` teardown and closure sequence
# Drop ref to channel so it can be gc-ed and disconnected
@ -618,7 +648,7 @@ class Actor:
if not chans:
log.runtime(f"No more channels for {chan.uid}")
self._peers.pop(chan.uid, None)
self._peers.pop(uid, None)
# for (uid, cid) in self._contexts.copy():
# if chan.uid == uid:
@ -626,11 +656,13 @@ class Actor:
log.runtime(f"Peers is {self._peers}")
if not self._peers: # no more channels connected
# No more channels to other actors (at all) registered
# as connected.
if not self._peers:
log.runtime("Signalling no more peer channels")
self._no_more_peers.set()
# # XXX: is this necessary (GC should do it?)
# XXX: is this necessary (GC should do it)?
if chan.connected():
# if the channel is still connected it may mean the far
# end has not closed and we may have gotten here due to
@ -665,7 +697,7 @@ class Actor:
ctx = self._contexts[(uid, cid)]
except KeyError:
log.warning(
f'Ignoring msg from [no-longer/un]known context with {uid}:'
f'Ignoring msg from [no-longer/un]known context {uid}:'
f'\n{msg}')
return
@ -813,7 +845,7 @@ class Actor:
shield: bool = False,
task_status: TaskStatus[trio.CancelScope] = trio.TASK_STATUS_IGNORED,
) -> None:
) -> bool:
'''
Process messages for the channel async-RPC style.
@ -839,7 +871,7 @@ class Actor:
if msg is None: # loop terminate sentinel
log.cancel(
f"Channerl to {chan.uid} terminated?\n"
f"Channel to {chan.uid} terminated?\n"
"Cancelling all associated tasks..")
for (channel, cid) in self._rpc_tasks.copy():
@ -986,6 +1018,9 @@ class Actor:
# up.
log.runtime(f'channel from {chan.uid} closed abruptly:\n{chan}')
# transport **was** disconnected
return True
except (Exception, trio.MultiError) as err:
if nursery_cancelled_before_task:
sn = self._service_n
@ -1010,6 +1045,9 @@ class Actor:
f"Exiting msg loop for {chan} from {chan.uid} "
f"with last msg:\n{msg}")
# transport **was not** disconnected
return False
async def _from_parent(
self,
parent_addr: Optional[tuple[str, int]],

View File

@ -31,6 +31,7 @@ from typing import (
AsyncIterator,
AsyncGenerator,
)
from types import FrameType
import tractor
import trio
@ -48,7 +49,8 @@ try:
except ImportError:
# pdbpp is installed in regular mode...it monkey patches stuff
import pdb
assert pdb.xpm, "pdbpp is not installed?" # type: ignore
xpm = getattr(pdb, 'xpm', None)
assert xpm, "pdbpp is not installed?" # type: ignore
pdbpp = pdb
log = get_logger(__name__)
@ -258,16 +260,11 @@ async def _hijack_stdin_for_child(
orig_handler = signal.signal(
signal.SIGINT,
shield_sigint,
# partial(shield_sigint, pdb=pdb),
)
# try:
# yield
try:
with (
trio.CancelScope(shield=True),
# disable_sigint(),
):
try:
lock = None
async with _acquire_debug_lock(subactor_uid) as lock:
@ -380,7 +377,7 @@ async def wait_for_parent_stdin_hijack(
log.debug(f"Child {actor_uid} released parent stdio lock")
def mk_mpdb() -> (MultiActorPdb, Callable):
def mk_mpdb() -> tuple[MultiActorPdb, Callable]:
pdb = MultiActorPdb()
signal.signal = pdbpp.hideframe(signal.signal)
@ -534,9 +531,10 @@ async def _breakpoint(
_pdb_release_hook = teardown
frame = sys._getframe()
last_f = frame.f_back
last_f.f_globals['__tracebackhide__'] = True
# frame = sys._getframe()
# last_f = frame.f_back
# last_f.f_globals['__tracebackhide__'] = True
try:
# block here one (at the appropriate frame *up*) where
# ``breakpoint()`` was awaited and begin handling stdio.
@ -582,10 +580,6 @@ def shield_sigint(
'''
__tracebackhide__ = True
frame = sys._getframe()
last_f = frame.f_back
last_f.f_globals['__tracebackhide__'] = True
global _local_task_in_debug, _global_actor_in_debug
in_debug = _global_actor_in_debug
@ -602,6 +596,7 @@ def shield_sigint(
log.pdb(
f"Ignoring SIGINT while child in debug mode: `{in_debug}`"
)
else:
log.pdb(
"Ignoring SIGINT while in debug mode"
@ -658,24 +653,25 @@ def shield_sigint(
def _set_trace(
actor: Optional[tractor.Actor] = None,
actor: Optional[tractor._actor.Actor] = None,
pdb: Optional[MultiActorPdb] = None,
):
__tracebackhide__ = True
actor = actor or tractor.current_actor()
# XXX: on latest ``pdbpp`` i guess we don't need this?
# frame = sys._getframe()
# last_f = frame.f_back
# last_f.f_globals['__tracebackhide__'] = True
# start 2 levels up in user code
frame: FrameType = sys._getframe()
if frame:
frame = frame.f_back.f_back # type: ignore
if pdb and actor is not None:
log.pdb(f"\nAttaching pdb to actor: {actor.uid}\n")
pdb.set_trace(
# start 2 levels up in user code
frame=sys._getframe().f_back.f_back,
)
else:
pdb, undo_sigint = mk_mpdb()
@ -683,12 +679,7 @@ def _set_trace(
global _local_task_in_debug, _pdb_release_hook
_local_task_in_debug = 'sync'
_pdb_release_hook = undo_sigint
pdb.set_trace(
# start 2 levels up in user code
frame=sys._getframe().f_back.f_back,
)
pdb.set_trace(frame=frame)
breakpoint = partial(
@ -698,7 +689,7 @@ breakpoint = partial(
def _post_mortem(
actor: tractor.Actor,
actor: tractor._actor.Actor,
pdb: MultiActorPdb,
) -> None:

View File

@ -24,7 +24,8 @@ import importlib
import inspect
from typing import (
Any, Optional,
Callable, AsyncGenerator
Callable, AsyncGenerator,
Type,
)
from functools import partial
from dataclasses import dataclass
@ -442,6 +443,10 @@ class Portal:
_err: Optional[BaseException] = None
ctx._portal = self
uid = self.channel.uid
cid = ctx.cid
etype: Optional[Type[BaseException]] = None
# deliver context instance and .started() msg value in open tuple.
try:
async with trio.open_nursery() as scope_nursery:
@ -477,13 +482,24 @@ class Portal:
# KeyboardInterrupt,
) as err:
_err = err
etype = type(err)
# the context cancels itself on any cancel
# causing error.
log.cancel(
f'Context to {self.channel.uid} sending cancel request..')
if ctx.chan.connected():
log.cancel(
'Context cancelled for task, sending cancel request..\n'
f'task:{cid}\n'
f'actor:{uid}'
)
await ctx.cancel()
else:
log.warning(
'IPC connection for context is broken?\n'
f'task:{cid}\n'
f'actor:{uid}'
)
raise
finally:
@ -492,6 +508,12 @@ class Portal:
# sure we get the error the underlying feeder mem chan.
# if it's not raised here it *should* be raised from the
# msg loop nursery right?
if ctx.chan.connected():
log.info(
'Waiting on final context-task result for\n'
f'task:{cid}\n'
f'actor:{uid}'
)
result = await ctx.result()
# though it should be impossible for any tasks
@ -502,14 +524,17 @@ class Portal:
# should we encapsulate this in the context api?
await ctx._recv_chan.aclose()
if _err:
if etype:
if ctx._cancel_called:
log.cancel(
f'Context {fn_name} cancelled by caller with\n{_err}'
f'Context {fn_name} cancelled by caller with\n{etype}'
)
elif _err is not None:
log.cancel(
f'Context {fn_name} cancelled by callee with\n{_err}'
f'Context for task cancelled by callee with {etype}\n'
f'target: `{fn_name}`\n'
f'task:{cid}\n'
f'actor:{uid}'
)
else:
log.runtime(

View File

@ -295,7 +295,7 @@ async def new_proc(
# the OS; it otherwise can be passed via the parent channel if
# we prefer in the future (for privacy).
"--uid",
str(subactor.uid),
str(uid),
# Address the child must connect to on startup
"--parent_addr",
str(parent_addr)
@ -321,8 +321,7 @@ async def new_proc(
# wait for actor to spawn and connect back to us
# channel should have handshake completed by the
# local actor by the time we get a ref to it
event, chan = await actor_nursery._actor.wait_for_peer(
subactor.uid)
event, chan = await actor_nursery._actor.wait_for_peer(uid)
except trio.Cancelled:
cancelled_during_spawn = True
@ -363,18 +362,13 @@ async def new_proc(
task_status.started(portal)
# wait for ActorNursery.wait() to be called
n_exited = actor_nursery._join_procs
with trio.CancelScope(shield=True):
await actor_nursery._join_procs.wait()
await n_exited.wait()
async with trio.open_nursery() as nursery:
if portal in actor_nursery._cancel_after_result_on_exit:
nursery.start_soon(
cancel_on_completion,
portal,
subactor,
errors
)
async def soft_wait_and_maybe_cancel_ria_task():
# This is a "soft" (cancellable) join/reap which
# will remote cancel the actor on a ``trio.Cancelled``
# condition.
@ -384,13 +378,46 @@ async def new_proc(
portal
)
if n_exited.is_set():
# cancel result waiter that may have been spawned in
# tandem if not done already
log.warning(
"Cancelling existing result waiter task for "
f"{subactor.uid}")
f"{subactor.uid}"
)
nursery.cancel_scope.cancel()
else:
log.warning(
f'Process for actor {uid} terminated before'
'nursery exit. ' 'This may mean an IPC'
'connection failed!'
)
nursery.start_soon(soft_wait_and_maybe_cancel_ria_task)
# TODO: when we finally remove the `.run_in_actor()` api
# we should be able to entirely drop these 2 blocking calls:
# - we don't need to wait on nursery exit to capture
# process-spawn-machinery level errors (and propagate them).
# - we don't need to wait on final results from ria portals
# since this will be done in some higher level wrapper API.
# XXX: interestingly we can't put this here bc it causes
# the pub-sub tests to fail? wth.. should probably drop
# those XD
# wait for ActorNursery.wait() to be called
# with trio.CancelScope(shield=True):
# await n_exited.wait()
if portal in actor_nursery._cancel_after_result_on_exit:
nursery.start_soon(
cancel_on_completion,
portal,
subactor,
errors
)
finally:
# The "hard" reap since no actor zombies are allowed!
# XXX: do this **after** cancellation/tearfown to avoid
@ -407,8 +434,10 @@ async def new_proc(
await proc.wait()
if is_root_process():
await maybe_wait_for_debugger(
child_in_debug=_runtime_vars.get('_debug_mode', False),
child_in_debug=_runtime_vars.get(
'_debug_mode', False)
)
if proc.poll() is None:

View File

@ -604,7 +604,8 @@ class Context:
self._portal._streams.remove(rchan)
async def result(self) -> Any:
'''From a caller side, wait for and return the final result from
'''
From a caller side, wait for and return the final result from
the callee side task.
'''
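
For reference, a hedged caller-side usage sketch (the `echo` callee and the actor name below are made up for illustration; the API calls mirror the example file and `_portal.py` hunks above):

import tractor
import trio

@tractor.context
async def echo(ctx: tractor.Context) -> str:
    # callee side: signal readiness then return a final value
    await ctx.started('ready')
    return 'final result'

async def main():
    async with tractor.open_nursery() as n:
        portal = await n.start_actor('demo', enable_modules=[__name__])
        async with portal.open_context(echo) as (ctx, first):
            assert first == 'ready'  # value passed to ``ctx.started()``
        # on block exit ``Portal.open_context()`` awaits ``ctx.result()``
        # (transport permitting, per the hunk above)
        await portal.cancel_actor()

if __name__ == '__main__':
    trio.run(main)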