to_asyncio: mask error logging, not sure it adds that much

Always no-raise try-to-pop registry addrs
Add stale entry deleted from registrar test
2023-09-26 10:32:01 -04:00 · 2023-09-15 14:20:12 -04:00 · 2023-08-28 12:20:12 -04:00 · 2023-08-28 11:26:36 -04:00 · 2023-08-21 19:07:14 -04:00
7 changed files with 182 additions and 57 deletions
--- a/setup.py
+++ b/setup.py
@ -41,6 +41,9 @@ setup(
    ],
    install_requires=[

+        # discovery subsys
+        'bidict',
+
        # trio related
        # proper range spec:
        # https://packaging.python.org/en/latest/discussions/install-requires-vs-requirements/#id5
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -219,7 +219,8 @@ def daemon(
    arb_addr: tuple[str, int],
 ):
    '''
-    Run a daemon actor as a "remote arbiter".
+    Run a daemon actor as a "remote registrar" and/or plain ol
+    separate actor (service) tree.

    '''
    if loglevel in ('trace', 'debug'):
--- a/tests/test_discovery.py
+++ b/tests/test_discovery.py
@ -1,6 +1,7 @@
-"""
-Actor "discovery" testing
-"""
+'''
+Discovery subsystem via a "registrar" actor scenarios.
+
+'''
 import os
 import signal
 import platform
@ -127,7 +128,10 @@ async def unpack_reg(actor_or_portal):
    else:
        msg = await actor_or_portal.run_from_ns('self', 'get_registry')

-    return {tuple(key.split('.')): val for key, val in msg.items()}
+    return {
+        tuple(key.split('.')): val
+        for key, val in msg.items()
+    }


 async def spawn_and_check_registry(
@ -283,37 +287,41 @@ async def close_chans_before_nursery(

                async with tractor.open_nursery() as tn:
                    portal1 = await tn.start_actor(
-                        name='consumer1', enable_modules=[__name__])
+                        name='consumer1',
+                        enable_modules=[__name__],
+                    )
                    portal2 = await tn.start_actor(
-                        'consumer2', enable_modules=[__name__])
+                        'consumer2',
+                        enable_modules=[__name__],
+                    )

-                    # TODO: compact this back as was in last commit once
-                    # 3.9+, see https://github.com/goodboy/tractor/issues/207
-                    async with portal1.open_stream_from(
-                        stream_forever
-                    ) as agen1:
-                        async with portal2.open_stream_from(
+                    async with (
+                        portal1.open_stream_from(
                            stream_forever
-                        ) as agen2:
-                            async with trio.open_nursery() as n:
-                                n.start_soon(streamer, agen1)
-                                n.start_soon(cancel, use_signal, .5)
-                                try:
-                                    await streamer(agen2)
-                                finally:
-                                    # Kill the root nursery thus resulting in
-                                    # normal arbiter channel ops to fail during
-                                    # teardown. It doesn't seem like this is
-                                    # reliably triggered by an external SIGINT.
-                                    # tractor.current_actor()._root_nursery.cancel_scope.cancel()
+                        ) as agen1,
+                        portal2.open_stream_from(
+                            stream_forever
+                        ) as agen2,
+                    ):
+                        async with trio.open_nursery() as n:
+                            n.start_soon(streamer, agen1)
+                            n.start_soon(cancel, use_signal, .5)
+                            try:
+                                await streamer(agen2)
+                            finally:
+                                # Kill the root nursery thus resulting in
+                                # normal arbiter channel ops to fail during
+                                # teardown. It doesn't seem like this is
+                                # reliably triggered by an external SIGINT.
+                                # tractor.current_actor()._root_nursery.cancel_scope.cancel()

-                                    # XXX: THIS IS THE KEY THING that
-                                    # happens **before** exiting the
-                                    # actor nursery block
+                                # XXX: THIS IS THE KEY THING that
+                                # happens **before** exiting the
+                                # actor nursery block

-                                    # also kill off channels cuz why not
-                                    await agen1.aclose()
-                                    await agen2.aclose()
+                                # also kill off channels cuz why not
+                                await agen1.aclose()
+                                await agen2.aclose()
            finally:
                with trio.CancelScope(shield=True):
                    await trio.sleep(1)
@ -331,10 +339,12 @@ def test_close_channel_explicit(
    use_signal,
    arb_addr,
 ):
-    """Verify that closing a stream explicitly and killing the actor's
+    '''
+    Verify that closing a stream explicitly and killing the actor's
    "root nursery" **before** the containing nursery tears down also
    results in subactor(s) deregistering from the arbiter.
-    """
+
+    '''
    with pytest.raises(KeyboardInterrupt):
        trio.run(
            partial(
@ -347,16 +357,18 @@ def test_close_channel_explicit(


@pytest.mark.parametrize('use_signal', [False, True])
-def test_close_channel_explicit_remote_arbiter(
+def test_close_channel_explicit_remote_registrar(
    daemon,
    start_method,
    use_signal,
    arb_addr,
 ):
-    """Verify that closing a stream explicitly and killing the actor's
+    '''
+    Verify that closing a stream explicitly and killing the actor's
    "root nursery" **before** the containing nursery tears down also
    results in subactor(s) deregistering from the arbiter.
-    """
+
+    '''
    with pytest.raises(KeyboardInterrupt):
        trio.run(
            partial(
@ -366,3 +378,51 @@ def test_close_channel_explicit_remote_arbiter(
                remote_arbiter=True,
            ),
        )
+
+
+@tractor.context
+async def kill_transport(
+    ctx: tractor.Context,
+) -> None:
+
+    await ctx.started()
+    actor: tractor.Actor = tractor.current_actor()
+    actor.cancel_server()
+    await trio.sleep_forever()
+
+
+
+# @pytest.mark.parametrize('use_signal', [False, True])
+def test_stale_entry_is_deleted(
+    daemon,
+    start_method,
+    arb_addr,
+):
+    '''
+    Ensure that when a stale entry is detected in the registrar's table
+    that the `find_actor()` API takes care of deleting the stale entry
+    and not delivering a bad portal.
+
+    '''
+    async def main():
+
+        name: str = 'transport_fails_actor'
+        regport: tractor.Portal
+        tn: tractor.ActorNursery
+        async with (
+            tractor.open_nursery() as tn,
+            tractor.get_registrar(*arb_addr) as regport,
+        ):
+            ptl: tractor.Portal = await tn.start_actor(
+                name,
+                enable_modules=[__name__],
+            )
+            async with ptl.open_context(
+                kill_transport,
+            ) as (first, ctx):
+                async with tractor.find_actor(name) as maybe_portal:
+                    assert maybe_portal is None
+
+                await ptl.cancel_actor()
+
+    trio.run(main)
--- a/tractor/init.py
+++ b/tractor/init.py
@ -31,6 +31,7 @@ from ._streaming import (
 )
 from ._discovery import (
    get_arbiter,
+    get_registrar,
    find_actor,
    wait_for_actor,
    query_actor,
@ -77,6 +78,7 @@ __all__ = [
    'find_actor',
    'query_actor',
    'get_arbiter',
+    'get_registrar',
    'is_root_process',
    'msg',
    'open_actor_cluster',
--- a/tractor/_discovery.py
+++ b/tractor/_discovery.py
@ -35,7 +35,7 @@ from ._state import current_actor, _runtime_vars


@acm
-async def get_arbiter(
+async def get_registrar(

    host: str,
    port: int,
@ -56,11 +56,14 @@ async def get_arbiter(
        # (likely a re-entrant call from the arbiter actor)
        yield LocalPortal(actor, Channel((host, port)))
    else:
-        async with _connect_chan(host, port) as chan:
+        async with (
+            _connect_chan(host, port) as chan,
+            open_portal(chan) as arb_portal,
+        ):
+            yield arb_portal

-            async with open_portal(chan) as arb_portal:

-                yield arb_portal
+get_arbiter = get_registrar


@acm
@ -101,7 +104,10 @@ async def query_actor(

        # TODO: return portals to all available actors - for now just
        # the last one that registered
-        if name == 'arbiter' and actor.is_arbiter:
+        if (
+            name == 'arbiter'
+            and actor.is_arbiter
+        ):
            raise RuntimeError("The current actor is the arbiter")

        yield sockaddr if sockaddr else None
@ -112,7 +118,7 @@ async def find_actor(
    name: str,
    arbiter_sockaddr: tuple[str, int] | None = None

-) -> AsyncGenerator[Optional[Portal], None]:
+) -> AsyncGenerator[Portal | None, None]:
    '''
    Ask the arbiter to find actor(s) by name.

@ -120,17 +126,49 @@ async def find_actor(
    known to the arbiter.

    '''
-    async with query_actor(
-        name=name,
-        arbiter_sockaddr=arbiter_sockaddr,
-    ) as sockaddr:
+    actor = current_actor()
+    async with get_arbiter(
+        *arbiter_sockaddr or actor._arb_addr
+    ) as arb_portal:
+
+        sockaddr = await arb_portal.run_from_ns(
+            'self',
+            'find_actor',
+            name=name,
+        )
+
+        # TODO: return portals to all available actors - for now just
+        # the last one that registered
+        if (
+            name == 'arbiter'
+            and actor.is_arbiter
+        ):
+            raise RuntimeError("The current actor is the arbiter")

        if sockaddr:
-            async with _connect_chan(*sockaddr) as chan:
-                async with open_portal(chan) as portal:
-                    yield portal
-        else:
-            yield None
+            try:
+                async with _connect_chan(*sockaddr) as chan:
+                    async with open_portal(chan) as portal:
+                        yield portal
+                        return
+
+            # most likely we were unable to connect the
+            # transport and there is likely a stale entry in
+            # the registry actor's table, thus we need to
+            # instruct it to clear that stale entry and then
+            # more silently (pretend there was no reason but
+            # to) indicate that the target actor can't be
+            # contacted at that addr.
+            except OSError:
+                # NOTE: ensure we delete the stale entry from the
+                # registar actor.
+                uid: tuple[str, str] = await arb_portal.run_from_ns(
+                    'self',
+                    'delete_sockaddr',
+                    sockaddr=sockaddr,
+                )
+
+        yield None


@acm
--- a/tractor/_runtime.py
+++ b/tractor/_runtime.py
@ -40,6 +40,7 @@ from contextlib import ExitStack
 import warnings

 from async_generator import aclosing
+from bidict import bidict
 from exceptiongroup import BaseExceptionGroup
 import trio  # type: ignore
 from trio_typing import TaskStatus
@ -1774,10 +1775,10 @@ class Arbiter(Actor):

    def __init__(self, *args, **kwargs) -> None:

-        self._registry: dict[
+        self._registry: bidict[
            tuple[str, str],
            tuple[str, int],
-        ] = {}
+        ] = bidict({})
        self._waiters: dict[
            str,
            # either an event to sync to receiving an actor uid (which
@ -1871,3 +1872,23 @@ class Arbiter(Actor):
        entry: tuple = self._registry.pop(uid, None)
        if entry is None:
            log.warning(f'Request to de-register {uid} failed?')
+
+
+    async def delete_sockaddr(
+        self,
+        sockaddr: tuple[str, int],
+
+    ) -> tuple[str, str]:
+        uid: tuple | None = self._registry.inverse.pop(
+            sockaddr,
+            None,
+        )
+        if uid:
+            log.warning(
+                f'Deleting registry entry for {sockaddr}@{uid}!'
+            )
+        else:
+            log.warning(
+                f'No registry entry for {sockaddr}@{uid}!'
+            )
+        return uid
--- a/tractor/to_asyncio.py
+++ b/tractor/to_asyncio.py
@ -216,7 +216,7 @@ def _run_asyncio_task(
        try:
            result = await coro
        except BaseException as aio_err:
-            log.exception('asyncio task errored')
+            # log.exception('asyncio task errored:')
            chan._aio_err = aio_err
            raise

@ -300,7 +300,7 @@ def _run_asyncio_task(
            elif task_err is None:
                assert aio_err
                aio_err.with_traceback(aio_err.__traceback__)
-                log.error('infected task errorred')
+                # log.error('infected task errorred')

            # XXX: alway cancel the scope on error
            # in case the trio task is blocking
@ -356,7 +356,7 @@ async def translate_aio_errors(
        # relay cancel through to called ``asyncio`` task
        assert chan._aio_task
        chan._aio_task.cancel(
-            msg=f'the `trio` caller task was cancelled: {trio_task.name}'
+            msg=f'`trio`-side caller task cancelled: {trio_task.name}'
        )
        raise

@ -366,7 +366,7 @@ async def translate_aio_errors(
        trio.ClosedResourceError,
        # trio.BrokenResourceError,
    ):
-        aio_err = chan._aio_err
+        aio_err: BaseException = chan._aio_err
        if (
            task.cancelled() and
            type(aio_err) is CancelledError
Author	SHA1	Message	Date
Tyler Goodlet	3a31c9d338	to_asyncio: mask error logging, not sure it adds that much	2023-09-26 10:32:01 -04:00
Tyler Goodlet	3dc57e384e	Always no-raise try-to-pop registry addrs	2023-09-15 14:20:12 -04:00
Tyler Goodlet	687852f368	Add stale entry deleted from registrar test By spawning an actor task that immediately shuts down the transport server and then sleeps, verify that attempting to connect via the `._discovery.find_actor()` helper delivers `None` for the `Portal` value. Relates to #184 and #216	2023-08-28 12:20:12 -04:00
Tyler Goodlet	d83d991f21	Handle stale registrar entries; detect and delete In cases where an actor's transport server task (by default handling new TCP connections) terminates early but does not de-register from the pertaining registry (aka the registrar) actor's address table, the trying-to-connect client actor will get a connection error on that address. In the case where client handles a (local) `OSError` (meaning the target actor address is likely being contacted over `localhost`) exception, make a further call to the registrar to delete the stale entry and `yield None` gracefully indicating to calling code that no `Portal` can be delivered to the target address. This issue was originally discovered in `piker` where the `emsd` (clearing engine) actor would sometimes crash on rapid client re-connects and then leave a `pikerd` stale entry. With this fix new clients will attempt connect via an endpoint which will re-spawn the `emsd` when a `None` portal is delivered (via `maybe_spawn_em()`).	2023-08-28 11:26:36 -04:00
Tyler Goodlet	1cf712cfac	Add `Arbiter.delete_sockaddr()` to remove addrs Since stale addrs can be leaked where the actor transport server task crashes but doesn't (successfully) unregister from the registrar, we need a remote way to remove such entries; hence this new (registrar) method. To implement this make use of the `bidict` lib for the `._registry` table thus making it super simple to do reverse uuid lookups from an input socket-address.	2023-08-21 19:07:14 -04:00