Bring in pretty-ified `msgspec.Struct` extension

Originally designed and used throughout `piker`, the subtype adds pretty-printing and field-diffing extras that are handy when viewing struct types in logging or REPL console interfaces. This also reworks the `tractor.msg` module into a sub-package and moves the existing namespace object-pointer code into a new `.msg.ptr` submodule.
Never mask original `KeyError` in portal-error unwrapper, for now?
2024-01-28 16:33:10 -05:00 · 2024-01-23 11:14:10 -05:00 · 2024-01-23 11:13:07 -05:00 · 2024-01-23 11:09:38 -05:00 · 2024-01-03 22:27:05 -05:00 · 2024-01-02 18:43:43 -05:00
51 changed files with 6255 additions and 1814 deletions
--- a/docs/README.rst
+++ b/docs/README.rst
@ -3,8 +3,8 @@
 |gh_actions|
 |docs|

-``tractor`` is a `structured concurrent`_, multi-processing_ runtime
-built on trio_.
+``tractor`` is a `structured concurrent`_, (optionally
+distributed_) multi-processing_ runtime built on trio_.

 Fundamentally, ``tractor`` gives you parallelism via
 ``trio``-"*actors*": independent Python processes (aka
@ -17,11 +17,20 @@ protocol" constructed on top of multiple Pythons each running a ``trio``
 scheduled runtime - a call to ``trio.run()``.

 We believe the system adheres to the `3 axioms`_ of an "`actor model`_"
-but likely *does not* look like what *you* probably think an "actor
-model" looks like, and that's *intentional*.
+but likely **does not** look like what **you** probably *think* an "actor
+model" looks like, and that's **intentional**.

-The first step to grok ``tractor`` is to get the basics of ``trio`` down.
-A great place to start is the `trio docs`_ and this `blog post`_.
+
+Where do i start!?
+------------------
+The first step to grok ``tractor`` is to get an intermediate
+knowledge of ``trio`` and **structured concurrency** B)
+
+Some great places to start are,
+- the seminal `blog post`_
+- obviously the `trio docs`_
+- wikipedia's nascent SC_ page
+- the fancy diagrams @ libdill-docs_


 Features
@ -593,6 +602,7 @@ matrix seems too hip, we're also mostly all in the the `trio gitter
 channel`_!

 .. _structured concurrent: https://trio.discourse.group/t/concise-definition-of-structured-concurrency/228
+.. _distributed: https://en.wikipedia.org/wiki/Distributed_computing
 .. _multi-processing: https://en.wikipedia.org/wiki/Multiprocessing
 .. _trio: https://github.com/python-trio/trio
 .. _nurseries: https://vorpus.org/blog/notes-on-structured-concurrency-or-go-statement-considered-harmful/#nurseries-a-structured-replacement-for-go-statements
@ -611,8 +621,9 @@ channel`_!
 .. _trio docs: https://trio.readthedocs.io/en/latest/
 .. _blog post: https://vorpus.org/blog/notes-on-structured-concurrency-or-go-statement-considered-harmful/
 .. _structured concurrency: https://en.wikipedia.org/wiki/Structured_concurrency
+.. _SC: https://en.wikipedia.org/wiki/Structured_concurrency
+.. _libdill-docs: https://sustrik.github.io/libdill/structured-concurrency.html
 .. _structured chadcurrency: https://en.wikipedia.org/wiki/Structured_concurrency
-.. _structured concurrency: https://en.wikipedia.org/wiki/Structured_concurrency
 .. _unrequirements: https://en.wikipedia.org/wiki/Actor_model#Direct_communication_and_asynchrony
 .. _async generators: https://www.python.org/dev/peps/pep-0525/
 .. _trio-parallel: https://github.com/richardsheridan/trio-parallel
--- a/examples/debugging/asyncio_bp.py
+++ b/examples/debugging/asyncio_bp.py
@ -0,0 +1,117 @@
+import asyncio
+
+import trio
+import tractor
+from tractor import to_asyncio
+
+
+async def aio_sleep_forever():
+    await asyncio.sleep(float('inf'))
+
+
+async def bp_then_error(
+    to_trio: trio.MemorySendChannel,
+    from_trio: asyncio.Queue,
+
+    raise_after_bp: bool = True,
+
+) -> None:
+
+    # sync with ``trio``-side (caller) task
+    to_trio.send_nowait('start')
+
+    # NOTE: what happens here inside the hook needs some refinement..
+    # => seems like it's still `._debug._set_trace()` but
+    #    we set `Lock.local_task_in_debug = 'sync'`, we probably want
+    #    some further, at least, meta-data about the task/actor in debug
+    #    in terms of making it clear it's asyncio mucking about.
+    breakpoint()
+
+    # short checkpoint / delay
+    await asyncio.sleep(0.5)
+
+    if raise_after_bp:
+        raise ValueError('blah')
+
+    # TODO: test case with this so that it gets cancelled?
+    else:
+        # XXX NOTE: this is required in order to get the SIGINT-ignored
+        # hang case documented in the module script section!
+        await aio_sleep_forever()
+
+
+@tractor.context
+async def trio_ctx(
+    ctx: tractor.Context,
+    bp_before_started: bool = False,
+):
+
+    # this will block until the ``asyncio`` task sends a "first"
+    # message, see first line in above func.
+    async with (
+
+        to_asyncio.open_channel_from(
+            bp_then_error,
+            raise_after_bp=not bp_before_started,
+        ) as (first, chan),
+
+        trio.open_nursery() as n,
+    ):
+
+        assert first == 'start'
+
+        if bp_before_started:
+            await tractor.breakpoint()
+
+        await ctx.started(first)
+
+        n.start_soon(
+            to_asyncio.run_task,
+            aio_sleep_forever,
+        )
+        await trio.sleep_forever()
+
+
+async def main(
+    bps_all_over: bool = False,
+
+) -> None:
+
+    async with tractor.open_nursery() as n:
+
+        p = await n.start_actor(
+            'aio_daemon',
+            enable_modules=[__name__],
+            infect_asyncio=True,
+            debug_mode=True,
+            loglevel='cancel',
+        )
+
+        async with p.open_context(
+            trio_ctx,
+            bp_before_started=bps_all_over,
+        ) as (ctx, first):
+
+            assert first == 'start'
+
+            if bps_all_over:
+                await tractor.breakpoint()
+
+            # await trio.sleep_forever()
+            await ctx.cancel()
+            assert 0
+
+        # TODO: case where we cancel from trio-side while asyncio task
+        # has debugger lock?
+        # await p.cancel_actor()
+
+
+if __name__ == '__main__':
+
+    # works fine B)
+    trio.run(main)
+
+    # will hang and ignores SIGINT !!
+    # NOTE: you'll need to send a SIGQUIT (via ctl-\) to kill it
+    # manually..
+    # trio.run(main, True)
--- a/examples/full_fledged_streaming_service.py
+++ b/examples/full_fledged_streaming_service.py
@ -65,21 +65,28 @@ async def aggregate(seed):
    print("AGGREGATOR COMPLETE!")


-# this is the main actor and *arbiter*
-async def main():
-    # a nursery which spawns "actors"
-    async with tractor.open_nursery(
-        arbiter_addr=('127.0.0.1', 1616)
-    ) as nursery:
+async def main() -> list[int]:
+    '''
+    This is the "root" actor's main task's entrypoint.
+
+    By default (and if not otherwise specified) that root process
+    also acts as a "registry actor" / "registrar" on the localhost
+    for the purposes of multi-actor "service discovery".
+
+    '''
+    # yes, a nursery which spawns `trio`-"actors" B)
+    nursery: tractor.ActorNursery
+    async with tractor.open_nursery() as nursery:

        seed = int(1e3)
        pre_start = time.time()

-        portal = await nursery.start_actor(
+        portal: tractor.Portal = await nursery.start_actor(
            name='aggregator',
            enable_modules=[__name__],
        )

+        stream: tractor.MsgStream
        async with portal.open_stream_from(
            aggregate,
            seed=seed,
--- a/requirements-test.txt
+++ b/requirements-test.txt
@ -6,3 +6,4 @@ mypy
 trio_typing
 pexpect
 towncrier
+numpy
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -29,7 +29,7 @@ def tractor_test(fn):

    If fixtures:

-        - ``arb_addr`` (a socket addr tuple where arbiter is listening)
+        - ``reg_addr`` (a socket addr tuple where the registrar is listening)
        - ``loglevel`` (logging level passed to tractor internals)
        - ``start_method`` (subprocess spawning backend)

@ -40,16 +40,16 @@ def tractor_test(fn):
    def wrapper(
        *args,
        loglevel=None,
-        arb_addr=None,
+        reg_addr=None,
        start_method=None,
        **kwargs
    ):
        # __tracebackhide__ = True

-        if 'arb_addr' in inspect.signature(fn).parameters:
+        if 'reg_addr' in inspect.signature(fn).parameters:
            # injects test suite fixture value to test as well
            # as `run()`
-            kwargs['arb_addr'] = arb_addr
+            kwargs['reg_addr'] = reg_addr

        if 'loglevel' in inspect.signature(fn).parameters:
            # allows test suites to define a 'loglevel' fixture
@ -71,7 +71,7 @@ def tractor_test(fn):
            async def _main():
                async with tractor.open_root_actor(
                    # **kwargs,
-                    arbiter_addr=arb_addr,
+                    registry_addrs=[reg_addr] if reg_addr else None,
                    loglevel=loglevel,
                    start_method=start_method,

@ -92,9 +92,6 @@ def tractor_test(fn):
    return wrapper


-_arb_addr = '127.0.0.1', random.randint(1000, 9999)
-
-
 # Sending signal.SIGINT on subprocess fails on windows. Use CTRL_* alternatives
 if platform.system() == 'Windows':
    _KILL_SIGNAL = signal.CTRL_BREAK_EVENT
@ -173,9 +170,23 @@ def ci_env() -> bool:
    return _ci_env


+# choose randomly at import time
+_reg_addr: tuple[str, int] = (
+    '127.0.0.1',
+    random.randint(1000, 9999),
+)
+
+
@pytest.fixture(scope='session')
-def arb_addr():
-    return _arb_addr
+def reg_addr() -> tuple[str, int]:
+
+    # globally override the runtime to the per-test-session-dynamic
+    # addr so that all tests never conflict with any other actor
+    # tree using the default.
+    from tractor import _root
+    _root._default_lo_addrs = [_reg_addr]
+
+    return _reg_addr


 def pytest_generate_tests(metafunc):
@ -216,30 +227,35 @@ def sig_prog(proc, sig):
 def daemon(
    loglevel: str,
    testdir,
-    arb_addr: tuple[str, int],
+    reg_addr: tuple[str, int],
 ):
    '''
-    Run a daemon actor as a "remote arbiter".
+    Run a daemon root actor as a separate actor-process tree and
+    "remote registrar" for discovery-protocol related tests.

    '''
    if loglevel in ('trace', 'debug'):
-        # too much logging will lock up the subproc (smh)
-        loglevel = 'info'
+        # XXX: too much logging will lock up the subproc (smh)
+        loglevel: str = 'info'

-    cmdargs = [
-        sys.executable, '-c',
-        "import tractor; tractor.run_daemon([], registry_addr={}, loglevel={})"
-        .format(
-            arb_addr,
-            "'{}'".format(loglevel) if loglevel else None)
+    code: str = (
+            "import tractor; "
+            "tractor.run_daemon([], registry_addrs={reg_addrs}, loglevel={ll})"
+    ).format(
+        reg_addrs=str([reg_addr]),
+        ll="'{}'".format(loglevel) if loglevel else None,
+    )
+    cmd: list[str] = [
+        sys.executable,
+        '-c', code,
    ]
-    kwargs = dict()
+    kwargs = {}
    if platform.system() == 'Windows':
        # without this, tests hang on windows forever
        kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP

    proc = testdir.popen(
-        cmdargs,
+        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        **kwargs,
--- a/tests/test_cancellation.py
+++ b/tests/test_cancellation.py
@ -47,7 +47,7 @@ async def do_nuthin():
    ],
    ids=['no_args', 'unexpected_args'],
 )
-def test_remote_error(arb_addr, args_err):
+def test_remote_error(reg_addr, args_err):
    """Verify an error raised in a subactor that is propagated
    to the parent nursery, contains the underlying boxed builtin
    error type info and causes cancellation and reraising all the
@ -57,7 +57,7 @@ def test_remote_error(arb_addr, args_err):

    async def main():
        async with tractor.open_nursery(
-            arbiter_addr=arb_addr,
+            registry_addrs=[reg_addr],
        ) as nursery:

            # on a remote type error caused by bad input args
@ -97,7 +97,7 @@ def test_remote_error(arb_addr, args_err):
            assert exc.type == errtype


-def test_multierror(arb_addr):
+def test_multierror(reg_addr):
    '''
    Verify we raise a ``BaseExceptionGroup`` out of a nursery where
    more then one actor errors.
@ -105,7 +105,7 @@ def test_multierror(arb_addr):
    '''
    async def main():
        async with tractor.open_nursery(
-            arbiter_addr=arb_addr,
+            registry_addrs=[reg_addr],
        ) as nursery:

            await nursery.run_in_actor(assert_err, name='errorer1')
@ -130,14 +130,14 @@ def test_multierror(arb_addr):
@pytest.mark.parametrize(
    'num_subactors', range(25, 26),
 )
-def test_multierror_fast_nursery(arb_addr, start_method, num_subactors, delay):
+def test_multierror_fast_nursery(reg_addr, start_method, num_subactors, delay):
    """Verify we raise a ``BaseExceptionGroup`` out of a nursery where
    more then one actor errors and also with a delay before failure
    to test failure during an ongoing spawning.
    """
    async def main():
        async with tractor.open_nursery(
-            arbiter_addr=arb_addr,
+            registry_addrs=[reg_addr],
        ) as nursery:

            for i in range(num_subactors):
@ -175,15 +175,20 @@ async def do_nothing():


@pytest.mark.parametrize('mechanism', ['nursery_cancel', KeyboardInterrupt])
-def test_cancel_single_subactor(arb_addr, mechanism):
-    """Ensure a ``ActorNursery.start_actor()`` spawned subactor
+def test_cancel_single_subactor(reg_addr, mechanism):
+    '''
+    Ensure a ``ActorNursery.start_actor()`` spawned subactor
    cancels when the nursery is cancelled.
-    """
+
+    '''
    async def spawn_actor():
-        """Spawn an actor that blocks indefinitely.
-        """
+        '''
+        Spawn an actor that blocks indefinitely then cancel via
+        either `ActorNursery.cancel()` or an exception raise.
+
+        '''
        async with tractor.open_nursery(
-            arbiter_addr=arb_addr,
+            registry_addrs=[reg_addr],
        ) as nursery:

            portal = await nursery.start_actor(
--- a/tests/test_child_manages_service_nursery.py
+++ b/tests/test_child_manages_service_nursery.py
@ -141,7 +141,7 @@ async def open_actor_local_nursery(
 )
 def test_actor_managed_trio_nursery_task_error_cancels_aio(
    asyncio_mode: bool,
-    arb_addr
+    reg_addr: tuple,
 ):
    '''
    Verify that a ``trio`` nursery created managed in a child actor
--- a/tests/test_clustering.py
+++ b/tests/test_clustering.py
@ -49,7 +49,7 @@ async def worker(
    await ctx.started()

    async with ctx.open_stream(
-        backpressure=True,
+        allow_overruns=True,
    ) as stream:

        # TODO: this with the below assert causes a hang bug?
--- a/tests/test_context_stream_semantics.py
+++ b/tests/test_context_stream_semantics.py
@ -1,11 +1,11 @@
 '''
 ``async with ():`` inlined context-stream cancellation testing.

-Verify the we raise errors when streams are opened prior to sync-opening
-a ``tractor.Context`` beforehand.
+Verify that we raise errors when streams are opened prior to
+sync-opening a ``tractor.Context`` beforehand.

 '''
-from contextlib import asynccontextmanager as acm
+# from contextlib import asynccontextmanager as acm
 from itertools import count
 import platform
 from typing import Optional
@ -13,7 +13,15 @@ from typing import Optional
 import pytest
 import trio
 import tractor
-from tractor._exceptions import StreamOverrun
+from tractor import (
+    Actor,
+    Context,
+    current_actor,
+)
+from tractor._exceptions import (
+    StreamOverrun,
+    ContextCancelled,
+)

 from conftest import tractor_test

@ -91,7 +99,10 @@ async def not_started_but_stream_opened(

@pytest.mark.parametrize(
    'target',
-    [too_many_starteds, not_started_but_stream_opened],
+    [
+        too_many_starteds,
+        not_started_but_stream_opened,
+    ],
    ids='misuse_type={}'.format,
 )
 def test_started_misuse(target):
@ -187,9 +198,6 @@ def test_simple_context(
                        else:
                            assert await ctx.result() == 'yo'

-                        if not error_parent:
-                            await ctx.cancel()
-
                        if pointlessly_open_stream:
                            async with ctx.open_stream():
                                if error_parent:
@ -202,10 +210,15 @@ def test_simple_context(
                                    # 'stop' msg to the far end which needs
                                    # to be ignored
                                    pass
+
                        else:
                            if error_parent:
                                raise error_parent

+                            # cancel AFTER we open a stream
+                            # to avoid a cancel raised inside
+                            # `.open_stream()`
+                            await ctx.cancel()
                finally:

                    # after cancellation
@ -228,6 +241,88 @@ def test_simple_context(
        trio.run(main)


+@pytest.mark.parametrize(
+    'callee_returns_early',
+    [True, False],
+    ids=lambda item: f'callee_returns_early={item}'
+)
+@pytest.mark.parametrize(
+    'cancel_method',
+    ['ctx', 'portal'],
+    ids=lambda item: f'cancel_method={item}'
+)
+@pytest.mark.parametrize(
+    'chk_ctx_result_before_exit',
+    [True, False],
+    ids=lambda item: f'chk_ctx_result_before_exit={item}'
+)
+def test_caller_cancels(
+    cancel_method: str,
+    chk_ctx_result_before_exit: bool,
+    callee_returns_early: bool,
+):
+    '''
+    Verify that when the opening side of a context (aka the caller)
+    cancels that context, the ctx does not raise a cancelled when
+    either calling `.result()` or on context exit.
+
+    '''
+    async def check_canceller(
+        ctx: tractor.Context,
+    ) -> None:
+        # should not raise yet return the remote
+        # context cancelled error.
+        res = await ctx.result()
+
+        if callee_returns_early:
+            assert res == 'yo'
+
+        else:
+            err = res
+            assert isinstance(err, ContextCancelled)
+            assert (
+                tuple(err.canceller)
+                ==
+                current_actor().uid
+            )
+
+    async def main():
+        async with tractor.open_nursery() as nursery:
+            portal = await nursery.start_actor(
+                'simple_context',
+                enable_modules=[__name__],
+            )
+            timeout = 0.5 if not callee_returns_early else 2
+            with trio.fail_after(timeout):
+                async with portal.open_context(
+                    simple_setup_teardown,
+                    data=10,
+                    block_forever=not callee_returns_early,
+                ) as (ctx, sent):
+
+                    if callee_returns_early:
+                        # ensure we block long enough before sending
+                        # a cancel such that the callee has already
+                        # returned its result.
+                        await trio.sleep(0.5)
+
+                    if cancel_method == 'ctx':
+                        await ctx.cancel()
+                    else:
+                        await portal.cancel_actor()
+
+                    if chk_ctx_result_before_exit:
+                        await check_canceller(ctx)
+
+            if not chk_ctx_result_before_exit:
+                await check_canceller(ctx)
+
+            if cancel_method != 'portal':
+                await portal.cancel_actor()
+
+    trio.run(main)
+
+
 # basic stream terminations:
 # - callee context closes without using stream
 # - caller context closes without using stream
@ -342,9 +437,11 @@ async def test_caller_closes_ctx_after_callee_opens_stream(
 ):
    'caller context closes without using stream'

-    async with tractor.open_nursery() as n:
+    async with tractor.open_nursery() as an:

-        portal = await n.start_actor(
+        root: Actor = current_actor()
+
+        portal = await an.start_actor(
            'ctx_cancelled',
            enable_modules=[__name__],
        )
@ -352,10 +449,10 @@ async def test_caller_closes_ctx_after_callee_opens_stream(
        async with portal.open_context(
            expect_cancelled,
        ) as (ctx, sent):
-            await portal.run(assert_state, value=True)
-
            assert sent is None

+            await portal.run(assert_state, value=True)
+
            # call cancel explicitly
            if use_ctx_cancel_method:

@ -366,8 +463,21 @@ async def test_caller_closes_ctx_after_callee_opens_stream(
                        async for msg in stream:
                            pass

-                except tractor.ContextCancelled:
-                    raise  # XXX: must be propagated to __aexit__
+                except tractor.ContextCancelled as ctxc:
+                    # XXX: the cause is US since we call
+                    # `Context.cancel()` just above!
+                    assert (
+                        ctxc.canceller
+                        ==
+                        current_actor().uid
+                        ==
+                        root.uid
+                    )
+
+                    # XXX: must be propagated to __aexit__
+                    # and should be silently absorbed there
+                    # since we called `.cancel()` just above ;)
+                    raise

                else:
                    assert 0, "Should have context cancelled?"
@ -384,7 +494,13 @@ async def test_caller_closes_ctx_after_callee_opens_stream(
                        await ctx.result()
                        assert 0, "Callee should have blocked!?"
                except trio.TooSlowError:
+                    # NO-OP -> since already called above
                    await ctx.cancel()
+
+        # local scope should have absorbed the cancellation
+        assert ctx.cancelled_caught
+        assert ctx._remote_error is ctx._local_error
+
        try:
            async with ctx.open_stream() as stream:
                async for msg in stream:
@ -463,19 +579,25 @@ async def cancel_self(
    global _state
    _state = True

+    # since we call this the below `.open_stream()` should always
+    # error!
    await ctx.cancel()

    # should inline raise immediately
    try:
        async with ctx.open_stream():
            pass
-    except tractor.ContextCancelled:
+    # except tractor.ContextCancelled:
+    except RuntimeError:
        # suppress for now so we can do checkpoint tests below
-        pass
+        print('Got expected runtime error for stream-after-cancel')
+
    else:
        raise RuntimeError('Context didnt cancel itself?!')

-    # check a real ``trio.Cancelled`` is raised on a checkpoint
+    # check that ``trio.Cancelled`` is now raised on any further
+    # checkpoints since the self cancel above will have cancelled
+    # the `Context._scope.cancel_scope: trio.CancelScope`
    try:
        with trio.fail_after(0.1):
            await trio.sleep_forever()
@ -486,6 +608,7 @@ async def cancel_self(
        # should never get here
        assert 0

+    raise RuntimeError('Context didnt cancel itself?!')

@tractor_test
 async def test_callee_cancels_before_started():
@ -506,7 +629,6 @@ async def test_callee_cancels_before_started():
                cancel_self,
            ) as (ctx, sent):
                async with ctx.open_stream():
-
                    await trio.sleep_forever()

        # raises a special cancel signal
@ -514,7 +636,7 @@ async def test_callee_cancels_before_started():
            ce.type == trio.Cancelled

            # the traceback should be informative
-            assert 'cancelled itself' in ce.msgdata['tb_str']
+            assert 'itself' in ce.msgdata['tb_str']

        # teardown the actor
        await portal.cancel_actor()
@ -559,7 +681,6 @@ async def keep_sending_from_callee(
    'overrun_by',
    [
        ('caller', 1, never_open_stream),
-        ('cancel_caller_during_overrun', 1, never_open_stream),
        ('callee', 0, keep_sending_from_callee),
    ],
    ids='overrun_condition={}'.format,
@ -589,14 +710,13 @@ def test_one_end_stream_not_opened(overrun_by):
                if 'caller' in overrunner:

                    async with ctx.open_stream() as stream:
+
+                        # itersend +1 msg more than the buffer size
+                        # to cause the most basic overrun.
                        for i in range(buf_size):
                            print(f'sending {i}')
                            await stream.send(i)

-                        if 'cancel' in overrunner:
-                            # without this we block waiting on the child side
-                            await ctx.cancel()
-
                        else:
                            # expect overrun error to be relayed back
                            # and this sleep interrupted
@ -610,7 +730,9 @@ def test_one_end_stream_not_opened(overrun_by):

    # 2 overrun cases and the no overrun case (which pushes right up to
    # the msg limit)
-    if overrunner == 'caller' or 'cance' in overrunner:
+    if (
+        overrunner == 'caller'
+    ):
        with pytest.raises(tractor.RemoteActorError) as excinfo:
            trio.run(main)

@ -634,40 +756,102 @@ async def echo_back_sequence(

    ctx:  tractor.Context,
    seq: list[int],
-    msg_buffer_size: Optional[int] = None,
+    wait_for_cancel: bool,
+    allow_overruns_side: str,
+    be_slow: bool = False,
+    msg_buffer_size: int = 1,

 ) -> None:
    '''
-    Send endlessly on the calleee stream.
+    Send endlessly on the callee stream using a small buffer size
+    setting on the context to simulate backlogging that would normally
+    cause overruns.

    '''
+    # NOTE: ensure that if the caller is expecting to cancel this task
+    # that we stay echoing much longer than they are so we don't
+    # return early instead of receiving the cancel msg.
+    total_batches: int = 1000 if wait_for_cancel else 6
+
    await ctx.started()
+    # await tractor.breakpoint()
    async with ctx.open_stream(
        msg_buffer_size=msg_buffer_size,
+
+        # literally the point of this test XD
+        allow_overruns=(allow_overruns_side in {'child', 'both'}),
    ) as stream:

-        seq = list(seq)  # bleh, `msgpack`...
-        count = 0
-        while count < 3:
+        # ensure mem chan settings are correct
+        assert (
+            ctx._send_chan._state.max_buffer_size
+            ==
+            msg_buffer_size
+        )
+
+        seq = list(seq)  # bleh, msgpack sometimes ain't decoded right
+        for _ in range(total_batches):
            batch = []
            async for msg in stream:
                batch.append(msg)
                if batch == seq:
                    break

+                if be_slow:
+                    await trio.sleep(0.05)
+
+                print('callee waiting on next')
+
            for msg in batch:
                print(f'callee sending {msg}')
                await stream.send(msg)

-            count += 1
-
-        return 'yo'
+    print(
+        'EXITING CALLEEE:\n'
+        f'{ctx.canceller}'
+    )
+    return 'yo'


-def test_stream_backpressure():
+@pytest.mark.parametrize(
+    # aka the side that will / should raise
+    # an overrun under normal conditions.
+    'allow_overruns_side',
+    ['parent', 'child', 'none', 'both'],
+    ids=lambda item: f'allow_overruns_side={item}'
+)
+@pytest.mark.parametrize(
+    # aka the side that is deliberately slowed down
+    # (via sleeps) while streaming.
+    'slow_side',
+    ['parent', 'child'],
+    ids=lambda item: f'slow_side={item}'
+)
+@pytest.mark.parametrize(
+    'cancel_ctx',
+    [True, False],
+    ids=lambda item: f'cancel_ctx={item}'
+)
+def test_maybe_allow_overruns_stream(
+    cancel_ctx: bool,
+    slow_side: str,
+    allow_overruns_side: str,
+    loglevel: str,
+):
    '''
    Demonstrate small overruns of each task back and forth
-    on a stream not raising any errors by default.
+    on a stream not raising any errors by default by setting
+    the ``allow_overruns=True``.
+
+    The original idea here was to show that if you set the feeder mem
+    chan to a size smaller than the # of msgs sent you could avoid
+    getting a `StreamOverrun` crash plus maybe get all the msgs that were
+    sent. The problem with the "real backpressure" case is that due to
+    the current arch it can result in the msg loop being blocked and thus
+    blocking cancellation - which is like super bad. So instead this test
+    had to be adjusted to more or less just "not send overrun errors" so
+    as to handle the case where the sender just moreso cares about not getting
+    errored out when it sends too fast..

    '''
    async def main():
@ -675,124 +859,101 @@ def test_stream_backpressure():
            portal = await n.start_actor(
                'callee_sends_forever',
                enable_modules=[__name__],
+                loglevel=loglevel,
+
+                # debug_mode=True,
            )
-            seq = list(range(3))
+            seq = list(range(10))
            async with portal.open_context(
                echo_back_sequence,
                seq=seq,
-                msg_buffer_size=1,
+                wait_for_cancel=cancel_ctx,
+                be_slow=(slow_side == 'child'),
+                allow_overruns_side=allow_overruns_side,
            ) as (ctx, sent):
+
                assert sent is None

-                async with ctx.open_stream(msg_buffer_size=1) as stream:
-                    count = 0
-                    while count < 3:
+                async with ctx.open_stream(
+                    msg_buffer_size=1 if slow_side == 'parent' else None,
+                    allow_overruns=(allow_overruns_side in {'parent', 'both'}),
+                ) as stream:
+
+                    total_batches: int = 2
+                    for _ in range(total_batches):
                        for msg in seq:
-                            print(f'caller sending {msg}')
+                            # print(f'root tx {msg}')
                            await stream.send(msg)
-                            await trio.sleep(0.1)
+                            if slow_side == 'parent':
+                                # NOTE: we make the parent slightly
+                                # slower, when it is slow, to make sure
+                                # that in the overruns everywhere case
+                                await trio.sleep(0.16)

                        batch = []
                        async for msg in stream:
+                            print(f'root rx {msg}')
                            batch.append(msg)
                            if batch == seq:
                                break

-                        count += 1
+                if cancel_ctx:
+                    # cancel the remote task
+                    print('sending root side cancel')
+                    await ctx.cancel()

-            # here the context should return
-            assert await ctx.result() == 'yo'
+            res = await ctx.result()
+
+            if cancel_ctx:
+                assert isinstance(res, ContextCancelled)
+                assert tuple(res.canceller) == current_actor().uid
+
+            else:
+                print(f'RX ROOT SIDE RESULT {res}')
+                assert res == 'yo'

            # cancel the daemon
            await portal.cancel_actor()

-    trio.run(main)
-
-
-@tractor.context
-async def sleep_forever(
-    ctx: tractor.Context,
-) -> None:
-    await ctx.started()
-    async with ctx.open_stream():
-        await trio.sleep_forever()
-
-
-@acm
-async def attach_to_sleep_forever():
-    '''
-    Cancel a context **before** any underlying error is raised in order
-    to trigger a local reception of a ``ContextCancelled`` which **should not**
-    be re-raised in the local surrounding ``Context`` *iff* the cancel was
-    requested by **this** side of the context.
-
-    '''
-    async with tractor.wait_for_actor('sleeper') as p2:
-        async with (
-            p2.open_context(sleep_forever) as (peer_ctx, first),
-            peer_ctx.open_stream(),
-        ):
-            try:
-                yield
-            finally:
-                # XXX: previously this would trigger local
-                # ``ContextCancelled`` to be received and raised in the
-                # local context overriding any local error due to
-                # logic inside ``_invoke()`` which checked for
-                # an error set on ``Context._error`` and raised it in
-                # under a cancellation scenario.
-
-                # The problem is you can have a remote cancellation
-                # that is part of a local error and we shouldn't raise
-                # ``ContextCancelled`` **iff** we weren't the side of
-                # the context to initiate it, i.e.
-                # ``Context._cancel_called`` should **NOT** have been
-                # set. The special logic to handle this case is now
-                # inside ``Context._may_raise_from_remote_msg()`` XD
-                await peer_ctx.cancel()
-
-
-@tractor.context
-async def error_before_started(
-    ctx: tractor.Context,
-) -> None:
-    '''
-    This simulates exactly an original bug discovered in:
-    https://github.com/pikers/piker/issues/244
-
-    '''
-    async with attach_to_sleep_forever():
-        # send an unserializable type which should raise a type error
-        # here and **NOT BE SWALLOWED** by the surrounding acm!!?!
-        await ctx.started(object())
-
-
-def test_do_not_swallow_error_before_started_by_remote_contextcancelled():
-    '''
-    Verify that an error raised in a remote context which itself opens another
-    remote context, which it cancels, does not ovverride the original error that
-    caused the cancellation of the secondardy context.
-
-    '''
-    async def main():
-        async with tractor.open_nursery() as n:
-            portal = await n.start_actor(
-                'errorer',
-                enable_modules=[__name__],
-            )
-            await n.start_actor(
-                'sleeper',
-                enable_modules=[__name__],
-            )
-
-            async with (
-                portal.open_context(
-                    error_before_started
-                ) as (ctx, sent),
-            ):
-                await trio.sleep_forever()
-
-    with pytest.raises(tractor.RemoteActorError) as excinfo:
+    if (
+        allow_overruns_side == 'both'
+        or slow_side == allow_overruns_side
+    ):
        trio.run(main)

-    assert excinfo.value.type == TypeError
+    elif (
+        slow_side != allow_overruns_side
+    ):
+
+        with pytest.raises(tractor.RemoteActorError) as excinfo:
+            trio.run(main)
+
+        err = excinfo.value
+
+        if (
+            allow_overruns_side == 'none'
+        ):
+            # depends on timing and is racy as to which side will
+            # overrun first :sadkitty:
+
+            # NOTE: i tried to isolate to a deterministic case here
+            # based on timing, but i was kinda wasted, and i don't
+            # think it's sane to catch them..
+            assert err.type in (
+                tractor.RemoteActorError,
+                StreamOverrun,
+            )
+
+        elif (
+            slow_side == 'child'
+        ):
+            assert err.type == StreamOverrun
+
+        elif slow_side == 'parent':
+            assert err.type == tractor.RemoteActorError
+            assert 'StreamOverrun' in err.msgdata['tb_str']
+
+    else:
+        # if this hits the logic blocks from above are not
+        # exhaustive..
+        pytest.fail('PARAMETRIZED CASE GEN PROBLEM YO')
--- a/tests/test_debugger.py
+++ b/tests/test_debugger.py
@ -78,7 +78,7 @@ has_nested_actors = pytest.mark.has_nested_actors
 def spawn(
    start_method,
    testdir,
-    arb_addr,
+    reg_addr,
 ) -> 'pexpect.spawn':

    if start_method != 'trio':
@ -166,7 +166,7 @@ def ctlc(
        # XXX: disable pygments highlighting for auto-tests
        # since some envs (like actions CI) will struggle
        # the the added color-char encoding..
-        from tractor._debug import TractorConfig
+        from tractor.devx._debug import TractorConfig
        TractorConfig.use_pygements = False

    yield use_ctlc
@ -607,7 +607,7 @@ def test_multi_daemon_subactors(
    # now the root actor won't clobber the bp_forever child
    # during it's first access to the debug lock, but will instead
    # wait for the lock to release, by the edge triggered
-    # ``_debug.Lock.no_remote_has_tty`` event before sending cancel messages
+    # ``devx._debug.Lock.no_remote_has_tty`` event before sending cancel messages
    # (via portals) to its underlings B)

    # at some point here there should have been some warning msg from
--- a/tests/test_discovery.py
+++ b/tests/test_discovery.py
@ -15,19 +15,19 @@ from conftest import tractor_test


@tractor_test
-async def test_reg_then_unreg(arb_addr):
+async def test_reg_then_unreg(reg_addr):
    actor = tractor.current_actor()
    assert actor.is_arbiter
    assert len(actor._registry) == 1  # only self is registered

    async with tractor.open_nursery(
-        arbiter_addr=arb_addr,
+        registry_addrs=[reg_addr],
    ) as n:

        portal = await n.start_actor('actor', enable_modules=[__name__])
        uid = portal.channel.uid

-        async with tractor.get_arbiter(*arb_addr) as aportal:
+        async with tractor.get_arbiter(*reg_addr) as aportal:
            # this local actor should be the arbiter
            assert actor is aportal.actor

@ -53,15 +53,27 @@ async def hi():
    return the_line.format(tractor.current_actor().name)


-async def say_hello(other_actor):
+async def say_hello(
+    other_actor: str,
+    reg_addr: tuple[str, int],
+):
    await trio.sleep(1)  # wait for other actor to spawn
-    async with tractor.find_actor(other_actor) as portal:
+    async with tractor.find_actor(
+        other_actor,
+        registry_addrs=[reg_addr],
+    ) as portal:
        assert portal is not None
        return await portal.run(__name__, 'hi')


-async def say_hello_use_wait(other_actor):
-    async with tractor.wait_for_actor(other_actor) as portal:
+async def say_hello_use_wait(
+    other_actor: str,
+    reg_addr: tuple[str, int],
+):
+    async with tractor.wait_for_actor(
+        other_actor,
+        registry_addr=reg_addr,
+    ) as portal:
        assert portal is not None
        result = await portal.run(__name__, 'hi')
        return result
@ -69,21 +81,29 @@ async def say_hello_use_wait(other_actor):

@tractor_test
@pytest.mark.parametrize('func', [say_hello, say_hello_use_wait])
-async def test_trynamic_trio(func, start_method, arb_addr):
-    """Main tractor entry point, the "master" process (for now
-    acts as the "director").
-    """
+async def test_trynamic_trio(
+    func,
+    start_method,
+    reg_addr,
+):
+    '''
+    Root actor acting as the "director" and running one-shot-task-actors
+    for the directed subs.
+
+    '''
    async with tractor.open_nursery() as n:
        print("Alright... Action!")

        donny = await n.run_in_actor(
            func,
            other_actor='gretchen',
+            reg_addr=reg_addr,
            name='donny',
        )
        gretchen = await n.run_in_actor(
            func,
            other_actor='donny',
+            reg_addr=reg_addr,
            name='gretchen',
        )
        print(await gretchen.result())
@ -131,7 +151,7 @@ async def unpack_reg(actor_or_portal):


 async def spawn_and_check_registry(
-    arb_addr: tuple,
+    reg_addr: tuple,
    use_signal: bool,
    remote_arbiter: bool = False,
    with_streaming: bool = False,
@ -139,9 +159,9 @@ async def spawn_and_check_registry(
 ) -> None:

    async with tractor.open_root_actor(
-        arbiter_addr=arb_addr,
+        registry_addrs=[reg_addr],
    ):
-        async with tractor.get_arbiter(*arb_addr) as portal:
+        async with tractor.get_arbiter(*reg_addr) as portal:
            # runtime needs to be up to call this
            actor = tractor.current_actor()

@ -213,17 +233,19 @@ async def spawn_and_check_registry(
 def test_subactors_unregister_on_cancel(
    start_method,
    use_signal,
-    arb_addr,
+    reg_addr,
    with_streaming,
 ):
-    """Verify that cancelling a nursery results in all subactors
+    '''
+    Verify that cancelling a nursery results in all subactors
    deregistering themselves with the arbiter.
-    """
+
+    '''
    with pytest.raises(KeyboardInterrupt):
        trio.run(
            partial(
                spawn_and_check_registry,
-                arb_addr,
+                reg_addr,
                use_signal,
                remote_arbiter=False,
                with_streaming=with_streaming,
@ -237,7 +259,7 @@ def test_subactors_unregister_on_cancel_remote_daemon(
    daemon,
    start_method,
    use_signal,
-    arb_addr,
+    reg_addr,
    with_streaming,
 ):
    """Verify that cancelling a nursery results in all subactors
@ -248,7 +270,7 @@ def test_subactors_unregister_on_cancel_remote_daemon(
        trio.run(
            partial(
                spawn_and_check_registry,
-                arb_addr,
+                reg_addr,
                use_signal,
                remote_arbiter=True,
                with_streaming=with_streaming,
@ -262,7 +284,7 @@ async def streamer(agen):


 async def close_chans_before_nursery(
-    arb_addr: tuple,
+    reg_addr: tuple,
    use_signal: bool,
    remote_arbiter: bool = False,
 ) -> None:
@ -275,9 +297,9 @@ async def close_chans_before_nursery(
        entries_at_end = 1

    async with tractor.open_root_actor(
-        arbiter_addr=arb_addr,
+        registry_addrs=[reg_addr],
    ):
-        async with tractor.get_arbiter(*arb_addr) as aportal:
+        async with tractor.get_arbiter(*reg_addr) as aportal:
            try:
                get_reg = partial(unpack_reg, aportal)

@ -329,7 +351,7 @@ async def close_chans_before_nursery(
 def test_close_channel_explicit(
    start_method,
    use_signal,
-    arb_addr,
+    reg_addr,
 ):
    """Verify that closing a stream explicitly and killing the actor's
    "root nursery" **before** the containing nursery tears down also
@ -339,7 +361,7 @@ def test_close_channel_explicit(
        trio.run(
            partial(
                close_chans_before_nursery,
-                arb_addr,
+                reg_addr,
                use_signal,
                remote_arbiter=False,
            ),
@ -351,7 +373,7 @@ def test_close_channel_explicit_remote_arbiter(
    daemon,
    start_method,
    use_signal,
-    arb_addr,
+    reg_addr,
 ):
    """Verify that closing a stream explicitly and killing the actor's
    "root nursery" **before** the containing nursery tears down also
@ -361,7 +383,7 @@ def test_close_channel_explicit_remote_arbiter(
        trio.run(
            partial(
                close_chans_before_nursery,
-                arb_addr,
+                reg_addr,
                use_signal,
                remote_arbiter=True,
            ),
--- a/tests/test_docs_examples.py
+++ b/tests/test_docs_examples.py
@ -21,7 +21,7 @@ from conftest import (
 def run_example_in_subproc(
    loglevel: str,
    testdir,
-    arb_addr: tuple[str, int],
+    reg_addr: tuple[str, int],
 ):

    @contextmanager
--- a/tests/test_infected_asyncio.py
+++ b/tests/test_infected_asyncio.py
@ -15,6 +15,7 @@ import tractor
 from tractor import (
    to_asyncio,
    RemoteActorError,
+    ContextCancelled,
 )
 from tractor.trionics import BroadcastReceiver

@ -46,7 +47,7 @@ async def trio_cancels_single_aio_task():
        await tractor.to_asyncio.run_task(sleep_forever)


-def test_trio_cancels_aio_on_actor_side(arb_addr):
+def test_trio_cancels_aio_on_actor_side(reg_addr):
    '''
    Spawn an infected actor that is cancelled by the ``trio`` side
    task using std cancel scope apis.
@ -54,7 +55,7 @@ def test_trio_cancels_aio_on_actor_side(arb_addr):
    '''
    async def main():
        async with tractor.open_nursery(
-            arbiter_addr=arb_addr
+            registry_addrs=[reg_addr]
        ) as n:
            await n.run_in_actor(
                trio_cancels_single_aio_task,
@ -93,7 +94,7 @@ async def asyncio_actor(
        raise


-def test_aio_simple_error(arb_addr):
+def test_aio_simple_error(reg_addr):
    '''
    Verify a simple remote asyncio error propagates back through trio
    to the parent actor.
@ -102,7 +103,7 @@ def test_aio_simple_error(arb_addr):
    '''
    async def main():
        async with tractor.open_nursery(
-            arbiter_addr=arb_addr
+            registry_addrs=[reg_addr]
        ) as n:
            await n.run_in_actor(
                asyncio_actor,
@ -119,7 +120,7 @@ def test_aio_simple_error(arb_addr):
    assert err.type == AssertionError


-def test_tractor_cancels_aio(arb_addr):
+def test_tractor_cancels_aio(reg_addr):
    '''
    Verify we can cancel a spawned asyncio task gracefully.

@ -138,7 +139,7 @@ def test_tractor_cancels_aio(arb_addr):
    trio.run(main)


-def test_trio_cancels_aio(arb_addr):
+def test_trio_cancels_aio(reg_addr):
    '''
    Much like the above test with ``tractor.Portal.cancel_actor()``
    except we just use a standard ``trio`` cancellation api.
@ -193,7 +194,7 @@ async def trio_ctx(
    ids='parent_actor_cancels_child={}'.format
 )
 def test_context_spawns_aio_task_that_errors(
-    arb_addr,
+    reg_addr,
    parent_cancels: bool,
 ):
    '''
@ -224,14 +225,23 @@ def test_context_spawns_aio_task_that_errors(

                    await trio.sleep_forever()

-    with pytest.raises(RemoteActorError) as excinfo:
-        trio.run(main)
+                return await ctx.result()

-    err = excinfo.value
-    assert isinstance(err, RemoteActorError)
    if parent_cancels:
-        assert err.type == trio.Cancelled
+        # bc the parent made the cancel request,
+        # the error is not raised locally but instead
+        # the context is exited silently
+        res = trio.run(main)
+        assert isinstance(res, ContextCancelled)
+        assert 'root' in res.canceller[0]
+
    else:
+        expect = RemoteActorError
+        with pytest.raises(expect) as excinfo:
+            trio.run(main)
+
+        err = excinfo.value
+        assert isinstance(err, expect)
        assert err.type == AssertionError


@ -248,7 +258,7 @@ async def aio_cancel():
    await sleep_forever()


-def test_aio_cancelled_from_aio_causes_trio_cancelled(arb_addr):
+def test_aio_cancelled_from_aio_causes_trio_cancelled(reg_addr):

    async def main():
        async with tractor.open_nursery() as n:
@ -385,7 +395,7 @@ async def stream_from_aio(
    'fan_out', [False, True],
    ids='fan_out_w_chan_subscribe={}'.format
 )
-def test_basic_interloop_channel_stream(arb_addr, fan_out):
+def test_basic_interloop_channel_stream(reg_addr, fan_out):
    async def main():
        async with tractor.open_nursery() as n:
            portal = await n.run_in_actor(
@ -399,7 +409,7 @@ def test_basic_interloop_channel_stream(arb_addr, fan_out):


 # TODO: parametrize the above test and avoid the duplication here?
-def test_trio_error_cancels_intertask_chan(arb_addr):
+def test_trio_error_cancels_intertask_chan(reg_addr):
    async def main():
        async with tractor.open_nursery() as n:
            portal = await n.run_in_actor(
@ -418,7 +428,7 @@ def test_trio_error_cancels_intertask_chan(arb_addr):
        assert exc.type == Exception


-def test_trio_closes_early_and_channel_exits(arb_addr):
+def test_trio_closes_early_and_channel_exits(reg_addr):
    async def main():
        async with tractor.open_nursery() as n:
            portal = await n.run_in_actor(
@ -433,7 +443,7 @@ def test_trio_closes_early_and_channel_exits(arb_addr):
    trio.run(main)


-def test_aio_errors_and_channel_propagates_and_closes(arb_addr):
+def test_aio_errors_and_channel_propagates_and_closes(reg_addr):
    async def main():
        async with tractor.open_nursery() as n:
            portal = await n.run_in_actor(
@ -510,7 +520,7 @@ async def trio_to_aio_echo_server(
    ids='raise_error={}'.format,
 )
 def test_echoserver_detailed_mechanics(
-    arb_addr,
+    reg_addr,
    raise_error_mid_stream,
 ):

--- a/tests/test_inter_peer_cancellation.py
+++ b/tests/test_inter_peer_cancellation.py
@ -0,0 +1,556 @@
+'''
+Codify the cancellation request semantics in terms
+of one remote actor cancelling another.
+
+'''
+# from contextlib import asynccontextmanager as acm
+import itertools
+
+import pytest
+import trio
+import tractor
+from tractor import (  # typing
+    Portal,
+    Context,
+    ContextCancelled,
+)
+
+# XXX TODO cases:
+# - [ ] peer cancelled itself - so other peers should
+#   get errors reflecting that the peer was itself the .canceller?
+
+# - [x] WE cancelled the peer and thus should not see any raised
+#   `ContextCancelled` as it should be reaped silently?
+#   => pretty sure `test_context_stream_semantics::test_caller_cancels()`
+#      already covers this case?
+
+# - [x] INTER-PEER: some arbitrary remote peer cancels via
+#   Portal.cancel_actor().
+#   => all other connected peers should get that cancel requesting peer's
+#      uid in the ctx-cancelled error msg raised in all open ctxs
+#      with that peer.
+
+# - [ ] PEER-FAILS-BY-CHILD-ERROR: peer spawned a sub-actor which
+#   (also) spawned a failing task which was unhandled and
+#   propagated up to the immediate parent - the peer to the actor
+#   that also spawned a remote task in that same peer-parent.
+
+
+# def test_self_cancel():
+#     '''
+#     2 cases:
+#     - calls `Actor.cancel()` locally in some task
+#     - calls LocalPortal.cancel_actor()` ?
+
+#     '''
+#     ...
+
+
+@tractor.context
+async def sleep_forever(
+    ctx: Context,
+    expect_ctxc: bool = False,
+) -> None:
+    '''
+    Sync the context, open a stream then just sleep.
+
+    Allow checking for (context) cancellation locally.
+
+    '''
+    try:
+        await ctx.started()
+        async with ctx.open_stream():
+            await trio.sleep_forever()
+
+    except BaseException as berr:
+
+        # TODO: it'd sure be nice to be able to inject our own
+        # `ContextCancelled` here instead of `trio.Cancelled`
+        # so that our runtime can expect it and this "user code"
+        # would be able to tell the diff between a generic trio
+        # cancel and a tractor runtime-IPC cancel.
+        if expect_ctxc:
+            assert isinstance(berr, trio.Cancelled)
+
+        raise
+
+
+@tractor.context
+async def error_before_started(
+    ctx: Context,
+) -> None:
+    '''
+    This simulates exactly an original bug discovered in:
+    https://github.com/pikers/piker/issues/244
+
+    Cancel a context **before** any underlying error is raised so
+    as to trigger a local reception of a ``ContextCancelled`` which
+    SHOULD NOT be re-raised in the local surrounding ``Context``
+    *iff* the cancel was requested by **this** (callee)  side of
+    the context.
+
+    '''
+    async with tractor.wait_for_actor('sleeper') as p2:
+        async with (
+            p2.open_context(sleep_forever) as (peer_ctx, first),
+            peer_ctx.open_stream(),
+        ):
+            # NOTE: this WAS inside an @acm body but i factored it
+            # out and just put it inline here since i don't think
+            # the mngr part really matters, though maybe it could?
+            try:
+                # XXX NOTE XXX: THIS sends an UNSERIALIZABLE TYPE which
+                # should raise a `TypeError` and **NOT BE SWALLOWED** by
+                # the surrounding try/finally (normally inside the
+                # body of some acm)..
+                await ctx.started(object())
+                # yield
+            finally:
+                # XXX: previously this would trigger local
+                # ``ContextCancelled`` to be received and raised in the
+                # local context overriding any local error due to logic
+                # inside ``_invoke()`` which checked for an error set on
+                # ``Context._error`` and raised it in a cancellation
+                # scenario.
+                # ------
+                # The problem is you can have a remote cancellation that
+                # is part of a local error and we shouldn't raise
+                # ``ContextCancelled`` **iff** we **were not** the side
+                # of the context to initiate it, i.e.
+                # ``Context._cancel_called`` should **NOT** have been
+                # set. The special logic to handle this case is now
+                # inside ``Context._maybe_raise_from_remote_msg()`` XD
+                await peer_ctx.cancel()
+
+
+def test_do_not_swallow_error_before_started_by_remote_contextcancelled():
+    '''
+    Verify that an error raised in a remote context which itself
+    opens YET ANOTHER remote context, which it then cancels, does not
+    override the original error that caused the cancellation of the
+    secondary context.
+
+    '''
+    async def main():
+        async with tractor.open_nursery() as n:
+            portal = await n.start_actor(
+                'errorer',
+                enable_modules=[__name__],
+            )
+            await n.start_actor(
+                'sleeper',
+                enable_modules=[__name__],
+            )
+
+            async with (
+                portal.open_context(
+                    error_before_started
+                ) as (ctx, sent),
+            ):
+                await trio.sleep_forever()
+
+    with pytest.raises(tractor.RemoteActorError) as excinfo:
+        trio.run(main)
+
+    assert excinfo.value.type == TypeError
+
+
+@tractor.context
+async def sleep_a_bit_then_cancel_peer(
+    ctx: Context,
+    peer_name: str = 'sleeper',
+    cancel_after: float = .5,
+
+) -> None:
+    '''
+    Connect to peer, sleep as per input delay, cancel the peer.
+
+    '''
+    peer: Portal
+    async with tractor.wait_for_actor(peer_name) as peer:
+        await ctx.started()
+        await trio.sleep(cancel_after)
+        await peer.cancel_actor()
+
+
+@tractor.context
+async def stream_ints(
+    ctx: Context,
+):
+    await ctx.started()
+    async with ctx.open_stream() as stream:
+        for i in itertools.count():
+            await stream.send(i)
+            await trio.sleep(0.01)
+
+
+@tractor.context
+async def stream_from_peer(
+    ctx: Context,
+    peer_name: str = 'sleeper',
+) -> None:
+
+    peer: Portal
+    try:
+        async with (
+            tractor.wait_for_actor(peer_name) as peer,
+            peer.open_context(stream_ints) as (peer_ctx, first),
+            peer_ctx.open_stream() as stream,
+        ):
+            await ctx.started()
+            # XXX QUESTIONS & TODO: for further details around this
+            # in the longer run..
+            # https://github.com/goodboy/tractor/issues/368
+            # - should we raise `ContextCancelled` or `Cancelled` (rn
+            #   it does latter) and should/could it be implemented
+            #   as a general injection override for `trio` such
+            #   that ANY next checkpoint would raise the "cancel
+            #   error type" of choice?
+            # - should the `ContextCancelled` bubble from
+            #   all `Context` and `MsgStream` apis wherein it
+            #   prolly makes the most sense to make it
+            #   a `trio.Cancelled` subtype?
+            # - what about IPC-transport specific errors, should
+            #   they bubble from the async for and trigger
+            #   other special cases?
+            # NOTE: current ctl flow:
+            # - stream raises `trio.EndOfChannel` and
+            #   exits the loop
+            # - `.open_context()` will raise the ctxcanc
+            #   received from the sleeper.
+            async for msg in stream:
+                assert msg is not None
+                print(msg)
+
+    # NOTE: cancellation of the (sleeper) peer should always
+    # cause a `ContextCancelled` raise in this streaming
+    # actor.
+    except ContextCancelled as ctxerr:
+        err = ctxerr
+        assert peer_ctx._remote_error is ctxerr
+        assert peer_ctx.canceller == ctxerr.canceller
+
+        # caller peer should not be the cancel requester
+        assert not ctx.cancel_called
+        # XXX can never be true since `._invoke` only
+        # sets this AFTER the nursery block this task
+        # was started in, exits.
+        assert not ctx.cancelled_caught
+
+        # we never requested cancellation
+        assert not peer_ctx.cancel_called
+        # the `.open_context()` exit definitely caught
+        # a cancellation in the internal `Context._scope` since
+        # likely the runtime called `_deliver_msg()` after
+        # receiving the remote error from the streaming task.
+        assert peer_ctx.cancelled_caught
+
+        # TODO / NOTE `.canceller` won't have been set yet
+        # here because that machinery is inside
+        # `.open_context().__aexit__()` BUT, if we had
+        # a way to know immediately (from the last
+        # checkpoint) that cancellation was due to
+        # a remote, we COULD assert this here..see,
+        # https://github.com/goodboy/tractor/issues/368
+
+        # root/parent actor task should NEVER HAVE cancelled us!
+        assert not ctx.canceller
+        assert 'canceller' in peer_ctx.canceller
+
+        raise
+        # TODO: IN THEORY we could have other cases depending on
+        # who cancels first, the root actor or the canceller peer?.
+        #
+        # 1- when the peer request is first then the `.canceller`
+        #   field should obvi be set to the 'canceller' uid,
+        #
+        # 2-if the root DOES req cancel then we should see the same
+        #   `trio.Cancelled` implicitly raised
+        # assert ctx.canceller[0] == 'root'
+        # assert peer_ctx.canceller[0] == 'sleeper'
+
+    raise RuntimeError(
+        'peer never triggered local `ContextCancelled`?'
+    )
+
+
+@pytest.mark.parametrize(
+    'error_during_ctxerr_handling',
+    [False, True],
+)
+def test_peer_canceller(
+    error_during_ctxerr_handling: bool,
+):
+    '''
+    Verify that a cancellation triggered by an in-actor-tree peer
+    results in cancelled errors in all other actors which have
+    opened contexts to that same actor.
+
+    legend:
+    name>
+        a "play button" that indicates a new runtime instance,
+        an individual actor with `name`.
+
+    .subname>
+        a subactor who's parent should be on some previous
+        line and be less indented.
+
+    .actor0> ()-> .actor1>
+        an inter-actor task context opened (by `async with
+        Portal.open_context()`) from actor0 *into* actor1.
+
+    .actor0> ()<=> .actor1>
+        an inter-actor task context opened (as above)
+        from actor0 *into* actor1 which INCLUDES an additional
+        stream open using `async with Context.open_stream()`.
+
+
+    ------ - ------
+    supervision view
+    ------ - ------
+    root>
+     .sleeper> TODO: SOME SYNTAX SHOWING JUST SLEEPING
+     .just_caller> ()=> .sleeper>
+     .canceller> ()-> .sleeper>
+                  TODO:  how define calling `Portal.cancel_actor()`
+
+    In this case a `ContextCancelled` with `.canceller` set to the
+    requesting actor, here 'canceller', should be relayed
+    to all other actors who have also opened a (remote task)
+    context with that now cancelled actor.
+
+    ------ - ------
+    task view
+    ------ - ------
+    So there are 5 contexts open in total with 3 from the root to
+    its children and 2 from children to their peers:
+    1. root> ()-> .sleeper>
+    2. root> ()-> .streamer>
+    3. root> ()-> .canceller>
+
+    4. .streamer> ()<=> .sleeper>
+    5. .canceller> ()-> .sleeper>
+        - calls `Portal.cancel_actor()`
+
+    '''
+    async def main():
+        async with tractor.open_nursery(
+            # NOTE: to halt the peer tasks on ctxc, uncomment this.
+            # debug_mode=True
+        ) as an:
+            canceller: Portal = await an.start_actor(
+                'canceller',
+                enable_modules=[__name__],
+            )
+            sleeper: Portal = await an.start_actor(
+                'sleeper',
+                enable_modules=[__name__],
+            )
+            just_caller: Portal = await an.start_actor(
+                'just_caller',  # but i just met her?
+                enable_modules=[__name__],
+            )
+
+            root = tractor.current_actor()
+
+            try:
+                async with (
+                    sleeper.open_context(
+                        sleep_forever,
+                        expect_ctxc=True,
+                    ) as (sleeper_ctx, sent),
+
+                    just_caller.open_context(
+                        stream_from_peer,
+                    ) as (caller_ctx, sent),
+
+                    canceller.open_context(
+                        sleep_a_bit_then_cancel_peer,
+                    ) as (canceller_ctx, sent),
+
+                ):
+                    ctxs: list[Context] = [
+                        sleeper_ctx,
+                        caller_ctx,
+                        canceller_ctx,
+                    ]
+
+                    try:
+                        print('PRE CONTEXT RESULT')
+                        await sleeper_ctx.result()
+
+                        # should never get here
+                        pytest.fail(
+                            'Context.result() did not raise ctx-cancelled?'
+                        )
+
+                    # should always raise since this root task does
+                    # not request the sleeper cancellation ;)
+                    except ContextCancelled as ctxerr:
+                        print(f'CAUGHT REMOTE CONTEXT CANCEL {ctxerr}')
+
+                        # canceller and caller peers should not
+                        # have been remotely cancelled.
+                        assert canceller_ctx.canceller is None
+                        assert caller_ctx.canceller is None
+
+                        assert ctxerr.canceller[0] == 'canceller'
+
+                        # XXX NOTE XXX: since THIS `ContextCancelled`
+                        # HAS NOT YET bubbled up to the
+                        # `sleeper.open_context().__aexit__()` this
+                        # value is not yet set, however outside this
+                        # block it should be.
+                        assert not sleeper_ctx.cancelled_caught
+
+                        if error_during_ctxerr_handling:
+                            raise RuntimeError('Simulated error during teardown')
+
+                        raise
+
+                    # XXX SHOULD NEVER EVER GET HERE XXX
+                    except BaseException as berr:
+                        err = berr
+                        pytest.fail('did not rx ctx-cancelled error?')
+                    else:
+                        pytest.fail('did not rx ctx-cancelled error?')
+
+            except (
+                ContextCancelled,
+                RuntimeError,
+            )as ctxerr:
+                _err = ctxerr
+
+                # NOTE: the main state to check on `Context` is:
+                # - `.cancelled_caught` (maps to nursery cs)
+                # - `.cancel_called` (bool of whether this side
+                #    requested)
+                # - `.canceller` (uid of cancel-causing actor-task)
+                # - `._remote_error` (any `RemoteActorError`
+                #    instance from other side of context)
+                # TODO: are we really planning to use this tho?
+                # - `._cancel_msg` (any msg that caused the
+                #    cancel)
+
+                # CASE: error raised during handling of
+                # `ContextCancelled` inside `.open_context()`
+                # block
+                if error_during_ctxerr_handling:
+                    assert isinstance(ctxerr, RuntimeError)
+
+                    # NOTE: this root actor task should have
+                    # called `Context.cancel()` on the
+                    # `.__aexit__()` to every opened ctx.
+                    for ctx in ctxs:
+                        assert ctx.cancel_called
+
+                        # this root actor task should have
+                        # cancelled all opened contexts except the
+                        # sleeper which is obvi by the "canceller"
+                        # peer.
+                        re = ctx._remote_error
+                        if (
+                            ctx is sleeper_ctx
+                            or ctx is caller_ctx
+                        ):
+                            assert (
+                                re.canceller
+                                ==
+                                ctx.canceller
+                                ==
+                                canceller.channel.uid
+                            )
+
+                        else:
+                            assert (
+                                re.canceller
+                                ==
+                                ctx.canceller
+                                ==
+                                root.uid
+                            )
+
+                # CASE: standard teardown inside the `.open_context()` block
+                else:
+                    assert ctxerr.canceller == sleeper_ctx.canceller
+                    assert (
+                        ctxerr.canceller[0]
+                        ==
+                        sleeper_ctx.canceller[0]
+                        ==
+                        'canceller'
+                    )
+
+                    # the sleeper's remote error is the error bubbled
+                    # out of the context-stack above!
+                    re = sleeper_ctx._remote_error
+                    assert re is ctxerr
+
+                    for ctx in ctxs:
+                        re: BaseException | None = ctx._remote_error
+                        assert re
+
+                        # root doesn't cancel sleeper since it's
+                        # cancelled by its peer.
+                        if ctx is sleeper_ctx:
+                            assert not ctx.cancel_called
+                            # since sleeper_ctx.result() IS called
+                            # above we should have (silently)
+                            # absorbed the corresponding
+                            # `ContextCancelled` for it and thus
+                            # the logic inside `.cancelled_caught`
+                            # should trigger!
+                            assert ctx.cancelled_caught
+
+                        elif ctx is caller_ctx:
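+                            # since its context was remotely
+                            # cancelled, we never needed to
+                            # call `Context.cancel()` bc it was
+                            # done by the peer..
+                            # NOTE(review): the assert just below
+                            # expects `.cancel_called` to be set,
+                            # which reads as the opposite; confirm
+                            # which behaviour is intended.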
+                            assert ctx.cancel_called
+
+                            # TODO: figure out the details of
+                            # this..
+                            # if you look at the `._local_error` here
+                            # it is a multi of ctxc + 2 Cancelleds?
+                            # assert not ctx.cancelled_caught
+
+                        else:
+                            assert ctx.cancel_called
+                            assert not ctx.cancelled_caught
+
+                        # TODO: do we even need this flag?
+                        # -> each context should have received
+                        # a silently absorbed context cancellation
+                        # in its remote nursery scope.
+                        # assert ctx.chan.uid == ctx.canceller
+
+                    # NOTE: when an inter-peer cancellation
+                    # occurred, we DO NOT expect this
+                    # root-actor-task to have requested a cancel of
+                    # the context since cancellation was caused by
+                    # the "canceller" peer and thus
+                    # `Context.cancel()` SHOULD NOT have been
+                    # called inside
+                    # `Portal.open_context().__aexit__()`.
+                    assert not sleeper_ctx.cancel_called
+
+                # XXX NOTE XXX: and see matching comment above but,
+                # this flag is set only AFTER the `.open_context()`
+                # has exited and should be set in both outcomes
+                # including the case where ctx-cancel handling
+                # itself errors.
+                assert sleeper_ctx.cancelled_caught
+
+                raise  # always to ensure teardown
+
+    if error_during_ctxerr_handling:
+        with pytest.raises(RuntimeError) as excinfo:
+            trio.run(main)
+    else:
+
+        with pytest.raises(ContextCancelled) as excinfo:
+            trio.run(main)
+
+        assert excinfo.value.type == ContextCancelled
+        assert excinfo.value.canceller[0] == 'canceller'
--- a/tests/test_legacy_one_way_streaming.py
+++ b/tests/test_legacy_one_way_streaming.py
@ -55,7 +55,7 @@ async def context_stream(


 async def stream_from_single_subactor(
-    arb_addr,
+    reg_addr,
    start_method,
    stream_func,
 ):
@ -64,7 +64,7 @@ async def stream_from_single_subactor(
    # only one per host address, spawns an actor if None

    async with tractor.open_nursery(
-        arbiter_addr=arb_addr,
+        registry_addrs=[reg_addr],
        start_method=start_method,
    ) as nursery:

@ -115,13 +115,13 @@ async def stream_from_single_subactor(
@pytest.mark.parametrize(
    'stream_func', [async_gen_stream, context_stream]
 )
-def test_stream_from_single_subactor(arb_addr, start_method, stream_func):
+def test_stream_from_single_subactor(reg_addr, start_method, stream_func):
    """Verify streaming from a spawned async generator.
    """
    trio.run(
        partial(
            stream_from_single_subactor,
-            arb_addr,
+            reg_addr,
            start_method,
            stream_func=stream_func,
        ),
@ -225,14 +225,14 @@ async def a_quadruple_example():
        return result_stream


-async def cancel_after(wait, arb_addr):
-    async with tractor.open_root_actor(arbiter_addr=arb_addr):
+async def cancel_after(wait, reg_addr):
+    async with tractor.open_root_actor(registry_addrs=[reg_addr]):
        with trio.move_on_after(wait):
            return await a_quadruple_example()


@pytest.fixture(scope='module')
-def time_quad_ex(arb_addr, ci_env, spawn_backend):
+def time_quad_ex(reg_addr, ci_env, spawn_backend):
    if spawn_backend == 'mp':
        """no idea but the  mp *nix runs are flaking out here often...
        """
@ -240,7 +240,7 @@ def time_quad_ex(arb_addr, ci_env, spawn_backend):

    timeout = 7 if platform.system() in ('Windows', 'Darwin') else 4
    start = time.time()
-    results = trio.run(cancel_after, timeout, arb_addr)
+    results = trio.run(cancel_after, timeout, reg_addr)
    diff = time.time() - start
    assert results
    return results, diff
@ -260,14 +260,14 @@ def test_a_quadruple_example(time_quad_ex, ci_env, spawn_backend):
    list(map(lambda i: i/10, range(3, 9)))
 )
 def test_not_fast_enough_quad(
-    arb_addr, time_quad_ex, cancel_delay, ci_env, spawn_backend
+    reg_addr, time_quad_ex, cancel_delay, ci_env, spawn_backend
 ):
    """Verify we can cancel midway through the quad example and all actors
    cancel gracefully.
    """
    results, diff = time_quad_ex
    delay = max(diff - cancel_delay, 0)
-    results = trio.run(cancel_after, delay, arb_addr)
+    results = trio.run(cancel_after, delay, reg_addr)
    system = platform.system()
    if system in ('Windows', 'Darwin') and results is not None:
        # In CI environments it seems later runs are quicker than the first
@ -280,7 +280,7 @@ def test_not_fast_enough_quad(

@tractor_test
 async def test_respawn_consumer_task(
-    arb_addr,
+    reg_addr,
    spawn_backend,
    loglevel,
 ):
--- a/tests/test_local.py
+++ b/tests/test_local.py
@ -24,7 +24,7 @@ async def test_no_runtime():


@tractor_test
-async def test_self_is_registered(arb_addr):
+async def test_self_is_registered(reg_addr):
    "Verify waiting on the arbiter to register itself using the standard api."
    actor = tractor.current_actor()
    assert actor.is_arbiter
@ -34,20 +34,20 @@ async def test_self_is_registered(arb_addr):


@tractor_test
-async def test_self_is_registered_localportal(arb_addr):
+async def test_self_is_registered_localportal(reg_addr):
    "Verify waiting on the arbiter to register itself using a local portal."
    actor = tractor.current_actor()
    assert actor.is_arbiter
-    async with tractor.get_arbiter(*arb_addr) as portal:
+    async with tractor.get_arbiter(*reg_addr) as portal:
        assert isinstance(portal, tractor._portal.LocalPortal)

        with trio.fail_after(0.2):
            sockaddr = await portal.run_from_ns(
                    'self', 'wait_for_actor', name='root')
-            assert sockaddr[0] == arb_addr
+            assert sockaddr[0] == reg_addr


-def test_local_actor_async_func(arb_addr):
+def test_local_actor_async_func(reg_addr):
    """Verify a simple async function in-process.
    """
    nums = []
@ -55,7 +55,7 @@ def test_local_actor_async_func(arb_addr):
    async def print_loop():

        async with tractor.open_root_actor(
-            arbiter_addr=arb_addr,
+            registry_addrs=[reg_addr],
        ):
            # arbiter is started in-proc if dne
            assert tractor.current_actor().is_arbiter
--- a/tests/test_multi_program.py
+++ b/tests/test_multi_program.py
@ -28,9 +28,9 @@ def test_abort_on_sigint(daemon):


@tractor_test
-async def test_cancel_remote_arbiter(daemon, arb_addr):
+async def test_cancel_remote_arbiter(daemon, reg_addr):
    assert not tractor.current_actor().is_arbiter
-    async with tractor.get_arbiter(*arb_addr) as portal:
+    async with tractor.get_arbiter(*reg_addr) as portal:
        await portal.cancel_actor()

    time.sleep(0.1)
@ -39,16 +39,16 @@ async def test_cancel_remote_arbiter(daemon, arb_addr):

    # no arbiter socket should exist
    with pytest.raises(OSError):
-        async with tractor.get_arbiter(*arb_addr) as portal:
+        async with tractor.get_arbiter(*reg_addr) as portal:
            pass


-def test_register_duplicate_name(daemon, arb_addr):
+def test_register_duplicate_name(daemon, reg_addr):

    async def main():

        async with tractor.open_nursery(
-            arbiter_addr=arb_addr,
+            registry_addrs=[reg_addr],
        ) as n:

            assert not tractor.current_actor().is_arbiter
--- a/tests/test_pubsub.py
+++ b/tests/test_pubsub.py
@ -160,7 +160,7 @@ async def test_required_args(callwith_expecterror):
 )
 def test_multi_actor_subs_arbiter_pub(
    loglevel,
-    arb_addr,
+    reg_addr,
    pub_actor,
 ):
    """Try out the neato @pub decorator system.
@ -170,7 +170,7 @@ def test_multi_actor_subs_arbiter_pub(
    async def main():

        async with tractor.open_nursery(
-            arbiter_addr=arb_addr,
+            registry_addrs=[reg_addr],
            enable_modules=[__name__],
        ) as n:

@ -255,12 +255,12 @@ def test_multi_actor_subs_arbiter_pub(

 def test_single_subactor_pub_multitask_subs(
    loglevel,
-    arb_addr,
+    reg_addr,
 ):
    async def main():

        async with tractor.open_nursery(
-            arbiter_addr=arb_addr,
+            registry_addrs=[reg_addr],
            enable_modules=[__name__],
        ) as n:

--- a/tests/test_resource_cache.py
+++ b/tests/test_resource_cache.py
@ -34,7 +34,6 @@ def test_resource_only_entered_once(key_on):
    global _resource
    _resource = 0

-    kwargs = {}
    key = None
    if key_on == 'key_value':
        key = 'some_common_key'
@ -139,7 +138,7 @@ def test_open_local_sub_to_stream():
    N local tasks using ``trionics.maybe_open_context():``.

    '''
-    timeout = 3 if platform.system() != "Windows" else 10
+    timeout: float = 3.6 if platform.system() != "Windows" else 10

    async def main():

--- a/tests/test_rpc.py
+++ b/tests/test_rpc.py
@ -13,9 +13,19 @@ async def sleep_back_actor(
    func_name,
    func_defined,
    exposed_mods,
+    *,
+    reg_addr: tuple,
 ):
    if actor_name:
-        async with tractor.find_actor(actor_name) as portal:
+        async with tractor.find_actor(
+            actor_name,
+            # NOTE: must be set manually since
+            # the subactor doesn't have the reg_addr
+            # fixture code run in it!
+            # TODO: maybe we should just set this once in the
+            # _state mod and derive to all children?
+            registry_addrs=[reg_addr],
+        ) as portal:
            try:
                await portal.run(__name__, func_name)
            except tractor.RemoteActorError as err:
@ -45,11 +55,17 @@ async def short_sleep():
    ids=['no_mods', 'this_mod', 'this_mod_bad_func', 'fail_to_import',
         'fail_on_syntax'],
 )
-def test_rpc_errors(arb_addr, to_call, testdir):
-    """Test errors when making various RPC requests to an actor
+def test_rpc_errors(
+    reg_addr,
+    to_call,
+    testdir,
+):
+    '''
+    Test errors when making various RPC requests to an actor
    that either doesn't have the requested module exposed or doesn't define
    the named function.
-    """
+
+    '''
    exposed_mods, funcname, inside_err = to_call
    subactor_exposed_mods = []
    func_defined = globals().get(funcname, False)
@ -77,8 +93,13 @@ def test_rpc_errors(arb_addr, to_call, testdir):

        # spawn a subactor which calls us back
        async with tractor.open_nursery(
-            arbiter_addr=arb_addr,
+            registry_addrs=[reg_addr],
            enable_modules=exposed_mods.copy(),
+
+            # NOTE: will halt test in REPL if uncommented, so only
+            # do that if actually debugging subactor but keep it
+            # disabled for the test.
+            # debug_mode=True,
        ) as n:

            actor = tractor.current_actor()
@ -95,6 +116,7 @@ def test_rpc_errors(arb_addr, to_call, testdir):
                exposed_mods=exposed_mods,
                func_defined=True if func_defined else False,
                enable_modules=subactor_exposed_mods,
+                reg_addr=reg_addr,
            )

    def run():
--- a/tests/test_shm.py
+++ b/tests/test_shm.py
@ -0,0 +1,167 @@
+"""
+Shared mem primitives and APIs.
+
+"""
+import uuid
+
+# import numpy
+import pytest
+import trio
+import tractor
+from tractor._shm import (
+    open_shm_list,
+    attach_shm_list,
+)
+
+
+@tractor.context
+async def child_attach_shml_alot(
+    ctx: tractor.Context,
+    shm_key: str,
+) -> None:
+    '''
+    Child-side context endpoint which repeatedly attaches to the shm
+    list keyed by `shm_key`.
+
+    Sends `shm_key` back to the opener via `ctx.started()` and then
+    calls `attach_shm_list()` for that key 1000 times, asserting on
+    each pass that the attached segment's `.shm.name` equals the key.
+
+    '''
+
+    await ctx.started(shm_key)
+
+    # now try to attach a boatload of times in a loop..
+    for _ in range(1000):
+        shml = attach_shm_list(
+            key=shm_key,
+            readonly=False,
+        )
+        assert shml.shm.name == shm_key
+        # short sleep between successive attach attempts
+        await trio.sleep(0.001)
+
+
+def test_child_attaches_alot():
+    '''
+    Open a shm list in the parent under a unique (uuid4 based) key,
+    spawn a 'shm_attacher' subactor and run
+    `child_attach_shml_alot()` in it via `Portal.open_context()`.
+
+    Checks that the child's started value is the same key and then
+    waits on the context result before cancelling the subactor.
+
+    '''
+    async def main():
+        async with tractor.open_nursery() as an:
+
+            # allocate writeable list in parent
+            key = f'shml_{uuid.uuid4()}'
+            shml = open_shm_list(
+                key=key,
+            )
+
+            # this module must be enabled in the child so that the
+            # context endpoint defined here can be invoked there.
+            portal = await an.start_actor(
+                'shm_attacher',
+                enable_modules=[__name__],
+            )
+
+            async with (
+                portal.open_context(
+                    child_attach_shml_alot,
+                    shm_key=shml.key,
+                ) as (ctx, start_val),
+            ):
+                # the child echoes the key it was handed
+                assert start_val == key
+                # wait for the child's attach loop to finish
+                await ctx.result()
+
+            await portal.cancel_actor()
+
+    trio.run(main)
+
+
+@tractor.context
+async def child_read_shm_list(
+    ctx: tractor.Context,
+    shm_key: str,
+    use_str: bool,
+    frame_size: int,
+) -> None:
+    '''
+    Child-side context endpoint which reads from a shm list as
+    indices arrive over the context's stream.
+
+    Attaches to the list keyed by `shm_key`, sends `shml.key` as the
+    started value and then, for every index `i` received:
+
+    - when `frame_size == 1`, reads `shml[i]` and asserts it equals
+      `float(i)` (or `str(float(i))` when `use_str` is set),
+    - otherwise reads the slice `shml[i - frame_size:i]` and only
+      prints it (no assertion is made on frame content).
+
+    '''
+
+    # attach in child
+    shml = attach_shm_list(
+        key=shm_key,
+        # dtype=str if use_str else float,
+    )
+    await ctx.started(shml.key)
+
+    async with ctx.open_stream() as stream:
+        async for i in stream:
+            print(f'(child): reading shm list index: {i}')
+
+            # value the writer side is expected to have stored at `i`
+            if use_str:
+                expect = str(float(i))
+            else:
+                expect = float(i)
+
+            if frame_size == 1:
+                val = shml[i]
+                assert expect == val
+                print(f'(child): reading value: {val}')
+            else:
+                # NOTE(review): when `i < frame_size` the slice start
+                # is negative; confirm the intended frame bounds.
+                frame = shml[i - frame_size:i]
+                print(f'(child): reading frame: {frame}')
+
+
+@pytest.mark.parametrize(
+    'use_str',
+    [False, True],
+    ids=lambda i: f'use_str_values={i}',
+)
+@pytest.mark.parametrize(
+    'frame_size',
+    [1, 2**6, 2**10],
+    ids=lambda i: f'frame_size={i}',
+)
+def test_parent_writer_child_reader(
+    use_str: bool,
+    frame_size: int,
+):
+    '''
+    Parent writes a sequence into a shm list while a 'shm_reader'
+    subactor reads it back via `child_read_shm_list()`.
+
+    The parent stores `float(i)` (or its `str()` when `use_str` is
+    set) at every index of a `2 * 2**10` sized list and sends the
+    index `i` over the stream whenever `i % frame_size == 0`, plus
+    once more for the final index after the write loop completes.
+
+    '''
+
+    async def main():
+        async with tractor.open_nursery(
+            # debug_mode=True,
+        ) as an:
+
+            # NOTE(review): `debug_mode` is enabled for the subactor
+            # only while the nursery-level flag above is commented
+            # out; confirm this is intended for non-interactive runs.
+            portal = await an.start_actor(
+                'shm_reader',
+                enable_modules=[__name__],
+                debug_mode=True,
+            )
+
+            # allocate writeable list in parent
+            # NOTE(review): the same literal key is used for every
+            # parametrization of this test; verify that is safe.
+            key = 'shm_list'
+            seq_size = int(2 * 2 ** 10)
+            shml = open_shm_list(
+                key=key,
+                size=seq_size,
+                dtype=str if use_str else float,
+                readonly=False,
+            )
+
+            async with (
+                portal.open_context(
+                    child_read_shm_list,
+                    shm_key=key,
+                    use_str=use_str,
+                    frame_size=frame_size,
+                ) as (ctx, sent),
+
+                ctx.open_stream() as stream,
+            ):
+
+                # the child sends back the key it attached to
+                assert sent == key
+
+                for i in range(seq_size):
+
+                    val = float(i)
+                    if use_str:
+                        val = str(val)
+
+                    # print(f'(parent): writing {val}')
+                    shml[i] = val
+
+                    # only on frame fills do we
+                    # signal to the child that a frame's
+                    # worth is ready.
+                    if (i % frame_size) == 0:
+                        print(f'(parent): signalling frame full on {val}')
+                        await stream.send(i)
+                else:
+                    # the loop has no `break` so this always runs once
+                    # after the last write, sending the final index.
+                    print(f'(parent): signalling final frame on {val}')
+                    await stream.send(i)
+
+            await portal.cancel_actor()
+
+    trio.run(main)
--- a/tests/test_spawning.py
+++ b/tests/test_spawning.py
@ -16,14 +16,14 @@ data_to_pass_down = {'doggy': 10, 'kitty': 4}
 async def spawn(
    is_arbiter: bool,
    data: dict,
-    arb_addr: tuple[str, int],
+    reg_addr: tuple[str, int],
 ):
    namespaces = [__name__]

    await trio.sleep(0.1)

    async with tractor.open_root_actor(
-        arbiter_addr=arb_addr,
+        arbiter_addr=reg_addr,
    ):

        actor = tractor.current_actor()
@ -32,8 +32,7 @@ async def spawn(

        if actor.is_arbiter:

-            async with tractor.open_nursery(
-            ) as nursery:
+            async with tractor.open_nursery() as nursery:

                # forks here
                portal = await nursery.run_in_actor(
@ -41,7 +40,7 @@ async def spawn(
                    is_arbiter=False,
                    name='sub-actor',
                    data=data,
-                    arb_addr=arb_addr,
+                    reg_addr=reg_addr,
                    enable_modules=namespaces,
                )

@ -55,12 +54,14 @@ async def spawn(
            return 10


-def test_local_arbiter_subactor_global_state(arb_addr):
+def test_local_arbiter_subactor_global_state(
+    reg_addr,
+):
    result = trio.run(
        spawn,
        True,
        data_to_pass_down,
-        arb_addr,
+        reg_addr,
    )
    assert result == 10

@ -140,7 +141,7 @@ async def check_loglevel(level):
 def test_loglevel_propagated_to_subactor(
    start_method,
    capfd,
-    arb_addr,
+    reg_addr,
 ):
    if start_method == 'mp_forkserver':
        pytest.skip(
@ -152,7 +153,7 @@ def test_loglevel_propagated_to_subactor(
        async with tractor.open_nursery(
            name='arbiter',
            start_method=start_method,
-            arbiter_addr=arb_addr,
+            arbiter_addr=reg_addr,

        ) as tn:
            await tn.run_in_actor(
--- a/tests/test_task_broadcasting.py
+++ b/tests/test_task_broadcasting.py
@ -66,13 +66,13 @@ async def ensure_sequence(
 async def open_sequence_streamer(

    sequence: list[int],
-    arb_addr: tuple[str, int],
+    reg_addr: tuple[str, int],
    start_method: str,

 ) -> tractor.MsgStream:

    async with tractor.open_nursery(
-        arbiter_addr=arb_addr,
+        arbiter_addr=reg_addr,
        start_method=start_method,
    ) as tn:

@ -86,14 +86,14 @@ async def open_sequence_streamer(
        ) as (ctx, first):

            assert first is None
-            async with ctx.open_stream(backpressure=True) as stream:
+            async with ctx.open_stream(allow_overruns=True) as stream:
                yield stream

        await portal.cancel_actor()


 def test_stream_fan_out_to_local_subscriptions(
-    arb_addr,
+    reg_addr,
    start_method,
 ):

@ -103,7 +103,7 @@ def test_stream_fan_out_to_local_subscriptions(

        async with open_sequence_streamer(
            sequence,
-            arb_addr,
+            reg_addr,
            start_method,
        ) as stream:

@ -138,7 +138,7 @@ def test_stream_fan_out_to_local_subscriptions(
    ]
 )
 def test_consumer_and_parent_maybe_lag(
-    arb_addr,
+    reg_addr,
    start_method,
    task_delays,
 ):
@ -150,7 +150,7 @@ def test_consumer_and_parent_maybe_lag(

        async with open_sequence_streamer(
            sequence,
-            arb_addr,
+            reg_addr,
            start_method,
        ) as stream:

@ -211,7 +211,7 @@ def test_consumer_and_parent_maybe_lag(


 def test_faster_task_to_recv_is_cancelled_by_slower(
-    arb_addr,
+    reg_addr,
    start_method,
 ):
    '''
@ -225,7 +225,7 @@ def test_faster_task_to_recv_is_cancelled_by_slower(

        async with open_sequence_streamer(
            sequence,
-            arb_addr,
+            reg_addr,
            start_method,

        ) as stream:
@ -302,7 +302,7 @@ def test_subscribe_errors_after_close():


 def test_ensure_slow_consumers_lag_out(
-    arb_addr,
+    reg_addr,
    start_method,
 ):
    '''This is a pure local task test; no tractor
@ -413,8 +413,8 @@ def test_ensure_slow_consumers_lag_out(
                    seq = brx._state.subs[brx.key]
                    assert seq == len(brx._state.queue) - 1

-                # all backpressured entries in the underlying
-                # channel should have been copied into the caster
+                # all no_overruns entries in the underlying
+                # channel should have been copied into the bcaster
                # queue trailing-window
                async for i in rx:
                    print(f'bped: {i}')
--- a/tractor/__init__.py
+++ b/tractor/__init__.py
@ -15,72 +15,52 @@
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.

 """
-tractor: structured concurrent "actors".
+tractor: structured concurrent ``trio``-"actors".

 """
-from exceptiongroup import BaseExceptionGroup
+from exceptiongroup import BaseExceptionGroup as BaseExceptionGroup

-from ._clustering import open_actor_cluster
-from ._ipc import Channel
+from ._clustering import (
+    open_actor_cluster as open_actor_cluster,
+)
+from ._context import (
+    Context as Context,  # the type
+    context as context,  # a func-decorator
+)
 from ._streaming import (
-    Context,
-    MsgStream,
-    stream,
-    context,
+    MsgStream as MsgStream,
+    stream as stream,
 )
 from ._discovery import (
-    get_arbiter,
-    find_actor,
-    wait_for_actor,
-    query_actor,
+    get_arbiter as get_arbiter,
+    find_actor as find_actor,
+    wait_for_actor as wait_for_actor,
+    query_actor as query_actor,
+)
+from ._supervise import (
+    open_nursery as open_nursery,
+    ActorNursery as ActorNursery,
 )
-from ._supervise import open_nursery
 from ._state import (
-    current_actor,
-    is_root_process,
+    current_actor as current_actor,
+    is_root_process as is_root_process,
 )
 from ._exceptions import (
-    RemoteActorError,
-    ModuleNotExposed,
-    ContextCancelled,
+    RemoteActorError as RemoteActorError,
+    ModuleNotExposed as ModuleNotExposed,
+    ContextCancelled as ContextCancelled,
 )
-from ._debug import (
-    breakpoint,
-    post_mortem,
+from .devx import (
+    breakpoint as breakpoint,
+    pause as pause,
+    pause_from_sync as pause_from_sync,
+    post_mortem as post_mortem,
 )
-from . import msg
+from . import msg as msg
 from ._root import (
-    run_daemon,
-    open_root_actor,
+    run_daemon as run_daemon,
+    open_root_actor as open_root_actor,
 )
-from ._portal import Portal
-from ._runtime import Actor
-
-
-__all__ = [
-    'Actor',
-    'Channel',
-    'Context',
-    'ContextCancelled',
-    'ModuleNotExposed',
-    'MsgStream',
-    'BaseExceptionGroup',
-    'Portal',
-    'RemoteActorError',
-    'breakpoint',
-    'context',
-    'current_actor',
-    'find_actor',
-    'get_arbiter',
-    'is_root_process',
-    'msg',
-    'open_actor_cluster',
-    'open_nursery',
-    'open_root_actor',
-    'post_mortem',
-    'query_actor',
-    'run_daemon',
-    'stream',
-    'to_asyncio',
-    'wait_for_actor',
-]
+from ._ipc import Channel as Channel
+from ._portal import Portal as Portal
+from ._runtime import Actor as Actor
--- a/tractor/_child.py
+++ b/tractor/_child.py
@ -18,8 +18,6 @@
 This is the "bootloader" for actors started using the native trio backend.

 """
-import sys
-import trio
 import argparse

 from ast import literal_eval
@ -37,8 +35,6 @@ def parse_ipaddr(arg):
    return (str(host), int(port))


-from ._entry import _trio_main
-
 if __name__ == "__main__":

    parser = argparse.ArgumentParser()
--- a/tractor/_context.py
+++ b/tractor/_context.py
@ -0,0 +1,990 @@
+# tractor: structured concurrent "actors".
+# Copyright 2018-eternity Tyler Goodlet.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+'''
+The fundamental cross process SC abstraction: an inter-actor,
+cancel-scope linked task "context".
+
+A ``Context`` is very similar to the ``trio.Nursery.cancel_scope`` built
+into each ``trio.Nursery`` except it links the lifetimes of memory space
+disjoint, parallel executing tasks in separate actors.
+
+'''
+from __future__ import annotations
+from collections import deque
+from contextlib import asynccontextmanager as acm
+from dataclasses import (
+    dataclass,
+    field,
+)
+from functools import partial
+import inspect
+from pprint import pformat
+from typing import (
+    Any,
+    Callable,
+    AsyncGenerator,
+    TYPE_CHECKING,
+)
+import warnings
+
+import trio
+
+from ._exceptions import (
+    # _raise_from_no_key_in_msg,
+    unpack_error,
+    pack_error,
+    ContextCancelled,
+    # MessagingError,
+    StreamOverrun,
+)
+from .log import get_logger
+from ._ipc import Channel
+from ._streaming import MsgStream
+from ._state import current_actor
+
+if TYPE_CHECKING:
+    from ._portal import Portal
+    from ._runtime import Actor
+
+
+log = get_logger(__name__)
+
+
+# TODO: make this a msgspec.Struct!
+@dataclass
+class Context:
+    '''
+    An inter-actor, SC transitive, `trio.Task` communication context.
+
+    NB: This class should **never be instantiated directly**, it is allocated
+    by the runtime in 2 ways:
+     - by entering ``Portal.open_context()`` which is the primary
+       public API for any "caller" task or,
+     - by the RPC machinery's `._runtime._invoke()` as a `ctx` arg
+       to a remotely scheduled "callee" function.
+
+    AND is always constructed using the below ``mk_context()``.
+
+    Allows maintaining task or protocol specific state between
+    2 cancel-scope-linked, communicating and parallel executing
+    `trio.Task`s. Contexts are allocated on each side of any task
+    RPC-linked msg dialog, i.e. for every request to a remote
+    actor from a `Portal`. On the "callee" side a context is
+    always allocated inside ``._runtime._invoke()``.
+
+    # TODO: more detailed writeup on cancellation, error and
+    # streaming semantics..
+
+    A context can be cancelled and (possibly eventually restarted) from
+    either side of the underlying IPC channel, it can also open task
+    oriented message streams,  and acts more or less as an IPC aware
+    inter-actor-task ``trio.CancelScope``.
+
+    '''
+    chan: Channel
+    cid: str  # "context id", more or less a unique linked-task-pair id
+
+    # the "feeder" channels for delivering message values to the
+    # local task from the runtime's msg processing loop.
+    _recv_chan: trio.MemoryReceiveChannel
+    _send_chan: trio.MemorySendChannel
+
+    # the "invocation type" of the far end task-entry-point
+    # function, normally matching a logic block inside
+    # `._runtime._invoke()`.
+    _remote_func_type: str | None = None
+
+    # NOTE: (for now) only set (a portal) on the caller side since
+    # the callee doesn't generally need a ref to one and should
+    # normally need to explicitly ask for handle to its peer if
+    # more than the `Context` is needed?
+    _portal: Portal | None = None
+
+    # NOTE: each side of the context has its own cancel scope
+    # which is exactly the primitive that allows for
+    # cross-actor-task-supervision and thus SC.
+    _scope: trio.CancelScope | None = None
+
+    # on a clean exit there should be a final value
+    # delivered from the far end "callee" task, so
+    # this value is only set on one side.
+    _result: Any | int = None
+
+    # if the local "caller"  task errors this
+    # value is always set to the error that was
+    # captured in the `Portal.open_context().__aexit__()`
+    # teardown.
+    _local_error: BaseException | None = None
+
+    # if the either side gets an error from the other
+    # this value is set to that error unpacked from an
+    # IPC msg.
+    _remote_error: BaseException | None = None
+
+    # only set if the local task called `.cancel()`
+    _cancel_called: bool = False  # did WE cancel the far end?
+
+    # TODO: do we even need this? we can assume that if we're
+    # cancelled that the other side is as well, so maybe we should
+    # instead just have a `.canceller` pulled from the
+    # `ContextCancelled`?
+    _canceller: tuple[str, str] | None = None
+
+    # NOTE: we try to ensure assignment of a "cancel msg" since
+    # there's always going to be an "underlying reason" that any
+    # context was closed due to either a remote side error or
+    # a call to `.cancel()` which triggers `ContextCancelled`.
+    _cancel_msg: str | dict | None = None
+
+    # NOTE: this state var used by the runtime to determine if the
+    # `pdbp` REPL is allowed to engage on contexts terminated via
+    # a `ContextCancelled` due to a call to `.cancel()` triggering
+    # "graceful closure" on either side:
+    # - `._runtime._invoke()` will check this flag before engaging
+    #   the crash handler REPL in such cases where the "callee"
+    #   raises the cancellation,
+    # - `.devx._debug.lock_tty_for_child()` will set it to `False` if
+    #   the global tty-lock has been configured to filter out some
+    #   actors from being able to acquire the debugger lock.
+    _enter_debugger_on_cancel: bool = True
+
+    @property
+    def cancel_called(self) -> bool:
+        '''
+        Whether cancellation of this context has been requested from
+        this side, i.e. the current value of the ``._cancel_called``
+        flag which ``.cancel()`` sets (explicitly, or implicitly when
+        an error is caught inside the ``Portal.open_context()``
+        block).
+
+        '''
+        requested: bool = self._cancel_called
+        return requested
+
+    @property
+    def canceller(self) -> tuple[str, str] | None:
+        '''
+        The ``Actor.uid: tuple[str, str]`` recorded as the source of
+        this (side of the) context's cancellation: the (remote)
+        actor-process whose task was cancelled, or ``None`` when no
+        canceller has been recorded.
+
+        '''
+        uid: tuple[str, str] | None = self._canceller
+        return uid
+
+    @property
+    def cancelled_caught(self) -> bool:
+        '''
+        Whether this side of the context ended up cancelled.
+
+        ``True`` when either,
+        - the local ``._scope`` caught a cancellation, or
+        - the locally recorded error is a ``ContextCancelled`` which
+          either names the same canceller as this context or is the
+          very error object received from the remote side.
+
+        Always returns a real ``bool``; a context whose ``._scope``
+        has not been assigned yet is reported as not scope-cancelled
+        instead of raising ``AttributeError``.
+
+        '''
+        # the local scope was cancelled either by remote error or
+        # self-request; `._scope` stays `None` until one is assigned.
+        scope: trio.CancelScope | None = self._scope
+        if scope is not None and scope.cancelled_caught:
+            return True
+
+        # the local scope was never cancelled and instead likely we
+        # received a remote side cancellation that was raised inside
+        # `.result()`
+        se: BaseException | None = self._local_error
+        return bool(
+            se
+            and isinstance(se, ContextCancelled)
+            and (
+                se.canceller == self.canceller
+                or se is self._remote_error
+            )
+        )
+
+    @property
+    def side(self) -> str:
+        '''
+        Name the end of the dialog this instance wraps: 'caller'
+        when a portal is set on it, otherwise 'callee'.
+
+        '''
+        if self._portal:
+            return 'caller'
+
+        return 'callee'
+
+    # init and streaming state
+    _started_called: bool = False
+    _stream_opened: bool = False
+
+    # overrun handling machinery
+    # NOTE: none of this provides "backpressure" to the remote
+    # task, only an ability to not lose messages when the local
+    # task is configured to NOT transmit ``StreamOverrun``s back
+    # to the other side.
+    _overflow_q: deque[dict] = field(
+        default_factory=partial(
+            deque,
+            maxlen=616,
+        )
+    )
+    _scope_nursery: trio.Nursery | None = None
+    _in_overrun: bool = False
+    _allow_overruns: bool = False
+
+    async def send_yield(
+        self,
+        data: Any,
+
+    ) -> None:
+        '''
+        Deprecated: send ``data`` to the far end as a 'yield' msg
+        tagged with this context's ``cid``.
+
+        Emits a ``DeprecationWarning`` (attributed to the calling
+        frame via ``stacklevel=2``) before sending.
+
+        '''
+        # NOTE: the stream type imported by this module is named
+        # `MsgStream`, so point users at that name.
+        warnings.warn(
+            "`Context.send_yield()` is now deprecated. "
+            "Use ``MsgStream.send()``. ",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        await self.chan.send({'yield': data, 'cid': self.cid})
+
+    async def send_stop(self) -> None:
+        '''
+        Send a 'stop' msg, tagged with this context's ``cid``, over
+        the underlying channel.
+
+        '''
+        send = self.chan.send
+        await send({'stop': True, 'cid': self.cid})
+
+    def _maybe_cancel_and_set_remote_error(
+        self,
+        error: BaseException,
+
+    ) -> None:
+        '''
+        (Maybe) cancel this local scope due to a received remote
+        error (normally via an IPC msg) which the actor runtime
+        routes to this context.
+
+        Acts as a form of "relay" for a remote error raised in the
+        corresponding remote task's `Context` wherein the next time
+        the local task executes a checkpoint, a `trio.Cancelled`
+        will be raised and depending on the type and source of the
+        original remote error, and whether or not the local task
+        called `.cancel()` itself prior, an equivalent
+        `ContextCancelled` or `RemoteActorError` wrapping the
+        remote error may be raised here by any of,
+
+        - `Portal.open_context()`
+        - `Portal.result()`
+        - `Context.open_stream()`
+        - `Context.result()`
+
+        when called/closed by actor local task(s).
+
+        State written by this method:
+          - `._remote_error` is always set to `error`,
+          - `._canceller` is set to `error.canceller` for
+            a `ContextCancelled`, otherwise to `self.chan.uid`,
+          - `._scope` is cancelled unless this side already called
+            `.cancel()` and `error` is a `ContextCancelled`, or
+            the scope is unset / already cancelled.
+
+        NOTEs & TODOs:
+          - It is expected that the caller has previously unwrapped
+            the remote error using a call to `unpack_error()` and
+            provides that output exception value as the input
+            `error` argument here.
+          - If this is an error message from a context opened by
+            `Portal.open_context()` we want to interrupt any
+            ongoing local tasks operating within that `Context`'s
+            cancel-scope so as to be notified ASAP of the remote
+            error and engage any caller handling (eg. for
+            cross-process task supervision).
+          - In some cases we may want to raise the remote error
+            immediately since there is no guarantee the locally
+            operating task(s) will attempt to execute a checkpoint
+            any time soon; in such cases there are 2 possible
+            approaches depending on the current task's work and
+            wrapping "thread" type:
+
+            - `trio`-native-and-graceful: only ever wait for tasks
+              to exec a next `trio.lowlevel.checkpoint()` assuming
+              that any such task must do so to interact with the
+              actor runtime and IPC interfaces.
+
+            - (NOT IMPLEMENTED) system-level-aggressive: maybe we
+              could eventually interrupt sync code (invoked using
+              `trio.to_thread` or some other adapter layer) with
+              a signal (a custom unix one for example?
+              https://stackoverflow.com/a/5744185) depending on the
+              task's wrapping thread-type such that long running
+              sync code should never cause the delay of actor
+              supervision tasks such as cancellation and respawn
+              logic.
+
+        '''
+        # XXX: currently this should only be used when
+        # `Portal.open_context()` has been opened since it's
+        # assumed that other portal APIs like,
+        #  - `Portal.run()`,
+        #  - `ActorNursery.run_in_actor()`
+        # do their own error checking at their own call points and
+        # result processing.
+
+        # XXX: set the remote side's error so that after we cancel
+        # whatever task is the opener of this context it can raise
+        # that error as the reason.
+        self._remote_error: BaseException = error
+
+        if (
+            isinstance(error, ContextCancelled)
+        ):
+            # always record the cancelling actor's uid since its cancellation
+            # state is linked and we want to know which process was
+            # the cause / requester of the cancellation.
+            self._canceller = error.canceller
+
+            log.cancel(
+                'Remote task-context was cancelled for '
+                f'actor: {self.chan.uid}\n'
+                f'task: {self.cid}\n'
+                f'canceller: {error.canceller}\n'
+            )
+
+            if self._cancel_called:
+                # from .devx._debug import breakpoint
+                # await breakpoint()
+
+                # this is an expected cancel request response message
+                # and we **don't need to raise it** in local cancel
+                # scope since it will potentially override a real error.
+                return
+
+        else:
+            log.error(
+                f'Remote context error,\n'
+                f'remote actor: {self.chan.uid}\n'
+                f'task: {self.cid}\n'
+                f'{error}'
+            )
+            # any non-cancel error is attributed to the peer actor
+            # at the other end of this context's channel.
+            self._canceller = self.chan.uid
+
+        # TODO: tempted to **not** do this by-reraising in a
+        # nursery and instead cancel a surrounding scope, detect
+        # the cancellation, then lookup the error that was set?
+        # YES! this is way better and simpler!
+        cs: trio.CancelScope = self._scope
+        # only cancel a scope which exists and has not already been
+        # cancelled (requested or caught).
+        if (
+            cs
+            and not cs.cancel_called
+            and not cs.cancelled_caught
+        ):
+
+            # TODO: we can for sure drop this right?
+            # from trio.testing import wait_all_tasks_blocked
+            # await wait_all_tasks_blocked()
+
+            # TODO: it'd sure be handy to inject our own
+            # `trio.Cancelled` subtype here ;)
+            # https://github.com/goodboy/tractor/issues/368
+            self._scope.cancel()
+
+            # NOTE: this REPL usage actually works here dawg! Bo
+            # from .devx._debug import pause
+            # await pause()
+
+    async def cancel(
+        self,
+        timeout: float = 0.616,
+
+    ) -> None:
+        '''
+        Cancel this inter-actor-task context.
+
+        Request that the far side cancel its current linked context,
+        timing out quickly in an attempt to sidestep 2-generals...
+
+        Always marks this side as "cancel called" and then,
+        - on the 'caller' side: sends a `_cancel_task` request for
+          this ``cid`` through the portal, inside a shielded
+          ``trio.move_on_after(timeout)`` block; a timeout is only
+          logged, never raised.
+        - on the 'callee' side: cancels the local ``._scope``.
+
+        ``timeout`` is the number of seconds to wait for the remote
+        cancel request to complete (caller side only).
+
+        '''
+        side: str = self.side
+        log.cancel(
+            f'Cancelling {side} side of context to {self.chan.uid}'
+        )
+        self._cancel_called: bool = True
+
+        # caller side who entered `Portal.open_context()`
+        # NOTE: on the call side we never manually call
+        # `._scope.cancel()` since we expect the eventual
+        # `ContextCancelled` from the other side to trigger this
+        # when the runtime finally receives it during teardown
+        # (normally in `.result()` called from
+        # `Portal.open_context().__aexit__()`)
+        if side == 'caller':
+            # NOTE: `.side` only ever reports 'caller' when
+            # `._portal` is set, so no additional "is there
+            # a portal?" check is required in this branch.
+            cid: str = self.cid
+            with trio.move_on_after(timeout) as cs:
+                cs.shield = True
+                log.cancel(
+                    f'Cancelling stream {cid} to '
+                    f'{self._portal.channel.uid}'
+                )
+
+                # NOTE: we're telling the far end actor to cancel a task
+                # corresponding to *this actor*. The far end local channel
+                # instance is passed to `Actor._cancel_task()` implicitly.
+                await self._portal.run_from_ns(
+                    'self',
+                    '_cancel_task',
+                    cid=cid,
+                )
+
+            if cs.cancelled_caught:
+                # XXX: there's no way to know if the remote task was indeed
+                # cancelled in the case where the connection is broken or
+                # some other network error occurred.
+                # if not self._portal.channel.connected():
+                if not self.chan.connected():
+                    log.cancel(
+                        'May have failed to cancel remote task '
+                        f'{cid} for {self._portal.channel.uid}'
+                    )
+                else:
+                    log.cancel(
+                        'Timed out on cancel request of remote task '
+                        f'{cid} for {self._portal.channel.uid}'
+                    )
+
+        # callee side remote task
+        # NOTE: on this side we ALWAYS cancel the local scope since
+        # the caller expects a `ContextCancelled` to be sent from
+        # `._runtime._invoke()` back to the other side.
+        else:
+            # TODO: should we have an explicit cancel message
+            # or is relaying the local `trio.Cancelled` as an
+            # {'error': trio.Cancelled, cid: "blah"} enough?
+            # This probably gets into the discussion in
+            # https://github.com/goodboy/tractor/issues/36
+            assert self._scope
+            self._scope.cancel()
+
+    @acm
+    async def open_stream(
+        self,
+        allow_overruns: bool | None = False,
+        msg_buffer_size: int | None = None,
+
+    ) -> AsyncGenerator[MsgStream, None]:
+        '''
+        Open a ``MsgStream``, a bi-directional stream connected to the
+        cross-actor (far end) task for this ``Context``.
+
+        This context manager must be entered on both the caller and
+        callee for the stream to logically be considered "connected".
+
+        A ``MsgStream`` is currently "one-shot" use, meaning if you
+        close it you can not "re-open" it for streaming and instead you
+        must re-establish a new surrounding ``Context`` using
+        ``Portal.open_context()``.  In the future this may change but
+        currently there seems to be no obvious reason to support
+        "re-opening":
+          - pausing a stream can be done with a message.
+          - task errors will normally require a restart of the entire
+            scope of the inter-actor task context due to the nature of
+            ``trio``'s cancellation system.
+
+        Parameters:
+          - ``allow_overruns``: forwarded to ``Actor.get_context()``
+            and stored on this context as ``._allow_overruns``.
+          - ``msg_buffer_size``: forwarded to ``Actor.get_context()``.
+
+        Raises:
+          - the stored remote error, or a ``RuntimeError``, when
+            ``.cancel()`` was already called on this side,
+          - ``RuntimeError`` on the callee side when ``.started()``
+            has not been called yet,
+          - ``trio.ClosedResourceError`` when the feeder receive mem
+            chan is already closed.
+
+        '''
+        actor: Actor = current_actor()
+
+        # here we create a mem chan that corresponds to the
+        # far end caller / callee.
+
+        # Likewise if the surrounding context has been cancelled we error here
+        # since it likely means the surrounding block was exited or
+        # killed
+
+        if self._cancel_called:
+
+            # XXX NOTE: ALWAYS RAISE any remote error here even if
+            # it's an expected `ContextCancelled` due to a local
+            # task having called `.cancel()`!
+            #
+            # WHY: we expect the error to always bubble up to the
+            # surrounding `Portal.open_context()` call and be
+            # absorbed there (silently) and we DO NOT want to
+            # actually try to stream - a cancel msg was already
+            # sent to the other side!
+            if self._remote_error:
+                raise self._remote_error
+
+            # XXX NOTE: if no `ContextCancelled` has been responded
+            # back from the other side (yet), we raise a different
+            # runtime error indicating that this task's usage of
+            # `Context.cancel()` and then `.open_stream()` is WRONG!
+            task: str = trio.lowlevel.current_task().name
+            raise RuntimeError(
+                'Stream opened after `Context.cancel()` called..?\n'
+                f'task: {actor.uid[0]}:{task}\n'
+                f'{self}'
+            )
+
+        if (
+            not self._portal
+            and not self._started_called
+        ):
+            raise RuntimeError(
+                '`Context.started()` must be called before opening a stream'
+            )
+
+        # NOTE: in one way streaming this only happens on the
+        # caller side inside `Actor.start_remote_task()` so if you try
+        # to send a stop from the caller to the callee in the
+        # single-direction-stream case you'll get a lookup error
+        # currently.
+        ctx = actor.get_context(
+            self.chan,
+            self.cid,
+            msg_buffer_size=msg_buffer_size,
+            allow_overruns=allow_overruns,
+        )
+        ctx._allow_overruns: bool = allow_overruns
+        # the runtime lookup is expected to hand back this very
+        # instance.
+        assert ctx is self
+
+        # XXX: If the underlying channel feeder receive mem chan has
+        # been closed then likely client code has already exited
+        # a ``.open_stream()`` block prior or there was some other
+        # unanticipated error or cancellation from ``trio``.
+
+        if ctx._recv_chan._closed:
+            raise trio.ClosedResourceError(
+                'The underlying channel for this stream was already closed!?'
+            )
+
+        async with MsgStream(
+            ctx=self,
+            rx_chan=ctx._recv_chan,
+        ) as stream:
+
+            # NOTE: we track all existing streams per portal for
+            # the purposes of attempting graceful closes on runtime
+            # cancel requests.
+            if self._portal:
+                self._portal._streams.add(stream)
+
+            try:
+                self._stream_opened: bool = True
+
+                # XXX: do we need this?
+                # ensure we aren't cancelled before yielding the stream
+                # await trio.lowlevel.checkpoint()
+                yield stream
+
+                # NOTE: Make the stream "one-shot use".  On exit,
+                # signal
+                # ``trio.EndOfChannel``/``StopAsyncIteration`` to
+                # the far end.
+                await stream.aclose()
+
+            finally:
+                if self._portal:
+                    try:
+                        self._portal._streams.remove(stream)
+                    except KeyError:
+                        log.warning(
+                            f'Stream was already destroyed?\n'
+                            f'actor: {self.chan.uid}\n'
+                            f'ctx id: {self.cid}'
+                        )
+
+    def _maybe_raise_remote_err(
+        self,
+        err: Exception,
+    ) -> ContextCancelled:
+        '''
+        Maybe raise a remote error depending on who (which task from
+        which actor) requested a cancellation (if any).
+
+        Returns ``err`` (instead of raising it) only when it is
+        a ``ContextCancelled`` that this side is responsible for,
+        i.e. this context or its channel is marked "cancel called",
+        or the recorded / reported canceller is the current actor.
+        In that case ``._local_error`` is also set to ``err`` if it
+        was still unset.
+
+        Raises ``err`` (with exception context suppressed) in every
+        other case.
+
+        '''
+        # NOTE: whenever the context's "opener" side (task) **is**
+        # the side which requested the cancellation (likely via
+        # ``Context.cancel()``), we don't want to re-raise that
+        # cancellation signal locally (would be akin to
+        # a ``trio.Nursery`` nursery raising ``trio.Cancelled``
+        # whenever  ``CancelScope.cancel()`` was called) and
+        # instead silently reap the expected cancellation
+        # "error"-msg.
+        our_uid: tuple[str, str] = current_actor().uid
+        if (
+            isinstance(err, ContextCancelled)
+            and (
+                self._cancel_called
+                or self.chan._cancel_called
+                or self.canceller == our_uid
+                # XXX: a canceller may be unset (`None`), in which
+                # case it can't be us and must not be passed to
+                # `tuple()` (which would raise a `TypeError`).
+                or (
+                    (err_canceller := err.canceller) is not None
+                    and tuple(err_canceller) == our_uid
+                )
+            )
+        ):
+            # NOTE: we set the local scope error to any "self
+            # cancellation" error-response thus "absorbing"
+            # the error silently B)
+            if self._local_error is None:
+                self._local_error = err
+
+            return err
+
+        # NOTE: currently we are masking underlying runtime errors
+        # which are often superfluous to user handler code. not
+        # sure if this is still needed / desired for all operation?
+        # TODO: maybe we can only NOT mask if:
+        # - [ ] debug mode is enabled or,
+        # - [ ] a certain log level is set?
+        # - [ ] consider using `.with_traceback()` to filter out
+        #       runtime frames from the tb explicitly?
+        # https://docs.python.org/3/reference/simple_stmts.html#the-raise-statement
+        # https://stackoverflow.com/a/24752607
+        # __tracebackhide__: bool = True
+        raise err from None
+
+    async def result(self) -> Any | Exception:
+        '''
+        From some (caller) side task, wait for and return the final
+        result from the remote (callee) side's task.
+
+        This provides a mechanism for one task running in some actor to wait
+        on another task at the other side, in some other actor, to terminate.
+
+        If the remote task is still in a streaming state (it is delivering
+        values from inside a ``Context.open_stream():`` block), then those
+        msgs are drained but discarded since it is presumed this side of
+        the context has already finished with its own streaming logic.
+
+        If the remote context (or its containing actor runtime) was
+        canceled, either by a local task calling one of
+        ``Context.cancel()`` or ``Portal.cancel_actor()``, we ignore the
+        received ``ContextCancelled`` exception if the context or
+        underlying IPC channel is marked as having been "cancel called".
+        This is similar behavior to using ``trio.Nursery.cancel()``
+        wherein tasks which raise ``trio.Cancelled`` are silently reaped;
+        the main difference in this API is in the "cancel called" case,
+        instead of just not raising, we also return the exception *as
+        the result* since client code may be interested in the details
+        of the remote cancellation.
+
+        '''
+        assert self._portal, "Context.result() can not be called from callee!"
+        assert self._recv_chan
+
+        # a remote error was already delivered: either raise it or
+        # hand it back as the result (see `._maybe_raise_remote_err()`).
+        if re := self._remote_error:
+            return self._maybe_raise_remote_err(re)
+
+        # NOTE(review): `._result == id(self)` is treated here as "no
+        # final result received yet"; presumably the allocator
+        # (`mk_context()`) seeds `._result` with that sentinel -
+        # confirm against its definition.
+        if (
+            self._result == id(self)
+            and not self._remote_error
+            and not self._recv_chan._closed  # type: ignore
+        ):
+            # wait for a final context result consuming
+            # and discarding any bi dir stream msgs still
+            # in transit from the far end.
+            while True:
+                try:
+                    msg = await self._recv_chan.receive()
+                    # a `KeyError` here means `msg` is not the final
+                    # 'return' msg and is handled further below.
+                    self._result: Any = msg['return']
+
+                    # NOTE: we don't need to do this right?
+                    # XXX: only close the rx mem chan AFTER
+                    # a final result is retrieved.
+                    # if self._recv_chan:
+                    #     await self._recv_chan.aclose()
+
+                    break
+
+                # NOTE: we get here if the far end was
+                # `ContextCancelled` in 2 cases:
+                # 1. we requested the cancellation and thus
+                #    SHOULD NOT raise that far end error,
+                # 2. WE DID NOT REQUEST that cancel and thus
+                #    SHOULD RAISE HERE!
+                except trio.Cancelled:
+
+                    # CASE 2: mask the local cancelled-error(s)
+                    # only when we are sure the remote error is the
+                    # (likely) source cause of this local runtime
+                    # task's cancellation.
+                    if re := self._remote_error:
+                        self._maybe_raise_remote_err(re)
+
+                    # CASE 1: we DID request the cancel we simply
+                    # continue to bubble up as normal.
+                    raise
+
+                except KeyError:  # as msgerr:
+
+                    if 'yield' in msg:
+                        # far end task is still streaming to us so discard
+                        log.warning(f'Discarding stream delivered {msg}')
+                        continue
+
+                    elif 'stop' in msg:
+                        log.debug('Remote stream terminated')
+                        continue
+
+                    # internal error should never get here
+                    assert msg.get('cid'), (
+                        "Received internal error at portal?"
+                    )
+
+                    if err:= unpack_error(
+                        msg,
+                        self._portal.channel
+                    ):  # from msgerr
+                        self._maybe_cancel_and_set_remote_error(err)
+                        self._maybe_raise_remote_err(err)
+
+                    else:
+                        # not an error msg either: re-raise the
+                        # original `KeyError`.
+                        raise
+
+        # re-check since an error may have been set while waiting.
+        if re := self._remote_error:
+            return self._maybe_raise_remote_err(re)
+
+        return self._result
+
+    async def started(
+        self,
+        value: Any | None = None
+
+    ) -> None:
+        '''
+        Signal the calling actor's task that this linked context is
+        up and running, shipping ``value`` to it over IPC in
+        a 'started' msg.
+
+        On the calling side ``value`` is the second item delivered
+        in the tuple returned by ``Portal.open_context()``.
+
+        Raises ``RuntimeError`` when invoked on a caller side
+        context or when invoked more than once.
+
+        '''
+        # guard: only the callee side may report "started"
+        if self._portal:
+            raise RuntimeError(
+                f'Caller side context {self} can not call started!'
+            )
+
+        # guard: and it may only do so once
+        if self._started_called:
+            raise RuntimeError(
+                f'called `.started()` twice on context with {self.chan.uid}'
+            )
+
+        chan = self.chan
+        await chan.send({'started': value, 'cid': self.cid})
+        self._started_called = True
+
+    async def _drain_overflows(
+        self,
+    ) -> None:
+        '''
+        Private task body which pushes queued ("overflowed") msgs to
+        the local task's feeder mem chan, oldest first.
+
+        So as to neither block the rpc msg loop nor drop msgs
+        received for this context, the backlog is sent from
+        a separate task; ``._in_overrun`` is ``True`` for exactly as
+        long as this coroutine runs.
+
+        '''
+        backlog = self._overflow_q
+        self._in_overrun = True
+        try:
+            while backlog:
+                # NOTE: these msgs should never be errors since we always do
+                # the check prior to checking if we're in an overrun state
+                # inside ``.deliver_msg()``.
+                pending = backlog.popleft()
+                try:
+                    await self._send_chan.send(pending)
+
+                except trio.BrokenResourceError:
+                    log.warning(
+                        f"{self._send_chan} consumer is already closed"
+                    )
+                    return
+
+                except trio.Cancelled:
+                    # still overrun but the context is closing
+                    # anyway: put the in-flight msg back and only
+                    # warn about everything left undelivered.
+                    backlog.appendleft(pending)
+                    fmt_msgs = ''.join(
+                        f'{pformat(queued)}\n'
+                        for queued in backlog
+                    )
+                    log.warning(
+                        f'Context for {self.cid} is being closed while '
+                        'in an overrun state!\n'
+                        'Discarding the following msgs:\n'
+                        f'{fmt_msgs}\n'
+                    )
+                    raise
+
+        finally:
+            # the drainer is done (or bailing out), so this side is
+            # no longer considered to be working through a backlog.
+            self._in_overrun = False
+
+    async def _deliver_msg(
+        self,
+        msg: dict,
+
+        # draining: bool = False,
+
+    ) -> bool:
+        '''
+        Deliver an IPC msg received from a transport-channel to
+        this context's underlying mem chan for handling by
+        user operating tasks; deliver a bool indicating whether the
+        msg was immediately sent.
+
+        If `._allow_overruns == True` (maybe) append the msg to an
+        "overflow queue" and start a "drainer task" (inside the
+        `._scope_nursery: trio.Nursery`) which ensures that such
+        messages are eventually sent if possible.
+
+        Returns `True` only when the msg was pushed to the mem chan
+        via `.send_nowait()`. `False` is returned when the msg was
+        instead queued on `._overflow_q`, when the local consumer
+        side of the mem chan is already closed, or when the mem chan
+        is full (the overrun case).
+
+        When overrun and `._allow_overruns == False` a packed
+        `StreamOverrun` error msg (tagged with this context's `cid`)
+        is sent back over `.chan` to the remote sender.
+
+        '''
+        cid = self.cid
+        chan = self.chan
+        uid = chan.uid
+        send_chan: trio.MemorySendChannel = self._send_chan
+
+        log.runtime(
+            f"Delivering {msg} from {uid} to caller {cid}"
+        )
+
+        # record any remote error carried by the msg.
+        # NOTE: there is no early return here, the error msg is
+        # still delivered (or queued) by the code below.
+        if (
+            msg.get('error')  # check for field
+            and (
+                error := unpack_error(
+                    msg,
+                    self.chan,
+                )
+            )
+        ):
+            self._cancel_msg = msg
+            self._maybe_cancel_and_set_remote_error(error)
+
+        # already in an overrun state: queue behind the existing
+        # backlog instead of attempting a direct (out of order) send.
+        if (
+            self._in_overrun
+        ):
+            self._overflow_q.append(msg)
+            return False
+
+        try:
+            send_chan.send_nowait(msg)
+            return True
+            # if an error is detected we should always
+            # expect it to be raised by any context (stream)
+            # consumer task
+
+        except trio.BrokenResourceError:
+            # TODO: what is the right way to handle the case where the
+            # local task has already sent a 'stop' / StopAsyncIteration
+            # to the other side and possibly has closed the local
+            # feeder mem chan? Do we wait for some kind of ack or just
+            # let this fail silently and bubble up (currently)?
+
+            # XXX: local consumer has closed their side
+            # so cancel the far end streaming task
+            log.warning(f"{send_chan} consumer is already closed")
+            return False
+
+        # NOTE XXX: by default we do **not** maintain context-stream
+        # backpressure and instead opt to relay stream overrun errors to
+        # the sender; the main motivation is that using bp can block the
+        # msg handling loop which calls into this method!
+        except trio.WouldBlock:
+
+            # XXX: always push an error even if the local
+            # receiver is in overrun state.
+            # self._maybe_cancel_and_set_remote_error(msg)
+
+            # build the (multi-line) overrun report text used for
+            # both the warning log and the `StreamOverrun` error.
+            local_uid = current_actor().uid
+            lines = [
+                f'OVERRUN on actor-task context {cid}@{local_uid}!\n'
+                # TODO: put remote task name here if possible?
+                f'remote sender actor: {uid}',
+                # TODO: put task func name here and maybe an arrow
+                # from sender to overrunner?
+                # f'local task {self.func_name}'
+            ]
+            if not self._stream_opened:
+                lines.insert(
+                    1,
+                    f'\n*** No stream open on `{local_uid[0]}` side! ***\n'
+                )
+
+            text = '\n'.join(lines)
+
+            # XXX: lul, this really can't be backpressure since any
+            # blocking here will block the entire msg loop rpc sched for
+            # a whole channel.. maybe we should rename it?
+            if self._allow_overruns:
+                text += f'\nStarting overflow queuing task on msg: {msg}'
+                log.warning(text)
+                if (
+                    not self._in_overrun
+                ):
+                    self._overflow_q.append(msg)
+                    n = self._scope_nursery
+                    # no drainer (or any other task) is expected to
+                    # already be running in the scope nursery.
+                    assert not n.child_tasks
+                    try:
+                        n.start_soon(
+                            self._drain_overflows,
+                        )
+                    except RuntimeError:
+                        # if the nursery is already cancelled due to
+                        # this context exiting or in error, we ignore
+                        # the nursery error since we never expected
+                        # anything different.
+                        return False
+            else:
+                # raised-then-caught so that `pack_error()` (which
+                # falls back to `traceback.format_exc()`) has an
+                # active exception to format.
+                try:
+                    raise StreamOverrun(text)
+                except StreamOverrun as err:
+                    err_msg = pack_error(err)
+                    err_msg['cid'] = cid
+                    try:
+                        await chan.send(err_msg)
+                    except trio.BrokenResourceError:
+                        # XXX: the IPC channel to the remote sender
+                        # is already closed so the overrun error
+                        # can not be relayed.
+                        log.warning(f"{chan} is already closed")
+
+            # the msg was not immediately delivered in any overrun case.
+            return False
+
+
+def mk_context(
+    chan: Channel,
+    cid: str,
+    msg_buffer_size: int = 2**6,
+
+    **kwargs,
+
+) -> Context:
+    '''
+    Private factory which builds an inter-actor task ``Context``
+    wired to a freshly opened (and `msg_buffer_size` bounded) pair
+    of ``trio`` memory channels.
+
+    Only runtime internals are expected to call this; user code
+    should generally never need to.
+
+    '''
+    tx, rx = trio.open_memory_channel(msg_buffer_size)
+    new_ctx = Context(
+        chan,
+        cid,
+        _send_chan=tx,
+        _recv_chan=rx,
+        **kwargs,
+    )
+    # seed the result slot with the ctx's own `id()`; presumably
+    # a "no result received yet" sentinel - TODO confirm against
+    # the `Context` result-waiting code.
+    new_ctx._result = id(new_ctx)
+    return new_ctx
+
+
+def context(func: Callable) -> Callable:
+    '''
+    Decorator marking an async function as a ``tractor`` context
+    (streaming) routine via ``@context``.
+
+    The function is tagged with `._tractor_context_function = True`
+    and must declare a parameter named `ctx`, otherwise
+    a `TypeError` is raised.
+
+    '''
+    # TODO: apply whatever solution ``mypy`` ends up picking for this:
+    # https://github.com/python/mypy/issues/2087#issuecomment-769266912
+    func._tractor_context_function = True  # type: ignore
+
+    if 'ctx' in inspect.signature(func).parameters:
+        return func
+
+    raise TypeError(
+        "The first argument to the context function "
+        f"{func.__name__} must be `ctx: tractor.Context`"
+    )
--- a/tractor/_discovery.py
+++ b/tractor/_discovery.py
@ -15,16 +15,20 @@
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.

 """
-Actor discovery API.
+Discovery (protocols) API for automatic addressing and location
+management of (service) actors.

 """
+from __future__ import annotations
 from typing import (
-    Optional,
-    Union,
    AsyncGenerator,
+    AsyncContextManager,
+    TYPE_CHECKING,
 )
 from contextlib import asynccontextmanager as acm
+import warnings

+from .trionics import gather_contexts
 from ._ipc import _connect_chan, Channel
 from ._portal import (
    Portal,
@ -34,31 +38,56 @@ from ._portal import (
 from ._state import current_actor, _runtime_vars


-@acm
-async def get_arbiter(
+if TYPE_CHECKING:
+    from ._runtime import Actor

+
+@acm
+async def get_registry(
    host: str,
    port: int,

-) -> AsyncGenerator[Union[Portal, LocalPortal], None]:
-    '''Return a portal instance connected to a local or remote
+) -> AsyncGenerator[
+    Portal | LocalPortal | None,
+    None,
+]:
+    '''
+    Return a portal instance connected to a local or remote
    arbiter.
+
    '''
    actor = current_actor()

    if not actor:
        raise RuntimeError("No actor instance has been defined yet?")

-    if actor.is_arbiter:
+    if actor.is_registrar:
        # we're already the arbiter
        # (likely a re-entrant call from the arbiter actor)
-        yield LocalPortal(actor, Channel((host, port)))
+        yield LocalPortal(
+            actor,
+            Channel((host, port))
+        )
    else:
-        async with _connect_chan(host, port) as chan:
+        async with (
+            _connect_chan(host, port) as chan,
+            open_portal(chan) as regstr_ptl,
+        ):
+            yield regstr_ptl

-            async with open_portal(chan) as arb_portal:

-                yield arb_portal
+
+# TODO: deprecate and then remove this `_arbiter` form!
+@acm
+async def get_arbiter(*args, **kwargs):
+    '''
+    Deprecated alias of `get_registry()`: emit
+    a `DeprecationWarning` then yield whatever
+    `get_registry(*args, **kwargs)` yields.
+
+    '''
+    deprecation_notice: str = (
+        '`tractor.get_arbiter()` is now deprecated!\n'
+        'Use `.get_registry()` instead!'
+    )
+    warnings.warn(
+        deprecation_notice,
+        DeprecationWarning,
+        stacklevel=2,
+    )
+    async with get_registry(*args, **kwargs) as regstr_ptl:
+        yield regstr_ptl


@acm
@ -66,51 +95,80 @@ async def get_root(
    **kwargs,
 ) -> AsyncGenerator[Portal, None]:

+    # TODO: rename mailbox to `_root_maddr` when we finally
+    # add and impl libp2p multi-addrs?
    host, port = _runtime_vars['_root_mailbox']
    assert host is not None

-    async with _connect_chan(host, port) as chan:
-        async with open_portal(chan, **kwargs) as portal:
-            yield portal
+    async with (
+        _connect_chan(host, port) as chan,
+        open_portal(chan, **kwargs) as portal,
+    ):
+        yield portal


@acm
 async def query_actor(
    name: str,
-    arbiter_sockaddr: Optional[tuple[str, int]] = None,
+    arbiter_sockaddr: tuple[str, int] | None = None,
+    regaddr: tuple[str, int] | None = None,

-) -> AsyncGenerator[tuple[str, int], None]:
+) -> AsyncGenerator[
+    tuple[str, int] | None,
+    None,
+]:
    '''
-    Simple address lookup for a given actor name.
+    Make a transport address lookup for an actor name to a specific
+    registrar.

-    Returns the (socket) address or ``None``.
+    Returns the (socket) address or ``None`` if no entry under that
+    name exists for the given registrar listening @ `regaddr`.
+
+    When `regaddr` is not provided the first entry of the current
+    actor's `.reg_addrs` is used.
+
+    `arbiter_sockaddr` is the deprecated spelling of `regaddr`; when
+    passed it emits a `DeprecationWarning` and overrides `regaddr`.
+
+    Raises `RuntimeError` when the current actor is itself the
+    registrar and `name == 'registrar'`.

    '''
-    actor = current_actor()
-    async with get_arbiter(
-        *arbiter_sockaddr or actor._arb_addr
-    ) as arb_portal:
+    actor: Actor = current_actor()
+    if (
+        name == 'registrar'
+        and actor.is_registrar
+    ):
+        raise RuntimeError(
+            'The current actor IS the registry!?'
+        )

-        sockaddr = await arb_portal.run_from_ns(
+    if arbiter_sockaddr is not None:
+        # NOTE: the deprecated kwarg is `arbiter_sockaddr` and its
+        # replacement on THIS function is `regaddr` (a single addr).
+        warnings.warn(
+            '`tractor.query_actor(arbiter_sockaddr=<blah>)` is deprecated.\n'
+            'Use `regaddr: tuple[str, int]` instead!',
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        regaddr = arbiter_sockaddr
+
+    reg_portal: Portal
+    regaddr: tuple[str, int] = regaddr or actor.reg_addrs[0]
+    async with get_registry(*regaddr) as reg_portal:
+        # TODO: return portals to all available actors - for now
+        # just the last one that registered
+        sockaddr: tuple[str, int] = await reg_portal.run_from_ns(
            'self',
            'find_actor',
            name=name,
        )
-
-        # TODO: return portals to all available actors - for now just
-        # the last one that registered
-        if name == 'arbiter' and actor.is_arbiter:
-            raise RuntimeError("The current actor is the arbiter")
-
-        yield sockaddr if sockaddr else None
+        yield sockaddr


@acm
 async def find_actor(
    name: str,
-    arbiter_sockaddr: tuple[str, int] | None = None
+    arbiter_sockaddr: tuple[str, int] | None = None,
+    registry_addrs: list[tuple[str, int]] | None = None,

-) -> AsyncGenerator[Optional[Portal], None]:
+    only_first: bool = True,
+    raise_on_none: bool = False,
+
+) -> AsyncGenerator[
+    Portal | list[Portal] | None,
+    None,
+]:
    '''
    Ask the arbiter to find actor(s) by name.

@ -118,39 +176,112 @@ async def find_actor(
    known to the arbiter.

    '''
-    async with query_actor(
-        name=name,
-        arbiter_sockaddr=arbiter_sockaddr,
-    ) as sockaddr:
+    if arbiter_sockaddr is not None:
+        warnings.warn(
+            '`tractor.find_actor(arbiter_sockaddr=<blah>)` is deprecated.\n'
+            'Use `registry_addrs: list[tuple]` instead!',
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        registry_addrs: list[tuple[str, int]] = [arbiter_sockaddr]

-        if sockaddr:
-            async with _connect_chan(*sockaddr) as chan:
-                async with open_portal(chan) as portal:
-                    yield portal
-        else:
+    @acm
+    async def maybe_open_portal_from_reg_addr(
+        addr: tuple[str, int],
+    ):
+        async with query_actor(
+            name=name,
+            regaddr=addr,
+        ) as sockaddr:
+            if sockaddr:
+                async with _connect_chan(*sockaddr) as chan:
+                    async with open_portal(chan) as portal:
+                        yield portal
+            else:
+                yield None
+
+    if not registry_addrs:
+        # XXX NOTE: make sure to dynamically read the value on
+        # every call since something may change it globally (eg.
+        # like in our discovery test suite)!
+        from . import _root
+        registry_addrs = _root._default_lo_addrs
+
+    maybe_portals: list[
+        AsyncContextManager[tuple[str, int]]
+    ] = list(
+        maybe_open_portal_from_reg_addr(addr)
+        for addr in registry_addrs
+    )
+
+    async with gather_contexts(
+        mngrs=maybe_portals,
+    ) as portals:
+        # log.runtime(
+        #     'Gathered portals:\n'
+        #     f'{portals}'
+        # )
+        # NOTE: `gather_contexts()` will return a
+        # `tuple[None, None, ..., None]` if no contact
+        # can be made with any regstrar at any of the
+        # N provided addrs!
+        if not any(portals):
+            if raise_on_none:
+                raise RuntimeError(
+                    f'No actor "{name}" found registered @ {registry_addrs}'
+                )
            yield None
+            return
+
+        portals: list[Portal] = list(portals)
+        if only_first:
+            yield portals[0]
+
+        else:
+            # TODO: currently this may return multiple portals
+            # given there are multi-homed or multiple registrars..
+            # SO, we probably need de-duplication logic?
+            yield portals


@acm
 async def wait_for_actor(
    name: str,
-    arbiter_sockaddr: tuple[str, int] | None = None
+    arbiter_sockaddr: tuple[str, int] | None = None,
+    registry_addr: tuple[str, int] | None = None,
+
 ) -> AsyncGenerator[Portal, None]:
-    """Wait on an actor to register with the arbiter.
+    '''
+    Wait on an actor to register with the arbiter.

    A portal to the first registered actor is returned.
-    """
-    actor = current_actor()

-    async with get_arbiter(
-        *arbiter_sockaddr or actor._arb_addr,
-    ) as arb_portal:
-        sockaddrs = await arb_portal.run_from_ns(
+    '''
+    actor: Actor = current_actor()
+
+    if arbiter_sockaddr is not None:
+        warnings.warn(
+            '`tractor.wait_for_actor(arbiter_sockaddr=<foo>)` is deprecated.\n'
+            'Use `registry_addr: tuple` instead!',
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        registry_addr: tuple[str, int] = arbiter_sockaddr
+
+    # TODO: use `.trionics.gather_contexts()` like
+    # above in `find_actor()` as well?
+    reg_portal: Portal
+    regaddr: tuple[str, int] = registry_addr or actor.reg_addrs[0]
+    async with get_registry(*regaddr) as reg_portal:
+        sockaddrs = await reg_portal.run_from_ns(
            'self',
            'wait_for_actor',
            name=name,
        )
-        sockaddr = sockaddrs[-1]
+
+        # get latest registered addr by default?
+        # TODO: offer multi-portal yields in multi-homed case?
+        sockaddr: tuple[str, int] = sockaddrs[-1]

        async with _connect_chan(*sockaddr) as chan:
            async with open_portal(chan) as portal:
--- a/tractor/_entry.py
+++ b/tractor/_entry.py
@ -47,8 +47,8 @@ log = get_logger(__name__)

 def _mp_main(

-    actor: Actor,  # type: ignore
-    accept_addr: tuple[str, int],
+    actor: Actor,
+    accept_addrs: list[tuple[str, int]],
    forkserver_info: tuple[Any, Any, Any, Any, Any],
    start_method: SpawnMethodKey,
    parent_addr: tuple[str, int] | None = None,
@ -77,8 +77,8 @@ def _mp_main(
    log.debug(f"parent_addr is {parent_addr}")
    trio_main = partial(
        async_main,
-        actor,
-        accept_addr,
+        actor=actor,
+        accept_addrs=accept_addrs,
        parent_addr=parent_addr
    )
    try:
@ -96,7 +96,7 @@ def _mp_main(

 def _trio_main(

-    actor: Actor,  # type: ignore
+    actor: Actor,
    *,
    parent_addr: tuple[str, int] | None = None,
    infect_asyncio: bool = False,
@ -132,7 +132,9 @@ def _trio_main(
        else:
            trio.run(trio_main)
    except KeyboardInterrupt:
-        log.warning(f"Actor {actor.uid} received KBI")
+        log.cancel(
+            f'Actor@{actor.uid} received KBI'
+        )

    finally:
        log.info(f"Actor {actor.uid} terminated")
--- a/tractor/_exceptions.py
+++ b/tractor/_exceptions.py
@ -14,22 +14,30 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.

-"""
+'''
 Our classy exception set.

-"""
+'''
+from __future__ import annotations
+import builtins
+import importlib
+from pprint import pformat
 from typing import (
    Any,
-    Optional,
    Type,
+    TYPE_CHECKING,
 )
-import importlib
-import builtins
 import traceback

 import exceptiongroup as eg
 import trio

+from ._state import current_actor
+
+if TYPE_CHECKING:
+    from ._context import Context
+    from ._stream import MsgStream
+    from .log import StackLevelAdapter

 _this_mod = importlib.import_module(__name__)

@ -38,36 +46,82 @@ class ActorFailure(Exception):
    "General actor failure"


+# TODO: rename to just `RemoteError`?
 class RemoteActorError(Exception):
-    # TODO: local recontruction of remote exception deats
-    "Remote actor exception bundled locally"
+    '''
+    A box(ing) type which bundles a remote actor `BaseException` for
+    (near identical, and only if possible,) local object/instance
+    re-construction in the local process memory domain.
+
+    Normally each instance is expected to be constructed from
+    a special "error" IPC msg sent by some remote actor-runtime.
+
+    The boxed exception type is kept on `.type` and every extra
+    keyword passed at construction is kept in the `.msgdata` dict.
+
+    '''
    def __init__(
        self,
        message: str,
-        suberror_type: Optional[Type[BaseException]] = None,
+        suberror_type: Type[BaseException] | None = None,
        **msgdata

    ) -> None:
        super().__init__(message)

-        self.type = suberror_type
-        self.msgdata = msgdata
+        # TODO: maybe a better name?
+        # - .errtype
+        # - .retype
+        # - .boxed_errtype
+        # - .boxed_type
+        # - .remote_type
+        # also pertains to our long long outstanding issue XD
+        # https://github.com/goodboy/tractor/issues/5
+        self.type: Type[BaseException] | None = suberror_type
+        self.msgdata: dict[str, Any] = msgdata
+
+    @property
+    def src_actor_uid(self) -> tuple[str, str] | None:
+        '''
+        The `'src_actor_uid'` entry of `.msgdata` or `None` when
+        that key is absent.
+
+        '''
+        return self.msgdata.get('src_actor_uid')
+
+    def __repr__(self) -> str:
+        '''
+        Pretty-format the full `.msgdata` when a remote traceback
+        str (`'tb_str'`) was delivered, else use the default
+        `Exception` repr.
+
+        '''
+        if self.msgdata.get('tb_str'):
+            return (
+                f'{type(self).__name__}(\n'
+                f'msgdata={pformat(self.msgdata)}\n'
+                ')'
+            )
+
+        return super().__repr__()
+
+    # TODO: local reconstruction of remote exception deats
+    # def unbox(self) -> BaseException:
+    #     ...


 class InternalActorError(RemoteActorError):
-    """Remote internal ``tractor`` error indicating
+    '''
+    Remote internal ``tractor`` error indicating
    failure of some primitive or machinery.
-    """
+
+    '''
+
+
+class ContextCancelled(RemoteActorError):
+    '''
+    Raised when an inter-actor task context was cancelled via
+    either ``Portal.cancel_actor()`` or ``Context.cancel()``.
+
+    '''
+    @property
+    def canceller(self) -> tuple[str, str] | None:
+        '''
+        The `'canceller'` entry of `.msgdata` cast to a `tuple`,
+        or `None` when that entry is missing or falsy.
+
+        '''
+        uid = self.msgdata.get('canceller')
+        return tuple(uid) if uid else None


 class TransportClosed(trio.ClosedResourceError):
    "Underlying channel transport was closed prior to use"


-class ContextCancelled(RemoteActorError):
-    "Inter-actor task context cancelled itself on the callee side."
-
-
 class NoResult(RuntimeError):
    "No final result is expected for this actor"

@ -92,27 +146,40 @@ class AsyncioCancelled(Exception):

    '''

+class MessagingError(Exception):
+    '''
+    Some kind of unexpected SC messaging dialog issue.
+
+    '''
+

 def pack_error(
    exc: BaseException,
-    tb=None,
+    tb: str | None = None,

-) -> dict[str, Any]:
-    """Create an "error message" for tranmission over
-    a channel (aka the wire).
-    """
+) -> dict[str, dict]:
+    '''
+    Create an "error message" encoded for wire transport via an IPC
+    `Channel`; expected to be unpacked on the receiver side using
+    `unpack_error()` below.
+
+    '''
    if tb:
        tb_str = ''.join(traceback.format_tb(tb))
    else:
        tb_str = traceback.format_exc()

-    return {
-        'error': {
-            'tb_str': tb_str,
-            'type_str': type(exc).__name__,
-        }
+    error_msg: dict[
+        str,
+        str | tuple[str, str]
+    ] = {
+        'tb_str': tb_str,
+        'type_str': type(exc).__name__,
+        'src_actor_uid': current_actor().uid,
    }

+    if isinstance(exc, ContextCancelled):
+        error_msg.update(exc.msgdata)
+
+    return {'error': error_msg}
+

 def unpack_error(

@ -120,23 +187,33 @@ def unpack_error(
    chan=None,
    err_type=RemoteActorError

-) -> Exception:
+) -> None | Exception:
    '''
    Unpack an 'error' message from the wire
-    into a local ``RemoteActorError``.
+    into a local `RemoteActorError` (subtype).
+
+    NOTE: this routine does NOT raise the embedded remote error,
+    which is the responsibility of the caller.

    '''
-    __tracebackhide__ = True
-    error = msg['error']
+    __tracebackhide__: bool = True

-    tb_str = error.get('tb_str', '')
-    message = f"{chan.uid}\n" + tb_str
-    type_name = error['type_str']
+    error_dict: dict[str, dict] | None
+    if (
+        error_dict := msg.get('error')
+    ) is None:
+        # no error field, nothing to unpack.
+        return None
+
+    # retrieve the remote error's msg encoded details
+    tb_str: str = error_dict.get('tb_str', '')
+    message: str = f'{chan.uid}\n' + tb_str
+    type_name: str = error_dict['type_str']
    suberror_type: Type[BaseException] = Exception

    if type_name == 'ContextCancelled':
        err_type = ContextCancelled
-        suberror_type = trio.Cancelled
+        suberror_type = err_type

    else:  # try to lookup a suitable local error type
        for ns in [
@ -145,18 +222,19 @@ def unpack_error(
            eg,
            trio,
        ]:
-            try:
-                suberror_type = getattr(ns, type_name)
+            if suberror_type := getattr(
+                ns,
+                type_name,
+                False,
+            ):
                break
-            except AttributeError:
-                continue

    exc = err_type(
        message,
        suberror_type=suberror_type,

        # unpack other fields into error type init
-        **msg['error'],
+        **error_dict,
    )

    return exc
@ -175,3 +253,88 @@ def is_multi_cancelled(exc: BaseException) -> bool:
        ) is not None

    return False
+
+
+def _raise_from_no_key_in_msg(
+    ctx: Context,
+    msg: dict,
+    src_err: KeyError,
+    log: StackLevelAdapter,  # caller specific `log` obj
+    expect_key: str = 'yield',
+    stream: MsgStream | None = None,
+
+) -> bool:
+    '''
+    Raise an appropriate local error when a `MsgStream` msg arrives
+    which does not contain the expected (under normal operation)
+    `'yield'` field.
+
+    This routine ALWAYS raises (it never actually returns a value):
+
+    - `MessagingError` if `msg` has no `'cid'` field,
+    - the exception built by `unpack_error()` if `msg` has
+      a truthy `'error'` field,
+    - `trio.EndOfChannel` if `msg` has a truthy `'stop'` field or
+      the `stream` was already marked `._eoc`,
+    - `trio.ClosedResourceError` if the `stream` is `._closed`,
+    - otherwise a `MessagingError` chained from `src_err`.
+
+    `src_err` is the original `KeyError` hit by the caller when
+    looking up `expect_key` in `msg`.
+
+    '''
+    __tracebackhide__: bool = True
+
+    # internal error should never get here
+    try:
+        cid: str = msg['cid']
+    except KeyError as cid_err:
+        # NOTE: `cid` is unbound in this handler so report the
+        # ctx's own id instead; a distinct exc name is used so the
+        # caller provided `src_err` is never shadowed.
+        raise MessagingError(
+            'IPC `Context` rx-ed msg without a ctx-id (cid)!?\n'
+            f'cid: {ctx.cid}\n'
+            'received msg:\n'
+            f'{pformat(msg)}\n'
+        ) from cid_err
+
+    # TODO: test that shows stream raising an expected error!!!
+    if msg.get('error'):
+        # raise the error message
+        raise unpack_error(
+            msg,
+            ctx.chan,
+        ) from None
+
+    elif (
+        msg.get('stop')
+        or (
+            stream
+            and stream._eoc
+        )
+    ):
+        log.debug(
+            f'Context[{cid}] stream was stopped by remote side\n'
+            f'cid: {cid}\n'
+        )
+
+        # XXX: important to set so that a new ``.receive()``
+        # call (likely by another task using a broadcast receiver)
+        # doesn't accidentally pull the ``return`` message
+        # value out of the underlying feed mem chan!
+        # NOTE: a 'stop' msg can arrive when no stream was passed
+        # in, in which case there is nothing to mark.
+        if stream is not None:
+            stream._eoc = True
+
+        # # when the send is closed we assume the stream has
+        # # terminated and signal this local iterator to stop
+        # await stream.aclose()
+
+        # XXX: this causes ``ReceiveChannel.__anext__()`` to
+        # raise a ``StopAsyncIteration``; presumably the caller's
+        # handler also triggers ``.aclose()`` - TODO confirm.
+        raise trio.EndOfChannel(
+            f'Context[{cid}] stream ended due to msg:\n'
+            f'{pformat(msg)}'
+        ) from src_err
+
+    if (
+        stream
+        and stream._closed
+    ):
+        raise trio.ClosedResourceError('This stream was closed')
+
+    # always re-raise the source error if no translation error case
+    # is activated above.
+    _type: str = 'Stream' if stream else 'Context'
+    raise MessagingError(
+        f'{_type} was expecting a `{expect_key}` message'
+        ' BUT received a non-`error` msg:\n'
+        f'cid: {cid}\n'
+        f'{pformat(msg)}'
+    ) from src_err
--- a/tractor/_ipc.py
+++ b/tractor/_ipc.py
@ -294,9 +294,11 @@ class Channel:
        self._agen = self._aiter_recv()
        self._exc: Optional[Exception] = None  # set if far end actor errors
        self._closed: bool = False
-        # flag set on ``Portal.cancel_actor()`` indicating
-        # remote (peer) cancellation of the far end actor runtime.
-        self._cancel_called: bool = False  # set on ``Portal.cancel_actor()``
+
+        # flag set by ``Portal.cancel_actor()`` indicating remote
+        # (possibly peer) cancellation of the far end actor
+        # runtime.
+        self._cancel_called: bool = False

    @classmethod
    def from_stream(
@ -327,8 +329,11 @@ class Channel:
    def __repr__(self) -> str:
        if self.msgstream:
            return repr(
-                self.msgstream.stream.socket._sock).replace(  # type: ignore
-                        "socket.socket", "Channel")
+                self.msgstream.stream.socket._sock
+            ).replace(  # type: ignore
+                "socket.socket",
+                "Channel",
+            )
        return object.__repr__(self)

    @property
@ -467,7 +472,9 @@ class Channel:

@asynccontextmanager
 async def _connect_chan(
-    host: str, port: int
+    host: str,
+    port: int
+
 ) -> typing.AsyncGenerator[Channel, None]:
    '''
    Create and connect a channel with disconnect on context manager
--- a/tractor/_multiaddr.py
+++ b/tractor/_multiaddr.py
@ -0,0 +1,151 @@
+# tractor: structured concurrent "actors".
+# Copyright 2018-eternity Tyler Goodlet.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+'''
+Multiaddress parser and utils according to the spec(s) defined by
+`libp2p` and used in dependent projects such as `ipfs`:
+
+- https://docs.libp2p.io/concepts/fundamentals/addressing/
+- https://github.com/libp2p/specs/blob/master/addressing/README.md
+
+'''
+from typing import Iterator
+
+from bidict import bidict
+
+# TODO: see if we can leverage libp2p ecosys projects instead of
+# rolling our own (parser) impls of the above addressing specs:
+# - https://github.com/libp2p/py-libp2p
+# - https://docs.libp2p.io/concepts/nat/circuit-relay/#relay-addresses
+
+# Protocol key -> approx OSI layer number.
+# NOTE: kept as a plain `dict` (not a `bidict`) since several
+# protocol keys share the same layer number and a `bidict` requires
+# its values to be unique.
+prots: dict[str, int] = {
+    'ipv4': 3,
+    'ipv6': 3,
+    'wg': 3,
+
+    'tcp': 4,
+    'udp': 4,
+
+    # TODO: support the next-gen shite Bo
+    # 'quic': 4,
+    # 'ssh': 7,  # via rsyscall bootstrapping
+}
+
+# Protocol key -> ordered names of the path params which follow
+# that key inside a multiaddr.
+prot_params: dict[str, tuple[str, ...]] = {
+    'ipv4': ('addr',),
+    'ipv6': ('addr',),
+    'wg': ('addr', 'port', 'pubkey'),
+
+    'tcp': ('port',),
+    'udp': ('port',),
+
+    # 'quic': ('port',),
+    # 'ssh': ('port',),
+}
+
+
+def iter_prot_layers(
+    multiaddr: str,
+) -> Iterator[
+    tuple[
+        str,
+        list[str]
+    ]
+]:
+    '''
+    Unpack a libp2p style "multiaddress" into multiple "segments"
+    for each "layer" of the protocol stack (in OSI terms).
+
+    Yields one `(<protocol-key>, <params>)` pair per protocol key
+    (any token found in the module level `prots` table) where
+    `<params>` holds every path token between that key and the next
+    one (or the end of the path).
+
+    Tokens appearing before the first protocol key are dropped and
+    nothing is yielded when the path contains no protocol key.
+
+    An `AssertionError` is raised when `multiaddr` does not start
+    with a root `'/'`.
+
+    '''
+    tokens: list[str] = multiaddr.split('/')
+    root, tokens = tokens[0], tokens[1:]
+    assert not root  # there is a root '/' on LHS
+
+    prot: str | None = None
+    params: list[str] = []
+    for token in tokens:
+        if token not in prots:
+            params.append(token)
+            continue
+
+        # every prot path should start with a known key-str;
+        # a new key closes out the previous layer (if any).
+        if prot is not None:
+            yield prot, params
+
+        prot = token
+        params = []
+
+    # flush the final layer; skipped when no protocol key was
+    # ever seen so a `(None, ...)` pair is never emitted.
+    if prot is not None:
+        yield prot, params
+
+
+def parse_maddr(
+    multiaddr: str,
+) -> dict[str, str | int | dict]:
+    '''
+    Split a libp2p style "multiaddress" into its protocol segments,
+    each of the form:
+
+        `../<protocol>/<param0>/<param1>/../<paramN>`
+
+    The result is an (insertion ordered) `layers: dict[str,
+    dict[str, Any]]` with one entry per protocol key; each entry
+    holds the approx OSI `'layer'` number from the (module global)
+    `prots` table plus one value per param name.
+
+    The param names for each protocol (and the order in which they
+    are read from the path) come from the (module global)
+    `prot_params` table. For eg. wireguard, which requires an
+    address, port number and publickey, is declared as:
+
+        'wg': ('addr', 'port', 'pubkey'),
+
+    and is thus parsed from a maddr in that order:
+        `'/wg/1.1.1.1/51820/<pubkey>'`
+
+    Any `'port'` param is cast to `int`, all others stay `str`.
+
+    '''
+    layers: dict[str, str | int | dict] = {}
+    for prot_key, params in iter_prot_layers(multiaddr):
+
+        # seed the endpoint entry with the OSI layer used for sorting
+        ep: dict[str, int | str] = {'layer': prots[prot_key]}
+        layers[prot_key] = ep
+
+        # TODO; validation and resolving of names:
+        # - each param via a validator provided as part of the
+        #   prot_params def? (also see `"port"` case below..)
+        # - do a resolv step that will check addrs against
+        #   any loaded network.resolv: dict[str, str]
+
+        # consume path params left-to-right by popping off the
+        # tail of a reversed copy.
+        pending: list[str] = params[::-1]
+        for key in prot_params[prot_key]:
+            raw: str = pending.pop()
+
+            # TODO: UGHH, dunno what we should do for validation
+            # here, put it in the params spec somehow?
+            ep[key] = int(raw) if key == 'port' else raw
+
+    return layers
--- a/tractor/_portal.py
+++ b/tractor/_portal.py
@ -15,8 +15,12 @@
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.

 '''
-Memory boundary "Portals": an API for structured
-concurrency linked tasks running in disparate memory domains.
+Memory "portal" construct.
+
+"Memory portals" are both an API and set of IPC wrapping primitives
+for managing structured concurrency "cancel-scope linked" tasks
+running in disparate virtual memory domains - at least in different
+OS processes, possibly on different (hardware) hosts.

 '''
 from __future__ import annotations
@ -29,7 +33,6 @@ from typing import (
 )
 from functools import partial
 from dataclasses import dataclass
-from pprint import pformat
 import warnings

 import trio
@ -41,14 +44,18 @@ from ._ipc import Channel
 from .log import get_logger
 from .msg import NamespacePath
 from ._exceptions import (
+    _raise_from_no_key_in_msg,
    unpack_error,
    NoResult,
    ContextCancelled,
 )
-from ._streaming import (
+from ._context import (
    Context,
+)
+from ._streaming import (
    MsgStream,
 )
+from .devx._debug import maybe_wait_for_debugger


 log = get_logger(__name__)
@ -62,26 +69,22 @@ def _unwrap_msg(
    __tracebackhide__ = True
    try:
        return msg['return']
-    except KeyError:
+    except KeyError as ke:
        # internal error should never get here
        assert msg.get('cid'), "Received internal error at portal?"
-        raise unpack_error(msg, channel) from None
-
-
-class MessagingError(Exception):
-    'Some kind of unexpected SC messaging dialog issue'
+        raise unpack_error(msg, channel) from ke


 class Portal:
    '''
-    A 'portal' to a(n) (remote) ``Actor``.
+    A 'portal' to a memory-domain-separated `Actor`.

    A portal is "opened" (and eventually closed) by one side of an
    inter-actor communication context. The side which opens the portal
    is equivalent to a "caller" in function parlance and usually is
    either the called actor's parent (in process tree hierarchy terms)
    or a client interested in scheduling work to be done remotely in a
-    far process.
+    process which has a separate (virtual) memory domain.

    The portal api allows the "caller" actor to invoke remote routines
    and receive results through an underlying ``tractor.Channel`` as
@ -91,9 +94,9 @@ class Portal:
    like having a "portal" between the seperate actor memory spaces.

    '''
-    # the timeout for a remote cancel request sent to
-    # a(n) (peer) actor.
-    cancel_timeout = 0.5
+    # global timeout for remote cancel requests sent to
+    # connected (peer) actors.
+    cancel_timeout: float = 0.5

    def __init__(self, channel: Channel) -> None:
        self.channel = channel
@ -103,7 +106,7 @@ class Portal:
        # When set to a ``Context`` (when _submit_for_result is called)
        # it is expected that ``result()`` will be awaited at some
        # point.
-        self._expect_result: Optional[Context] = None
+        self._expect_result: Context | None = None
        self._streams: set[MsgStream] = set()
        self.actor = current_actor()

@ -193,7 +196,15 @@ class Portal:

    ) -> bool:
        '''
-        Cancel the actor on the other end of this portal.
+        Cancel the actor runtime (and thus process) on the far
+        end of this portal.
+
+        **NOTE** THIS CANCELS THE ENTIRE RUNTIME AND THE
+        SUBPROCESS, it DOES NOT just cancel the remote task. If you
+        want to have a handle to cancel a remote ``trio.Task`` look
+        at `.open_context()` and the definition of
+        `._context.Context.cancel()` which CAN be used for this
+        purpose.

        '''
        if not self.channel.connected():
@ -208,11 +219,18 @@ class Portal:

        try:
            # send cancel cmd - might not get response
-            # XXX: sure would be nice to make this work with a proper shield
-            with trio.move_on_after(timeout or self.cancel_timeout) as cs:
+            # XXX: sure would be nice to make this work with
+            # a proper shield
+            with trio.move_on_after(
+                timeout
+                or self.cancel_timeout
+            ) as cs:
                cs.shield = True

-                await self.run_from_ns('self', 'cancel')
+                await self.run_from_ns(
+                    'self',
+                    'cancel',
+                )
                return True

            if cs.cancelled_caught:
@ -330,7 +348,9 @@ class Portal:
                    f'{async_gen_func} must be an async generator function!')

        fn_mod_path, fn_name = NamespacePath.from_ref(
-            async_gen_func).to_tuple()
+            async_gen_func
+        ).to_tuple()
+
        ctx = await self.actor.start_remote_task(
            self.channel,
            fn_mod_path,
@ -377,16 +397,37 @@ class Portal:

        self,
        func: Callable,
+        allow_overruns: bool = False,
        **kwargs,

    ) -> AsyncGenerator[tuple[Context, Any], None]:
        '''
-        Open an inter-actor task context.
+        Open an inter-actor "task context"; a remote task is
+        scheduled and cancel-scope-state-linked to a `trio.run()` across
+        memory boundaries in another actor's runtime.

-        This is a synchronous API which allows for deterministic
-        setup/teardown of a remote task. The yielded ``Context`` further
-        allows for opening bidirectional streams, explicit cancellation
-        and synchronized final result collection. See ``tractor.Context``.
+        This is an `@acm` API which allows for deterministic setup
+        and teardown of a remotely scheduled task in another remote
+        actor. Once opened, the 2 now "linked" tasks run completely
+        in parallel in each actor's runtime with their enclosing
+        `trio.CancelScope`s kept in a synced state wherein if
+        either side errors or cancels an equivalent error is
+        relayed to the other side via an SC-compat IPC protocol.
+
+        The yielded `tuple` is a pair delivering a `tractor.Context`
+        and any first value "sent" by the "callee" task via a call
+        to `Context.started(<value: Any>)`; this side of the
+        context does not unblock until the "callee" task calls
+        `.started()` in similar style to `trio.Nursery.start()`.
+        When the "callee" (side that is "called"/started by a call
+        to *this* method) returns, the caller side (this) unblocks
+        and any final value delivered from the other end can be
+        retrieved using the `Context.result()` api.
+
+        The yielded ``Context`` instance further allows for opening
+        bidirectional streams, explicit cancellation and
+        structured-concurrency-synchronized final result-msg
+        collection. See ``tractor.Context`` for more details.

        '''
        # conduct target func method structural checks
@ -396,135 +437,306 @@ class Portal:
            raise TypeError(
                f'{func} must be an async generator function!')

+        # TODO: i think from here onward should probably
+        # just be factored into an `@acm` inside
+        # a new `_context.py` mod.
        fn_mod_path, fn_name = NamespacePath.from_ref(func).to_tuple()

        ctx = await self.actor.start_remote_task(
            self.channel,
            fn_mod_path,
            fn_name,
-            kwargs
+            kwargs,
+
+            # NOTE: it's important to expose this since you might
+            # get the case where the parent who opened the context does
+            # not open a stream until after some slow startup/init
+            # period, in which case when the first msg is read from
+            # the feeder mem chan, say when first calling
+            # `Context.open_stream(allow_overruns=True)`, the overrun condition will be
+            # raised before any ignoring of overflow msgs can take
+            # place..
+            allow_overruns=allow_overruns,
        )

        assert ctx._remote_func_type == 'context'
-        msg = await ctx._recv_chan.receive()
+        msg: dict = await ctx._recv_chan.receive()

        try:
            # the "first" value here is delivered by the callee's
            # ``Context.started()`` call.
-            first = msg['started']
-            ctx._started_called = True
+            first: Any = msg['started']
+            ctx._started_called: bool = True

-        except KeyError:
-            assert msg.get('cid'), ("Received internal error at context?")
+        except KeyError as src_error:

-            if msg.get('error'):
-                # raise kerr from unpack_error(msg, self.channel)
-                raise unpack_error(msg, self.channel) from None
-            else:
-                raise MessagingError(
-                    f'Context for {ctx.cid} was expecting a `started` message'
-                    f' but received a non-error msg:\n{pformat(msg)}'
-                )
+            _raise_from_no_key_in_msg(
+                ctx=ctx,
+                msg=msg,
+                src_err=src_error,
+                log=log,
+                expect_key='started',
+            )

-        _err: Optional[BaseException] = None
-        ctx._portal = self
+        ctx._portal: Portal = self
+        uid: tuple = self.channel.uid
+        cid: str = ctx.cid

-        uid = self.channel.uid
-        cid = ctx.cid
-        etype: Optional[Type[BaseException]] = None
-
-        # deliver context instance and .started() msg value in open tuple.
+        # placeholder for any exception raised in the runtime
+        # or by user tasks which cause this context's closure.
+        scope_err: BaseException | None = None
        try:
-            async with trio.open_nursery() as scope_nursery:
-                ctx._scope_nursery = scope_nursery
-
-                # do we need this?
-                # await trio.lowlevel.checkpoint()
+            async with trio.open_nursery() as nurse:
+                ctx._scope_nursery: trio.Nursery = nurse
+                ctx._scope: trio.CancelScope = nurse.cancel_scope

+                # deliver context instance and .started() msg value
+                # in enter tuple.
                yield ctx, first

-        except ContextCancelled as err:
-            _err = err
-            if not ctx._cancel_called:
-                # context was cancelled at the far end but was
-                # not part of this end requesting that cancel
-                # so raise for the local task to respond and handle.
+                # when in allow_overruns mode there may be
+                # lingering overflow sender tasks remaining?
+                if nurse.child_tasks:
+                    # XXX: ensure we are in overrun state
+                    # with ``._allow_overruns=True`` bc otherwise
+                    # there should be no tasks in this nursery!
+                    if (
+                        not ctx._allow_overruns
+                        or len(nurse.child_tasks) > 1
+                    ):
+                        raise RuntimeError(
+                            'Context has sub-tasks but is '
+                            'not in `allow_overruns=True` mode!?'
+                        )
+
+                    # ensure cancel of all overflow sender tasks
+                    # started in the ctx nursery.
+                    ctx._scope.cancel()
+
+        # XXX NOTE XXX: maybe shield against
+        # self-context-cancellation (which raises a local
+        # `ContextCancelled`) when requested (via
+        # `Context.cancel()`) by the same task (tree) which entered
+        # THIS `.open_context()`.
+        #
+        # NOTE: There are 2 operating cases for a "graceful cancel"
+        # of a `Context`. In both cases any `ContextCancelled`
+        # raised in this scope-block came from a transport msg
+        # relayed from some remote-actor-task which our runtime set
+        # as a `Context._remote_error`
+        #
+        # the CASES:
+        #
+        # - if that context IS THE SAME ONE that called
+        #   `Context.cancel()`, we want to absorb the error
+        #   silently and let this `.open_context()` block to exit
+        #   without raising.
+        #
+        # - if it is from some OTHER context (we did NOT call
+        #   `.cancel()`), we want to re-RAISE IT whilst also
+        #   setting our own ctx's "reason for cancel" to be that
+        #   other context's cancellation condition; we set our
+        #   `.canceller: tuple[str, str]` to be same value as
+        #   caught here in a `ContextCancelled.canceller`.
+        #
+        # Again, there are 2 cases:
+        #
+        # 1-some other context opened in this `.open_context()`
+        #   block cancelled due to a self or peer cancellation
+        #   request in which case we DO let the error bubble to the
+        #   opener.
+        #
+        # 2-THIS "caller" task somewhere invoked `Context.cancel()`
+        #   and received a `ContextCancelled` from the "callee"
+        #   task, in which case we mask the `ContextCancelled` from
+        #   bubbling to this "caller" (much like how `trio.Nursery`
+        #   swallows any `trio.Cancelled` bubbled by a call to
+        #   `Nursery.cancel_scope.cancel()`)
+        except ContextCancelled as ctxc:
+            scope_err = ctxc
+
+            # CASE 2: context was cancelled by local task calling
+            # `.cancel()`, we don't raise and the exit block should
+            # exit silently.
+            if (
+                ctx._cancel_called
+                and (
+                    ctxc is ctx._remote_error
+                    or
+                    ctxc.canceller is self.canceller
+                )
+            ):
+                log.debug(
+                    f'Context {ctx} cancelled gracefully with:\n'
+                    f'{ctxc}'
+                )
+            # CASE 1: this context was never cancelled via a local
+            # task (tree) having called `Context.cancel()`, raise
+            # the error since it was caused by someone else!
+            else:
                raise

-            # if the context was cancelled by client code
-            # then we don't need to raise since user code
-            # is expecting this and the block should exit.
-            else:
-                log.debug(f'Context {ctx} cancelled gracefully')
-
+        # the above `._scope` can be cancelled due to:
+        # 1. an explicit self cancel via `Context.cancel()` or
+        #    `Actor.cancel()`,
+        # 2. any "callee"-side remote error, possibly also a cancellation
+        #    request by some peer,
+        # 3. any "caller" (aka THIS scope's) local error raised in the above `yield`
        except (
-            BaseException,
+            # CASE 3: standard local error in this caller/yieldee
+            Exception,

-            # more specifically, we need to handle these but not
-            # sure it's worth being pedantic:
-            # Exception,
-            # trio.Cancelled,
-            # KeyboardInterrupt,
+            # CASES 1 & 2: normally manifested as
+            # a `Context._scope_nursery` raised
+            # exception-group of,
+            # 1.-`trio.Cancelled`s, since
+            #   `._scope.cancel()` will have been called and any
+            #   `ContextCancelled` absorbed and thus NOT RAISED in
+            #   any `Context._maybe_raise_remote_err()`,
+            # 2.-`BaseExceptionGroup[ContextCancelled | RemoteActorError]`
+            #    from any error raised in the "callee" side with
+            #    a group only raised if there was any more than one
+            #    task started here in the "caller" in the
+            #    `yield`-ed to task.
+            BaseExceptionGroup,  # since overrun handler tasks may have been spawned
+            trio.Cancelled,  # NOTE: NOT from inside the ctx._scope
+            KeyboardInterrupt,

        ) as err:
-            etype = type(err)
-            # the context cancels itself on any cancel
-            # causing error.
+            scope_err = err

-            if ctx.chan.connected():
-                log.cancel(
-                    'Context cancelled for task, sending cancel request..\n'
-                    f'task:{cid}\n'
-                    f'actor:{uid}'
-                )
+            # XXX: ALWAYS request the context to CANCEL ON any ERROR.
+            # NOTE: `Context.cancel()` is conversely NEVER CALLED in
+            # the `ContextCancelled` "self cancellation absorbed" case
+            # handled in the block above!
+            log.cancel(
+                'Context cancelled for task due to\n'
+                f'{err}\n'
+                'Sending cancel request..\n'
+                f'task:{cid}\n'
+                f'actor:{uid}'
+            )
+            try:
                await ctx.cancel()
-            else:
+            except trio.BrokenResourceError:
                log.warning(
                    'IPC connection for context is broken?\n'
                    f'task:{cid}\n'
                    f'actor:{uid}'
                )

-            raise
+            raise  # duh

-        finally:
-            # in the case where a runtime nursery (due to internal bug)
-            # or a remote actor transmits an error we want to be
-            # sure we get the error the underlying feeder mem chan.
-            # if it's not raised here it *should* be raised from the
-            # msg loop nursery right?
+        # no local scope error, the "clean exit with a result" case.
+        else:
            if ctx.chan.connected():
                log.info(
                    'Waiting on final context-task result for\n'
                    f'task: {cid}\n'
                    f'actor: {uid}'
                )
-                result = await ctx.result()
-                log.runtime(
-                    f'Context {fn_name} returned '
-                    f'value from callee `{result}`'
-                )
+                # XXX NOTE XXX: the below call to
+                # `Context.result()` will ALWAYS raise
+                # a `ContextCancelled` (via an embedded call to
+                # `Context._maybe_raise_remote_err()`) IFF
+                # a `Context._remote_error` was set by the runtime
+                # via a call to
+                # `Context._maybe_cancel_and_set_remote_error()`.
+                # As per `Context._deliver_msg()`, that error IS
+                # ALWAYS SET any time "callee" side fails and causes "caller
+                # side" cancellation via a `ContextCancelled` here.
+                # result = await ctx.result()
+                try:
+                    result = await ctx.result()
+                    log.runtime(
+                        f'Context {fn_name} returned value from callee:\n'
+                        f'`{result}`'
+                    )
+                except BaseException as berr:
+                    # on normal teardown, if we get some error
+                    # raised in `Context.result()` we still want to
+                    # save that error on the ctx's state to
+                    # determine things like `.cancelled_caught` for
+                    # cases where there was remote cancellation but
+                    # this task didn't know until final teardown
+                    # / value collection.
+                    scope_err = berr
+                    raise

+        finally:
            # though it should be impossible for any tasks
            # operating *in* this scope to have survived
            # we tear down the runtime feeder chan last
            # to avoid premature stream clobbers.
-            if ctx._recv_chan is not None:
-                # should we encapsulate this in the context api?
-                await ctx._recv_chan.aclose()
+            rxchan: trio.ReceiveChannel = ctx._recv_chan
+            if (
+                 rxchan

-            if etype:
+                # maybe TODO: yes i know the below check is
+                # touching `trio` memchan internals..BUT, there are
+                # only a couple ways to avoid a `trio.Cancelled`
+                # bubbling from the `.aclose()` call below:
+                #
+                # - catch and mask it via the cancel-scope-shielded call
+                #   as we are rn (manual and frowned upon) OR,
+                # - specially handle the case where `scope_err` is
+                #   one of {`BaseExceptionGroup`, `trio.Cancelled`}
+                #   and then presume that the `.aclose()` call will
+                #   raise a `trio.Cancelled` and just don't call it
+                #   in those cases..
+                #
+                # that latter approach is more logic, LOC, and more
+                # convoluted so for now stick with the first
+                # pseudo-hack-workaround where we just try to avoid
+                # the shielded call as much as we can detect from
+                # the memchan's `._closed` state..
+                #
+                # XXX MOTIVATION XXX-> we generally want to raise
+                # any underlying actor-runtime/internals error that
+                # surfaces from a bug in tractor itself so it can
+                # be easily detected/fixed AND, we also want to
+                # minimize noisy runtime tracebacks (normally due
+                # to the cross-actor linked task scope machinery
+                # teardown) displayed to user-code and instead only
+                # displaying `ContextCancelled` traces where the
+                # cause of crash/exit IS due to something in
+                # user/app code on either end of the context.
+                and not rxchan._closed
+            ):
+                # XXX NOTE XXX: and again as per above, we mask any
+                # `trio.Cancelled` raised here so as to NOT mask
+                # out any exception group or legit (remote) ctx
+                # error that sourced from the remote task or its
+                # runtime.
+                with trio.CancelScope(shield=True):
+                    await ctx._recv_chan.aclose()
+
+            # XXX: we always raise remote errors locally and
+            # generally speaking mask runtime-machinery related
+            # multi-`trio.Cancelled`s. As such, any `scope_err`
+            # which was the underlying cause of this context's exit
+            # should be stored as the `Context._local_error` and
+            # used in determining `Context.cancelled_caught: bool`.
+            if scope_err is not None:
+                ctx._local_error: BaseException = scope_err
+                etype: Type[BaseException] = type(scope_err)
+
+                # CASE 2
                if ctx._cancel_called:
                    log.cancel(
-                        f'Context {fn_name} cancelled by caller with\n{etype}'
+                        f'Context {fn_name} cancelled by caller with\n'
+                        f'{etype}'
                    )
-                elif _err is not None:
+
+                # CASE 1
+                else:
                    log.cancel(
-                        f'Context for task cancelled by callee with {etype}\n'
+                        f'Context cancelled by callee with {etype}\n'
                        f'target: `{fn_name}`\n'
                        f'task:{cid}\n'
                        f'actor:{uid}'
                    )
+
            # XXX: (MEGA IMPORTANT) if this is a root opened process we
            # wait for any immediate child in debug before popping the
            # context from the runtime msg loop otherwise inside
@ -533,10 +745,10 @@ class Portal:
            # a "stop" msg for a stream), this can result in a deadlock
            # where the root is waiting on the lock to clear but the
            # child has already cleared it and clobbered IPC.
-            from ._debug import maybe_wait_for_debugger
            await maybe_wait_for_debugger()

-            # remove the context from runtime tracking
+            # FINALLY, remove the context from runtime tracking and
+            # exit!
            self.actor._contexts.pop(
                (self.channel.uid, ctx.cid),
                None,
@ -555,7 +767,12 @@ class LocalPortal:
    actor: 'Actor'  # type: ignore # noqa
    channel: Channel

-    async def run_from_ns(self, ns: str, func_name: str, **kwargs) -> Any:
+    async def run_from_ns(
+        self,
+        ns: str,
+        func_name: str,
+        **kwargs,
+    ) -> Any:
        '''
        Run a requested local function from a namespace path and
        return it's result.
--- a/tractor/_root.py
+++ b/tractor/_root.py
@ -25,7 +25,6 @@ import logging
 import signal
 import sys
 import os
-import typing
 import warnings


@ -37,7 +36,7 @@ from ._runtime import (
    Arbiter,
    async_main,
 )
-from . import _debug
+from .devx import _debug
 from . import _spawn
 from . import _state
 from . import log
@ -46,8 +45,14 @@ from ._exceptions import is_multi_cancelled


 # set at startup and after forks
-_default_arbiter_host: str = '127.0.0.1'
-_default_arbiter_port: int = 1616
+_default_host: str = '127.0.0.1'
+_default_port: int = 1616
+
+# default registry always on localhost
+_default_lo_addrs: list[tuple[str, int]] = [(
+    _default_host,
+    _default_port,
+)]


 logger = log.get_logger('tractor')
@ -58,10 +63,10 @@ async def open_root_actor(

    *,
    # defaults are above
-    arbiter_addr: tuple[str, int] | None = None,
+    registry_addrs: list[tuple[str, int]] | None = None,

    # defaults are above
-    registry_addr: tuple[str, int] | None = None,
+    arbiter_addr: tuple[str, int] | None = None,

    name: str | None = 'root',

@ -79,7 +84,11 @@ async def open_root_actor(
    enable_modules: list | None = None,
    rpc_module_paths: list | None = None,

-) -> typing.Any:
+    # NOTE: allow caller to ensure that only one registry exists
+    # and that this call creates it.
+    ensure_registry: bool = False,
+
+) -> Actor:
    '''
    Runtime init entry point for ``tractor``.

@ -89,7 +98,7 @@ async def open_root_actor(
    # https://github.com/python-trio/trio/issues/1155#issuecomment-742964018
    builtin_bp_handler = sys.breakpointhook
    orig_bp_path: str | None = os.environ.get('PYTHONBREAKPOINT', None)
-    os.environ['PYTHONBREAKPOINT'] = 'tractor._debug._set_trace'
+    os.environ['PYTHONBREAKPOINT'] = 'tractor.devx._debug.pause_from_sync'

    # attempt to retreive ``trio``'s sigint handler and stash it
    # on our debugger lock state.
@ -115,20 +124,19 @@ async def open_root_actor(

    if arbiter_addr is not None:
        warnings.warn(
-            '`arbiter_addr` is now deprecated and has been renamed to'
-            '`registry_addr`.\nUse that instead..',
+            '`arbiter_addr` is now deprecated\n'
+            'Use `registry_addrs: list[tuple]` instead..',
            DeprecationWarning,
            stacklevel=2,
        )
+        registry_addrs = [arbiter_addr]

-    registry_addr = (host, port) = (
-        registry_addr
-        or arbiter_addr
-        or (
-            _default_arbiter_host,
-            _default_arbiter_port,
-        )
+    registry_addrs: list[tuple[str, int]] = (
+        registry_addrs
+        or
+        _default_lo_addrs
    )
+    assert registry_addrs

    loglevel = (loglevel or log._default_loglevel).upper()

@ -137,7 +145,7 @@ async def open_root_actor(

        # expose internal debug module to every actor allowing
        # for use of ``await tractor.breakpoint()``
-        enable_modules.append('tractor._debug')
+        enable_modules.append('tractor.devx._debug')

        # if debug mode get's enabled *at least* use that level of
        # logging for some informative console prompts.
@ -157,73 +165,131 @@ async def open_root_actor(

    log.get_console_log(loglevel)

-    try:
-        # make a temporary connection to see if an arbiter exists,
-        # if one can't be made quickly we assume none exists.
-        arbiter_found = False
+    # closed into below ping task-func
+    ponged_addrs: list[tuple[str, int]] = []

-        # TODO: this connect-and-bail forces us to have to carefully
-        # rewrap TCP 104-connection-reset errors as EOF so as to avoid
-        # propagating cancel-causing errors to the channel-msg loop
-        # machinery.  Likely it would be better to eventually have
-        # a "discovery" protocol with basic handshake instead.
-        with trio.move_on_after(1):
-            async with _connect_chan(host, port):
-                arbiter_found = True
+    async def ping_tpt_socket(
+        addr: tuple[str, int],
+        timeout: float = 1,
+    ) -> None:
+        '''
+        Attempt temporary connection to see if a registry is
+        listening at the requested address by a transport layer
+        ping.

-    except OSError:
-        # TODO: make this a "discovery" log level?
-        logger.warning(f"No actor registry found @ {host}:{port}")
+        If a connection can't be made quickly we assume no
+        server is listening at that addr.

-    # create a local actor and start up its main routine/task
-    if arbiter_found:
+        '''
+        try:
+            # TODO: this connect-and-bail forces us to have to
+            # carefully rewrap TCP 104-connection-reset errors as
+            # EOF so as to avoid propagating cancel-causing errors
+            # to the channel-msg loop machinery. Likely it would
+            # be better to eventually have a "discovery" protocol
+            # with basic handshake instead?
+            with trio.move_on_after(timeout):
+                async with _connect_chan(*addr):
+                    ponged_addrs.append(addr)
+
+        except OSError:
+            # TODO: make this a "discovery" log level?
+            logger.warning(f'No actor registry found @ {addr}')
+
+    async with trio.open_nursery() as tn:
+        for addr in registry_addrs:
+            tn.start_soon(
+                ping_tpt_socket,
+                tuple(addr),  # TODO: just drop this requirement?
+            )
+
+    trans_bind_addrs: list[tuple[str, int]] = []
+
+    # Create a new local root-actor instance which IS NOT THE
+    # REGISTRAR
+    if ponged_addrs:
+
+        if ensure_registry:
+            raise RuntimeError(
+                 f'Failed to open `{name}`@{ponged_addrs}: '
+                'registry socket(s) already bound'
+            )

        # we were able to connect to an arbiter
-        logger.info(f"Arbiter seems to exist @ {host}:{port}")
+        logger.info(
+            f'Registry(s) seem(s) to exist @ {ponged_addrs}'
+        )

        actor = Actor(
-            name or 'anonymous',
-            arbiter_addr=registry_addr,
+            name=name or 'anonymous',
+            registry_addrs=ponged_addrs,
            loglevel=loglevel,
            enable_modules=enable_modules,
        )
-        host, port = (host, 0)
+        # DO NOT use the registry_addrs as the transport server
+        # addrs for this new non-registrar, root-actor.
+        for host, port in ponged_addrs:
+            # NOTE: zero triggers dynamic OS port allocation
+            trans_bind_addrs.append((host, 0))

+    # Start this local actor as the "registrar", aka a regular
+    # actor who manages the local registry of "mailboxes" of
+    # other process-tree-local sub-actors.
    else:
-        # start this local actor as the arbiter (aka a regular actor who
-        # manages the local registry of "mailboxes")

-        # Note that if the current actor is the arbiter it is desirable
-        # for it to stay up indefinitely until a re-election process has
-        # taken place - which is not implemented yet FYI).
+        # NOTE that if the current actor IS THE REGISTRAR, the
+        # following init steps are taken:
+        # - the transport layer server is bound to each (host, port)
+        #   pair defined in provided registry_addrs, or the default.
+        trans_bind_addrs = registry_addrs
+
+        # - it is normally desirable for any registrar to stay up
+        #   indefinitely until either all registered (child/sub)
+        #   actors are terminated (via SC supervision) or,
+        #   a re-election process has taken place. 
+        # NOTE: all of ^ which is not implemented yet - see:
+        # https://github.com/goodboy/tractor/issues/216
+        # https://github.com/goodboy/tractor/pull/348
+        # https://github.com/goodboy/tractor/issues/296

        actor = Arbiter(
-            name or 'arbiter',
-            arbiter_addr=registry_addr,
+            name or 'registrar',
+            registry_addrs=registry_addrs,
            loglevel=loglevel,
            enable_modules=enable_modules,
        )

+    # Start up main task set via core actor-runtime nurseries.
    try:
        # assign process-local actor
        _state._current_actor = actor

        # start local channel-server and fake the portal API
        # NOTE: this won't block since we provide the nursery
-        logger.info(f"Starting local {actor} @ {host}:{port}")
+        ml_addrs_str: str = '\n'.join(
+            f'@{addr}' for addr in trans_bind_addrs
+        )
+        logger.info(
+            f'Starting local {actor.uid} on the following transport addrs:\n'
+            f'{ml_addrs_str}'
+        )

        # start the actor runtime in a new task
        async with trio.open_nursery() as nursery:

-            # ``_runtime.async_main()`` creates an internal nursery and
-            # thus blocks here until the entire underlying actor tree has
-            # terminated thereby conducting structured concurrency.
-
+            # ``_runtime.async_main()`` creates an internal nursery
+            # and blocks here until any underlying actor(-process)
+            # tree has terminated thereby conducting so called
+            # "end-to-end" structured concurrency throughout an
+            # entire hierarchical python sub-process set; all
+            # "actor runtime" primitives are SC-compat and thus all
+            # transitively spawned actors/processes must be as
+            # well.
            await nursery.start(
                partial(
                    async_main,
                    actor,
-                    accept_addr=(host, port),
+                    accept_addrs=trans_bind_addrs,
                    parent_addr=None
                )
            )
@ -235,12 +301,16 @@ async def open_root_actor(
                BaseExceptionGroup,
            ) as err:

-                entered = await _debug._maybe_enter_pm(err)
+                entered: bool = await _debug._maybe_enter_pm(err)

-                if not entered and not is_multi_cancelled(err):
+                if (
+                    not entered
+                    and not is_multi_cancelled(err)
+                ):
                    logger.exception("Root actor crashed:")

-                # always re-raise
+                # ALWAYS re-raise any error bubbled up from the
+                # runtime!
                raise

            finally:
@ -254,11 +324,13 @@ async def open_root_actor(
                #         tempn.start_soon(an.exited.wait)

                logger.cancel("Shutting down root actor")
-                await actor.cancel()
+                await actor.cancel(
+                    requesting_uid=actor.uid,
+                )
    finally:
        _state._current_actor = None

-        # restore breakpoint hook state
+        # restore built-in `breakpoint()` hook state
        sys.breakpointhook = builtin_bp_handler
        if orig_bp_path is not None:
            os.environ['PYTHONBREAKPOINT'] = orig_bp_path
@ -274,10 +346,7 @@ def run_daemon(

    # runtime kwargs
    name: str | None = 'root',
-    registry_addr: tuple[str, int] = (
-        _default_arbiter_host,
-        _default_arbiter_port,
-    ),
+    registry_addrs: list[tuple[str, int]] = _default_lo_addrs,

    start_method: str | None = None,
    debug_mode: bool = False,
@ -301,7 +370,7 @@ def run_daemon(
    async def _main():

        async with open_root_actor(
-            registry_addr=registry_addr,
+            registry_addrs=registry_addrs,
            name=name,
            start_method=start_method,
            debug_mode=debug_mode,
--- a/tractor/_runtime.py
+++ b/tractor/_runtime.py
--- a/tractor/_shm.py
+++ b/tractor/_shm.py
@ -0,0 +1,833 @@
+# tractor: structured concurrent "actors".
+# Copyright 2018-eternity Tyler Goodlet.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+"""
+SC friendly shared memory management geared at real-time
+processing.
+
+Support for ``numpy`` compatible array-buffers is provided but is
+considered optional within the context of this runtime-library.
+
+"""
+from __future__ import annotations
+from sys import byteorder
+import time
+from typing import Optional
+from multiprocessing import shared_memory as shm
+from multiprocessing.shared_memory import (
+    SharedMemory,
+    ShareableList,
+)
+
+from msgspec import Struct
+import tractor
+
+from .log import get_logger
+
+
+_USE_POSIX = getattr(shm, '_USE_POSIX', False)
+if _USE_POSIX:
+    from _posixshmem import shm_unlink
+
+
+try:
+    import numpy as np
+    from numpy.lib import recfunctions as rfn
+    import nptyping
+except ImportError:
+    pass
+
+
+log = get_logger(__name__)
+
+
+def disable_mantracker():
+    '''
+    Switch off the ``multiprocessing`` "resource tracking" machinery
+    for this process since its multi-threaded design does not fit
+    a structured concurrent (SC) runtime.
+
+    A no-op ``ResourceTracker`` subclass instance is installed as
+    ``resource_tracker._resource_tracker`` and the module-level
+    ``register``, ``ensure_running``, ``unregister`` and ``getfd``
+    aliases are re-pointed at that instance's bound methods.
+
+    '''
+    from multiprocessing import resource_tracker as mantracker
+
+    class ManTracker(mantracker.ResourceTracker):
+        # every tracking hook is overridden to do nothing.
+        def register(self, name, rtype):
+            return None
+
+        def unregister(self, name, rtype):
+            return None
+
+        def ensure_running(self):
+            return None
+
+    # "know your land and know your prey"
+    # https://www.dailymotion.com/video/x6ozzco
+    noop_tracker = ManTracker()
+    mantracker._resource_tracker = noop_tracker
+
+    # NOTE: `.getfd()` is not overridden above and so still resolves
+    # to the stock `ResourceTracker` implementation.
+    for hook in (
+        'register',
+        'ensure_running',
+        'unregister',
+        'getfd',
+    ):
+        setattr(mantracker, hook, getattr(noop_tracker, hook))
+
+
+disable_mantracker()
+
+
+class SharedInt:
+    '''
+    ``int`` value stored in (and read back from) a ``SharedMemory``
+    segment; used as an index counter.
+
+    '''
+    def __init__(
+        self,
+        shm: SharedMemory,
+    ) -> None:
+        self._shm = shm
+
+    @property
+    def value(self) -> int:
+        # decode the *entire* segment buffer using the host's native
+        # byte order.
+        raw = self._shm.buf
+        return int.from_bytes(raw, byteorder)
+
+    @value.setter
+    def value(self, value) -> None:
+        # encode to exactly the segment's size and overwrite the
+        # whole buffer in place.
+        nbytes = self._shm.size
+        self._shm.buf[:] = value.to_bytes(nbytes, byteorder)
+
+    def destroy(self) -> None:
+        '''
+        Unlink the backing segment; only acts when ``_USE_POSIX`` is
+        set, otherwise this is a no-op.
+
+        '''
+        if not _USE_POSIX:
+            return
+
+        # We manually unlink to bypass all the "resource tracker"
+        # machinery meant for non-SC systems.
+        name = self._shm.name
+        try:
+            shm_unlink(name)
+        except FileNotFoundError:
+            # might be a teardown race here?
+            log.warning(f'Shm for {name} already unlinked?')
+
+
+class NDToken(Struct, frozen=True):
+    '''
+    Internal representation of a shared memory ``numpy`` array "token"
+    which can be used to key and load a system (OS) wide shm entry
+    and correctly read the array by type signature.
+
+    This type is msg safe.
+
+    '''
+    shm_name: str  # this serves as a "key" value
+    shm_first_index_name: str
+    shm_last_index_name: str
+    dtype_descr: tuple
+    size: int  # in struct-array index / row terms
+
+    # TODO: use nptyping here on dtypes
+    @property
+    def dtype(self) -> list[tuple[str, str, tuple[int, ...]]]:
+        '''
+        The ``numpy`` dtype description rebuilt from ``.dtype_descr``
+        (entries are coerced to tuples first).
+
+        '''
+        return np.dtype(
+            list(
+                map(tuple, self.dtype_descr)
+            )
+        ).descr
+
+    def as_msg(self) -> dict:
+        '''
+        Render this token as a plain ``dict`` keyed by field name.
+
+        '''
+        # NOTE: a plain `msgspec.Struct` has no `.to_dict()` method so
+        # build the mapping from the declared struct fields instead.
+        return {
+            name: getattr(self, name)
+            for name in self.__struct_fields__
+        }
+
+    @classmethod
+    def from_msg(cls, msg: dict) -> NDToken:
+        '''
+        Build a token from its ``dict`` form; an ``NDToken`` input is
+        returned as is.
+
+        '''
+        if isinstance(msg, NDToken):
+            return msg
+
+        # TODO: native struct decoding
+        # return _token_dec.decode(msg)
+
+        # work on a copy so the caller's mapping is never mutated.
+        fields = dict(msg)
+        fields['dtype_descr'] = tuple(map(tuple, fields['dtype_descr']))
+        return NDToken(**fields)
+
+
+# _token_dec = msgspec.msgpack.Decoder(NDToken)
+
+# TODO: this api?
+# _known_tokens = tractor.ActorVar('_shm_tokens', {})
+# _known_tokens = tractor.ContextStack('_known_tokens', )
+# _known_tokens = trio.RunVar('shms', {})
+
+# TODO: this should maybe be provided via
+# a `.trionics.maybe_open_context()` wrapper factory?
+# process-local store of keys to tokens
+_known_tokens: dict[str, NDToken] = {}
+
+
+def get_shm_token(key: str) -> NDToken | None:
+    '''
+    Look up ``key`` in this process' token cache.
+
+    Returns whatever entry is cached for the key or ``None`` when
+    the key is not known by this process.
+
+    '''
+    try:
+        return _known_tokens[key]
+    except KeyError:
+        return None
+
+
+def _make_token(
+    key: str,
+    size: int,
+    dtype: np.dtype,
+
+) -> NDToken:
+    '''
+    Build the serializable token describing a shared array: the
+    segment name, the derived "first"/"last" index segment names,
+    the dtype description and the row count.
+
+    '''
+    first_name = key + "_first"
+    last_name = key + "_last"
+    descr = tuple(np.dtype(dtype).descr)
+    return NDToken(
+        shm_name=key,
+        shm_first_index_name=first_name,
+        shm_last_index_name=last_name,
+        dtype_descr=descr,
+        size=size,
+    )
+
+
+class ShmArray:
+    '''
+    A shared memory ``numpy.ndarray`` API.
+
+    An underlying shared memory buffer is allocated based on
+    a user specified ``numpy.ndarray``. This fixed size array
+    can be read and written to by pushing data both onto the "front"
+    or "back" of a set index range. The indexes for the "first" and
+    "last" index are themselves stored in shared memory (accessed via
+    ``SharedInt`` interfaces) values such that multiple processes can
+    interact with the same array using a synchronized-index.
+
+    '''
+    def __init__(
+        self,
+        shmarr: np.ndarray,
+        first: SharedInt,
+        last: SharedInt,
+        shm: SharedMemory,
+        # readonly: bool = True,
+    ) -> None:
+        self._array = shmarr
+
+        # shared counters bounding the range of filled data
+        self._first = first
+        self._last = last
+
+        self._len = len(shmarr)
+        self._shm = shm
+
+        # flipped to `True` by the first successful `.push()`
+        self._post_init: bool = False
+
+        # pushing data does not write the index (aka primary key):
+        # for structured dtypes the first field is skipped.
+        self._write_fields: list[str] | None = None
+        dtype = shmarr.dtype
+        if dtype.fields:
+            self._write_fields = list(shmarr.dtype.fields.keys())[1:]
+
+    # TODO: ringbuf api?
+
+    @property
+    def _token(self) -> NDToken:
+        return NDToken(
+            shm_name=self._shm.name,
+            shm_first_index_name=self._first._shm.name,
+            shm_last_index_name=self._last._shm.name,
+            dtype_descr=tuple(self._array.dtype.descr),
+            size=self._len,
+        )
+
+    @property
+    def token(self) -> dict:
+        '''
+        Shared memory token that can be serialized and used by
+        another process to attach to this array.
+
+        '''
+        return self._token.as_msg()
+
+    @property
+    def index(self) -> int:
+        return self._last.value % self._len
+
+    @property
+    def array(self) -> np.ndarray:
+        '''
+        Return an up-to-date ``np.ndarray`` view of the
+        so-far-written data to the underlying shm buffer.
+
+        Raises ``RuntimeError`` if the view is empty after this
+        instance has already pushed data at least once.
+
+        '''
+        a = self._array[self._first.value:self._last.value]
+
+        # first, last = self._first.value, self._last.value
+        # a = self._array[first:last]
+
+        # TODO: eventually comment this once we've not seen it in the
+        # wild in a long time..
+        # XXX: race where first/last indexes cause a reader
+        # to load an empty array..
+        if len(a) == 0 and self._post_init:
+            raise RuntimeError('Empty array race condition hit!?')
+            # breakpoint()
+
+        return a
+
+    def ustruct(
+        self,
+        fields: Optional[list[str]] = None,
+
+        # type that all field values will be cast to
+        # in the returned view.
+        common_dtype: np.dtype = float,
+
+    ) -> np.ndarray:
+        '''
+        Return an unstructured copy of the full underlying array,
+        optionally restricted to ``fields``.
+
+        NOTE(review): ``common_dtype`` is currently not passed on to
+        ``structured_to_unstructured()`` (see the commented ``dtype``
+        kwarg below) and so has no effect.
+
+        '''
+        array = self._array
+
+        if fields:
+            selection = array[fields]
+            # fcount = len(fields)
+        else:
+            selection = array
+            # fcount = len(array.dtype.fields)
+
+        # XXX: manual ``.view()`` attempt that also doesn't work.
+        # uview = selection.view(
+        #     dtype='<f16',
+        # ).reshape(-1, 4, order='A')
+
+        # assert len(selection) == len(uview)
+
+        u = rfn.structured_to_unstructured(
+            selection,
+            # dtype=float,
+            copy=True,
+        )
+
+        # unstruct = np.ndarray(u.shape, dtype=a.dtype, buffer=shm.buf)
+        # array[:] = a[:]
+        return u
+        # return ShmArray(
+        #     shmarr=u,
+        #     first=self._first,
+        #     last=self._last,
+        #     shm=self._shm
+        # )
+
+    def last(
+        self,
+        length: int = 1,
+
+    ) -> np.ndarray:
+        '''
+        Return the last ``length``'s worth of ("row") entries from the
+        array.
+
+        '''
+        return self.array[-length:]
+
+    def push(
+        self,
+        data: np.ndarray,
+
+        field_map: Optional[dict[str, str]] = None,
+        prepend: bool = False,
+        update_first: bool = True,
+        start: int | None = None,
+
+    ) -> int:
+        '''
+        Ring buffer like "push" to append data
+        into the buffer and return updated "last" index.
+
+        With ``prepend=True`` the data is instead written so that it
+        ends at ``start`` (or the current "first" index) and
+        a ``ValueError`` is raised when that would run below index 0.
+
+        ``field_map`` maps source (``data``) field names to
+        destination field names; without it the non-index fields of
+        this array are used for both.
+
+        NB: no actual ring logic yet to give a "loop around" on overflow
+        condition, lel.
+
+        '''
+        length = len(data)
+
+        if prepend:
+            # NOTE(review): `start or ..` treats `start=0` the same
+            # as `start=None` unlike the append branch below; confirm
+            # that is intended.
+            index = (start or self._first.value) - length
+
+            if index < 0:
+                raise ValueError(
+                    f'Array size of {self._len} was overrun during prepend.\n'
+                    f'You have passed {abs(index)} too many datums.'
+                )
+
+        else:
+            index = start if start is not None else self._last.value
+
+        end = index + length
+
+        if field_map:
+            src_names, dst_names = zip(*field_map.items())
+        else:
+            dst_names = src_names = self._write_fields
+
+        try:
+            self._array[
+                list(dst_names)
+            ][index:end] = data[list(src_names)][:]
+
+            # NOTE: there was a race here between updating
+            # the first and last indices and when the next reader
+            # tries to access ``.array`` (which due to the index
+            # overlap will be empty). Pretty sure we've fixed it now
+            # but leaving this here as a reminder.
+            if (
+                prepend
+                and update_first
+                and length
+            ):
+                assert index < self._first.value
+
+            if (
+                index < self._first.value
+                and update_first
+            ):
+                assert prepend, 'prepend=True not passed but index decreased?'
+                self._first.value = index
+
+            elif not prepend:
+                self._last.value = end
+
+            self._post_init = True
+            return end
+
+        except ValueError as err:
+            if field_map:
+                raise
+
+            # should raise if diff detected
+            self.diff_err_fields(data)
+            raise err
+
+    def diff_err_fields(
+        self,
+        data: np.ndarray,
+    ) -> None:
+        '''
+        Raise a ``TypeError`` naming any field which exists in only
+        one of this array's dtype and ``data``'s dtype.
+
+        '''
+        # reraise with any field discrepancy
+        our_fields, their_fields = (
+            set(self._array.dtype.fields),
+            set(data.dtype.fields),
+        )
+
+        only_in_ours = our_fields - their_fields
+        only_in_theirs = their_fields - our_fields
+
+        if only_in_ours:
+            raise TypeError(
+                f"Input array is missing field(s): {only_in_ours}"
+            )
+        elif only_in_theirs:
+            raise TypeError(
+                f"Input array has unknown field(s): {only_in_theirs}"
+            )
+
+    # TODO: support "silent" prepends that don't update ._first.value?
+    def prepend(
+        self,
+        data: np.ndarray,
+    ) -> int:
+        '''
+        Push ``data`` in front of the current "first" index and
+        return the end index reported by ``.push()``.
+
+        '''
+        end = self.push(data, prepend=True)
+        assert end
+        # previously fell off the end and returned `None` despite the
+        # `-> int` annotation.
+        return end
+
+    def close(self) -> None:
+        '''
+        Close this process' handles to the array and both index
+        segments.
+
+        '''
+        self._first._shm.close()
+        self._last._shm.close()
+        self._shm.close()
+
+    def destroy(self) -> None:
+        '''
+        Unlink the array segment (when ``_USE_POSIX`` is set) and
+        then destroy both index segments.
+
+        '''
+        if _USE_POSIX:
+            # We manually unlink to bypass all the "resource tracker"
+            # nonsense meant for non-SC systems.
+            name = self._shm.name
+            try:
+                shm_unlink(name)
+            except FileNotFoundError:
+                # same tolerance as `SharedInt.destroy()`: never let
+                # an already unlinked array segment prevent the index
+                # segments below from being unlinked.
+                log.warning(f'Shm for {name} already unlinked?')
+
+        self._first.destroy()
+        self._last.destroy()
+
+    def flush(self) -> None:
+        # TODO: flush to storage backend like markestore?
+        ...
+
+
+def open_shm_ndarray(
+    size: int,
+    key: str | None = None,
+    dtype: np.dtype | None = None,
+    append_start_index: int | None = None,
+    readonly: bool = False,
+
+) -> ShmArray:
+    '''
+    Open a memory shared ``numpy`` array using the standard library.
+
+    Three segments are created: the array buffer itself (named
+    ``key``) plus one segment each for the "first" and "last" index
+    counters. Both counters start at ``append_start_index``.
+
+    Teardown (``ShmArray.close()`` and ``.destroy()``) is registered
+    on the current actor's ``lifetime_stack`` and thus this should be
+    used from the parent-most accessor (process).
+
+    NOTE(review): the defaults look unusable as written,
+    - ``key=None`` reaches ``_make_token()`` which concatenates
+      ``key + "_first"``.
+    - the ``a['index']`` assignment below needs a structured
+      ``dtype`` containing an ``'index'`` field.
+
+    '''
+    # create new shared mem segment for which we
+    # have write permission
+    a = np.zeros(size, dtype=dtype)
+    a['index'] = np.arange(len(a))
+
+    shm = SharedMemory(
+        name=key,
+        create=True,
+        size=a.nbytes
+    )
+    array = np.ndarray(
+        a.shape,
+        dtype=a.dtype,
+        buffer=shm.buf
+    )
+    array[:] = a[:]
+    array.setflags(write=int(not readonly))
+
+    token = _make_token(
+        key=key,
+        size=size,
+        dtype=dtype,
+    )
+
+    # create single entry segments for storing the first and last
+    # indices
+    first = SharedInt(
+        shm=SharedMemory(
+            name=token.shm_first_index_name,
+            create=True,
+            size=4,  # std int
+        )
+    )
+
+    last = SharedInt(
+        shm=SharedMemory(
+            name=token.shm_last_index_name,
+            create=True,
+            size=4,  # std int
+        )
+    )
+
+    # Start the "real-time" append-updated (or "pushed-to") section
+    # after some start index: ``append_start_index``. This allows appending
+    # from a start point in the array which isn't the 0 index and looks
+    # something like,
+    # -------------------------
+    # |              |        i
+    # _________________________
+    # <-------------> <------->
+    #  history         real-time
+    #
+    # Once fully "prepended", the history section will leave the
+    # ``ShmArray._first.value: int = 0`` and the yet-to-be written
+    # real-time section will start at ``ShmArray.index: int``.
+
+    # this sets the index to nearly 2/3rds into the length of
+    # the buffer leaving at least a "days worth of second samples"
+    # for the real-time section.
+    if append_start_index is None:
+        append_start_index = round(size * 0.616)
+
+    last.value = first.value = append_start_index
+
+    shmarr = ShmArray(
+        array,
+        first,
+        last,
+        shm,
+    )
+
+    # sanity: the names reported by the created segments must match
+    # the token built from ``key``.
+    assert shmarr._token == token
+
+    # NOTE: the ``dict`` (msg) form of the token is what gets cached
+    # here.
+    _known_tokens[key] = shmarr.token
+
+    # "unlink" created shm on process teardown by
+    # pushing teardown calls onto actor context stack
+    stack = tractor.current_actor().lifetime_stack
+    stack.callback(shmarr.close)
+    stack.callback(shmarr.destroy)
+
+    return shmarr
+
+
+def attach_shm_ndarray(
+    token: tuple[str, str, tuple[str, str]],
+    readonly: bool = True,
+
+) -> ShmArray:
+    '''
+    Attach to an existing shared memory array previously
+    created by another process using ``open_shm_ndarray()``.
+
+    No new shared mem is allocated but wrapper types for read/write
+    access are constructed.
+
+    ``token`` is normalized through ``NDToken.from_msg()`` which
+    takes either an ``NDToken`` or its ``dict`` form.
+
+    Attaching to the array segment is attempted up to 3 times
+    (sleeping 0.1s after each ``OSError``) before the last error is
+    re-raised.
+
+    '''
+    token = NDToken.from_msg(token)
+    key = token.shm_name
+
+    # any token already cached for this key must match the one
+    # passed in.
+    if key in _known_tokens:
+        assert NDToken.from_msg(_known_tokens[key]) == token, "WTF"
+
+    # XXX: ugh, looks like due to the ``shm_open()`` C api we can't
+    # actually place files in a subdir, see discussion here:
+    # https://stackoverflow.com/a/11103289
+
+    # attach to array buffer and view as per dtype
+    _err: Optional[Exception] = None
+    for _ in range(3):
+        try:
+            shm = SharedMemory(
+                name=key,
+                create=False,
+            )
+            break
+        except OSError as oserr:
+            _err = oserr
+            # NOTE(review): this is a blocking (non-async) sleep.
+            time.sleep(0.1)
+    else:
+        # only reached when no attempt hit the ``break`` above.
+        if _err:
+            raise _err
+
+    shmarr = np.ndarray(
+        (token.size,),
+        dtype=token.dtype,
+        buffer=shm.buf
+    )
+    shmarr.setflags(write=int(not readonly))
+
+    first = SharedInt(
+        shm=SharedMemory(
+            name=token.shm_first_index_name,
+            create=False,
+            size=4,  # std int
+        ),
+    )
+    last = SharedInt(
+        shm=SharedMemory(
+            name=token.shm_last_index_name,
+            create=False,
+            size=4,  # std int
+        ),
+    )
+
+    # make sure we can read
+    first.value
+
+    sha = ShmArray(
+        shmarr,
+        first,
+        last,
+        shm,
+    )
+    # read test
+    sha.array
+
+    # Stash key -> token knowledge for future queries
+    # via `maybe_open_shm_ndarray()` but only after we know
+    # we can attach.
+    if key not in _known_tokens:
+        _known_tokens[key] = token
+
+    # "close" attached shm on actor teardown
+    tractor.current_actor().lifetime_stack.callback(sha.close)
+
+    return sha
+
+
+def maybe_open_shm_ndarray(
+    key: str,  # unique identifier for segment
+    size: int,
+    dtype: np.dtype | None = None,
+    append_start_index: int = 0,
+    readonly: bool = True,
+
+) -> tuple[ShmArray, bool]:
+    '''
+    Attempt to attach to a shared memory block using a "key" lookup
+    to registered blocks in the users overall "system" registry
+    (presumes you don't have the block's explicit token).
+
+    This function is meant to solve the problem of discovering whether
+    a shared array token has been allocated or discovered by the actor
+    running in **this** process. Systems where multiple actors may seek
+    to access a common block can use this function to attempt to acquire
+    a token as discovered by the actors who have previously stored
+    a "key" -> ``NDToken`` map in an actor local (aka python global)
+    variable.
+
+    The lookup order is,
+    - attach using the token cached for ``key`` in this process,
+    - else, when a ``dtype`` is provided, attach using a token
+      derived from ``key``, ``size`` and ``dtype``,
+    - else allocate a new block via ``open_shm_ndarray()``.
+
+    Returns the array and a flag which is ``True`` only when the
+    block was newly opened by this call.
+
+    If you know the explicit ``NDToken`` for your memory segment instead
+    use ``attach_shm_ndarray()``.
+
+    '''
+    try:
+        # see if we already know this key
+        token = _known_tokens[key]
+        return (
+            attach_shm_ndarray(
+                token=token,
+                readonly=readonly,
+            ),
+            False,  # not newly opened
+        )
+    except KeyError:
+        log.warning(f"Could not find {key} in shms cache")
+
+        # NOTE: the attach attempt must live in the *same* branch
+        # which builds the token; it previously sat in an `else:`
+        # where `token` was never bound.
+        # An explicit `None` check is used since a non-structured
+        # `np.dtype` instance is falsy (it has zero fields).
+        if dtype is not None:
+            token = _make_token(
+                key,
+                size=size,
+                dtype=dtype,
+            )
+            try:
+                return (
+                    attach_shm_ndarray(
+                        token=token,
+                        readonly=readonly,
+                    ),
+                    False,
+                )
+            except FileNotFoundError:
+                log.warning(f"Could not attach to shm with token {token}")
+
+        # This actor does not know about memory
+        # associated with the provided "key".
+        # Attempt to open a block and expect
+        # to fail if a block has been allocated
+        # on the OS by someone else.
+        return (
+            open_shm_ndarray(
+                key=key,
+                size=size,
+                dtype=dtype,
+                append_start_index=append_start_index,
+                readonly=readonly,
+            ),
+            True,
+        )
+
+
+class ShmList(ShareableList):
+    '''
+    Carbon copy of ``.shared_memory.ShareableList`` with a few
+    enhancements:
+
+    - readonly mode via instance var flag  `._readonly: bool`
+    - ``.__getitem__()`` accepts ``slice`` inputs
+    - exposes the underlying buffer "name" as a ``.key: str``
+
+    '''
+    def __init__(
+        self,
+        sequence: list | None = None,
+        *,
+        name: str | None = None,
+        readonly: bool = True
+
+    ) -> None:
+        self._readonly = readonly
+        self._key = name
+        super().__init__(
+            sequence=sequence,
+            name=name,
+        )
+        if self._key is None:
+            # no name was requested: report the name of the segment
+            # which was actually allocated instead of `None`.
+            self._key = self.shm.name
+
+    @property
+    def key(self) -> str:
+        return self._key
+
+    @property
+    def readonly(self) -> bool:
+        return self._readonly
+
+    def __setitem__(
+        self,
+        position,
+        value,
+
+    ) -> None:
+
+        # mimick ``numpy`` error
+        if self._readonly:
+            raise ValueError('assignment destination is read-only')
+
+        return super().__setitem__(position, value)
+
+    def __getitem__(
+        self,
+        indexish,
+    ) -> list:
+
+        # NOTE: this is a non-writeable view (copy?) of the buffer
+        # in a new list instance.
+        if isinstance(indexish, slice):
+            return list(self)[indexish]
+
+        # any non-slice index is handled by the parent type and
+        # yields the single stored item.
+        return super().__getitem__(indexish)
+
+    # TODO: should we offer a `.array` and `.push()` equivalent
+    # to the `ShmArray`?
+    # currently we have the following limitations:
+    # - can't write slices of input using traditional slice-assign
+    #   syntax due to the ``ShareableList.__setitem__()`` implementation.
+    # - ``list(shmlist)`` returns a non-mutable copy instead of
+    #   a writeable view which would be handier numpy-style ops.
+
+
+def open_shm_list(
+    key: str,
+    sequence: list | None = None,
+    size: int = int(2 ** 10),
+    dtype: float | int | bool | str | bytes | None = float,
+    readonly: bool = True,
+
+) -> ShmList:
+    '''
+    Allocate a new ``ShmList`` named ``key``.
+
+    When no ``sequence`` is provided the list is filled with
+    ``size`` copies of a placeholder value chosen by ``dtype``.
+
+    If the ``tractor`` runtime is active, closing and unlinking of
+    the segment is registered on the current actor's
+    ``lifetime_stack``; otherwise a warning is logged and teardown
+    is left to the caller.
+
+    '''
+    if sequence is None:
+        default = {
+            float: 0.,
+            int: 0,
+            bool: True,
+            str: 'doggy',
+            # `bytes` is advertised by the `dtype` annotation but
+            # had no entry here (-> `KeyError`).
+            bytes: b'',
+            None: None,
+        }[dtype]
+        sequence = [default] * size
+
+    shml = ShmList(
+        sequence=sequence,
+        name=key,
+        readonly=readonly,
+    )
+
+    # "close" attached shm on actor teardown
+    try:
+        actor = tractor.current_actor()
+        actor.lifetime_stack.callback(shml.shm.close)
+        actor.lifetime_stack.callback(shml.shm.unlink)
+    except RuntimeError:
+        log.warning('tractor runtime not active, skipping teardown steps')
+
+    return shml
+
+
+def attach_shm_list(
+    key: str,
+    readonly: bool = False,
+
+) -> ShmList:
+    '''
+    Wrap the shared list named ``key`` in a ``ShmList``; no
+    ``sequence`` is passed so nothing is written by this call.
+
+    '''
+    shml = ShmList(
+        name=key,
+        readonly=readonly,
+    )
+    return shml
--- a/tractor/_spawn.py
+++ b/tractor/_spawn.py
@ -19,6 +19,7 @@ Machinery for actor process spawning using multiple backends.

 """
 from __future__ import annotations
+import multiprocessing as mp
 import sys
 import platform
 from typing import (
@ -34,7 +35,7 @@ from exceptiongroup import BaseExceptionGroup
 import trio
 from trio_typing import TaskStatus

-from ._debug import (
+from .devx._debug import (
    maybe_wait_for_debugger,
    acquire_debug_lock,
 )
@ -53,7 +54,6 @@ from ._exceptions import ActorFailure

 if TYPE_CHECKING:
    from ._supervise import ActorNursery
-    import multiprocessing as mp
    ProcessType = TypeVar('ProcessType', mp.Process, trio.Process)

 log = get_logger('tractor')
@ -70,7 +70,6 @@ _spawn_method: SpawnMethodKey = 'trio'

 if platform.system() == 'Windows':

-    import multiprocessing as mp
    _ctx = mp.get_context("spawn")

    async def proc_waiter(proc: mp.Process) -> None:
@ -200,7 +199,26 @@ async def do_hard_kill(
    proc: trio.Process,
    terminate_after: int = 3,

+    # NOTE: for mucking with `.pause()`-ing inside the runtime
+    # whilst also hacking on it XD
+    # terminate_after: int = 99999,
+
 ) -> None:
+    '''
+    Un-gracefully terminate an OS level `trio.Process` after timeout.
+
+    Used in 2 main cases:
+
+    - "unknown remote runtime state": a hanging/stalled actor that
+      isn't responding after sending a (graceful) runtime cancel
+      request via an IPC msg.
+    - "cancelled during spawn": a process whose actor runtime was
+      cancelled before full startup completed (such that
+      cancel-request-handling machinery was never fully
+      initialized) and thus a "cancel request msg" is never going
+      to be handled.
+
+    '''
    # NOTE: this timeout used to do nothing since we were shielding
    # the ``.wait()`` inside ``new_proc()`` which will pretty much
    # never release until the process exits, now it acts as
@ -216,6 +234,9 @@ async def do_hard_kill(
        # and wait for it to exit. If cancelled, kills the process and
        # waits for it to finish exiting before propagating the
        # cancellation.
+        #
+        # This code was originally triggered by ``proc.__aexit__()``
+        # but now must be called manually.
        with trio.CancelScope(shield=True):
            if proc.stdin is not None:
                await proc.stdin.aclose()
@ -231,10 +252,14 @@ async def do_hard_kill(
                with trio.CancelScope(shield=True):
                    await proc.wait()

+    # XXX NOTE XXX: zombie squad dispatch:
+    # (should ideally never, but) If we do get here it means
+    # graceful termination of a process failed and we need to
+    # resort to OS level signalling to interrupt and cancel the
+    # (presumably stalled or hung) actor. Since we never allow
+    # zombies (as a feature) we ask the OS to send in the
+    # removal squad as the last resort.
    if cs.cancelled_caught:
-        # XXX: should pretty much never get here unless we have
-        # to move the bits from ``proc.__aexit__()`` out and
-        # into here.
        log.critical(f"#ZOMBIE_LORD_IS_HERE: {proc}")
        proc.kill()

@ -249,10 +274,13 @@ async def soft_wait(
    portal: Portal,

 ) -> None:
-    # Wait for proc termination but **dont' yet** call
-    # ``trio.Process.__aexit__()`` (it tears down stdio
-    # which will kill any waiting remote pdb trace).
-    # This is a "soft" (cancellable) join/reap.
+    '''
+    Wait for proc termination but **don't yet** tear down
+    std-streams (since it will clobber any ongoing pdb REPL
+    session). This is our "soft" (and thus itself cancellable)
+    join/reap on an actor-runtime-in-process.
+
+    '''
    uid = portal.channel.uid
    try:
        log.cancel(f'Soft waiting on actor:\n{uid}')
@ -275,7 +303,13 @@ async def soft_wait(
                await wait_func(proc)
                n.cancel_scope.cancel()

+            # start a task to wait on the termination of the
+            # process by itself waiting on a (caller provided) wait
+            # function which should unblock when the target process
+            # has terminated.
            n.start_soon(cancel_on_proc_deth)
+
+            # send the actor-runtime a cancel request.
            await portal.cancel_actor()

            if proc.poll() is None:  # type: ignore
@ -295,7 +329,7 @@ async def new_proc(
    errors: dict[tuple[str, str], Exception],

    # passed through to actor main
-    bind_addr: tuple[str, int],
+    bind_addrs: list[tuple[str, int]],
    parent_addr: tuple[str, int],
    _runtime_vars: dict[str, Any],  # serialized and sent to _child

@ -317,7 +351,7 @@ async def new_proc(
        actor_nursery,
        subactor,
        errors,
-        bind_addr,
+        bind_addrs,
        parent_addr,
        _runtime_vars,  # run time vars
        infect_asyncio=infect_asyncio,
@ -332,7 +366,7 @@ async def trio_proc(
    errors: dict[tuple[str, str], Exception],

    # passed through to actor main
-    bind_addr: tuple[str, int],
+    bind_addrs: list[tuple[str, int]],
    parent_addr: tuple[str, int],
    _runtime_vars: dict[str, Any],  # serialized and sent to _child
    *,
@ -418,12 +452,11 @@ async def trio_proc(

        # send additional init params
        await chan.send({
-            "_parent_main_data": subactor._parent_main_data,
-            "enable_modules": subactor.enable_modules,
-            "_arb_addr": subactor._arb_addr,
-            "bind_host": bind_addr[0],
-            "bind_port": bind_addr[1],
-            "_runtime_vars": _runtime_vars,
+            '_parent_main_data': subactor._parent_main_data,
+            'enable_modules': subactor.enable_modules,
+            'reg_addrs': subactor.reg_addrs,
+            'bind_addrs': bind_addrs,
+            '_runtime_vars': _runtime_vars,
        })

        # track subactor in current nursery
@ -457,7 +490,7 @@ async def trio_proc(

            # cancel result waiter that may have been spawned in
            # tandem if not done already
-            log.warning(
+            log.cancel(
                "Cancelling existing result waiter task for "
                f"{subactor.uid}")
            nursery.cancel_scope.cancel()
@ -510,7 +543,7 @@ async def mp_proc(
    subactor: Actor,
    errors: dict[tuple[str, str], Exception],
    # passed through to actor main
-    bind_addr: tuple[str, int],
+    bind_addrs: list[tuple[str, int]],
    parent_addr: tuple[str, int],
    _runtime_vars: dict[str, Any],  # serialized and sent to _child
    *,
@ -568,7 +601,7 @@ async def mp_proc(
        target=_mp_main,
        args=(
            subactor,
-            bind_addr,
+            bind_addrs,
            fs_info,
            _spawn_method,
            parent_addr,
--- a/tractor/_state.py
+++ b/tractor/_state.py
@ -23,11 +23,6 @@ from typing import (
    Any,
 )

-import trio
-
-from ._exceptions import NoRuntime
-
-
 _current_actor: Optional['Actor'] = None  # type: ignore # noqa
 _runtime_vars: dict[str, Any] = {
    '_debug_mode': False,
@ -37,8 +32,11 @@ _runtime_vars: dict[str, Any] = {


 def current_actor(err_on_no_runtime: bool = True) -> 'Actor':  # type: ignore # noqa
-    """Get the process-local actor instance.
-    """
+    '''
+    Get the process-local actor instance.
+
+    '''
+    from ._exceptions import NoRuntime
    if _current_actor is None and err_on_no_runtime:
        raise NoRuntime("No local actor has been initialized yet")

@ -46,16 +44,20 @@ def current_actor(err_on_no_runtime: bool = True) -> 'Actor':  # type: ignore #


 def is_main_process() -> bool:
-    """Bool determining if this actor is running in the top-most process.
-    """
+    '''
+    Bool determining if this actor is running in the top-most process.
+
+    '''
    import multiprocessing as mp
    return mp.current_process().name == 'MainProcess'


 def debug_mode() -> bool:
-    """Bool determining if "debug mode" is on which enables
+    '''
+    Bool determining if "debug mode" is on which enables
    remote subactor pdb entry on crashes.
-    """
+
+    '''
    return bool(_runtime_vars['_debug_mode'])


--- a/tractor/_streaming.py
+++ b/tractor/_streaming.py
@ -14,31 +14,36 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.

-"""
+'''
 Message stream types and APIs.

-"""
+The machinery and types behind ``Context.open_stream()``
+
+'''
 from __future__ import annotations
 import inspect
-from contextlib import asynccontextmanager
-from dataclasses import dataclass
+from contextlib import asynccontextmanager as acm
 from typing import (
    Any,
-    Optional,
    Callable,
-    AsyncGenerator,
-    AsyncIterator
+    AsyncIterator,
+    TYPE_CHECKING,
 )
-
 import warnings

 import trio

-from ._ipc import Channel
-from ._exceptions import unpack_error, ContextCancelled
-from ._state import current_actor
+from ._exceptions import (
+    _raise_from_no_key_in_msg,
+)
 from .log import get_logger
-from .trionics import broadcast_receiver, BroadcastReceiver
+from .trionics import (
+    broadcast_receiver,
+    BroadcastReceiver,
+)
+
+if TYPE_CHECKING:
+    from ._context import Context


 log = get_logger(__name__)
@ -49,7 +54,6 @@ log = get_logger(__name__)
 #   messages? class ReceiveChannel(AsyncResource, Generic[ReceiveType]):
 # - use __slots__ on ``Context``?

-
 class MsgStream(trio.abc.Channel):
    '''
    A bidirectional message stream for receiving logically sequenced
@ -70,9 +74,9 @@ class MsgStream(trio.abc.Channel):
    '''
    def __init__(
        self,
-        ctx: 'Context',  # typing: ignore # noqa
+        ctx: Context,  # typing: ignore # noqa
        rx_chan: trio.MemoryReceiveChannel,
-        _broadcaster: Optional[BroadcastReceiver] = None,
+        _broadcaster: BroadcastReceiver | None = None,

    ) -> None:
        self._ctx = ctx
@ -86,13 +90,35 @@ class MsgStream(trio.abc.Channel):
    # delegate directly to underlying mem channel
    def receive_nowait(self):
        msg = self._rx_chan.receive_nowait()
-        return msg['yield']
+        try:
+            return msg['yield']
+        except KeyError as kerr:
+            _raise_from_no_key_in_msg(
+                ctx=self._ctx,
+                msg=msg,
+                src_err=kerr,
+                log=log,
+                expect_key='yield',
+                stream=self,
+            )

    async def receive(self):
-        '''Async receive a single msg from the IPC transport, the next
-        in sequence for this stream.
+        '''
+        Receive a single msg from the IPC transport, the next in
+        sequence sent by the far end task (possibly in order as
+        determined by the underlying protocol).

        '''
+        # NOTE: `trio.ReceiveChannel` implements
+        # EOC handling as follows (aka uses it
+        # to gracefully exit async for loops):
+        #
+        # async def __anext__(self) -> ReceiveType:
+        #     try:
+        #         return await self.receive()
+        #     except trio.EndOfChannel:
+        #         raise StopAsyncIteration
+
        # see ``.aclose()`` for notes on the old behaviour prior to
        # introducing this
        if self._eoc:
@ -105,43 +131,15 @@ class MsgStream(trio.abc.Channel):
            msg = await self._rx_chan.receive()
            return msg['yield']

-        except KeyError as err:
-            # internal error should never get here
-            assert msg.get('cid'), ("Received internal error at portal?")
-
-            # TODO: handle 2 cases with 3.10 match syntax
-            # - 'stop'
-            # - 'error'
-            # possibly just handle msg['stop'] here!
-
-            if self._closed:
-                raise trio.ClosedResourceError('This stream was closed')
-
-            if msg.get('stop') or self._eoc:
-                log.debug(f"{self} was stopped at remote end")
-
-                # XXX: important to set so that a new ``.receive()``
-                # call (likely by another task using a broadcast receiver)
-                # doesn't accidentally pull the ``return`` message
-                # value out of the underlying feed mem chan!
-                self._eoc = True
-
-                # # when the send is closed we assume the stream has
-                # # terminated and signal this local iterator to stop
-                # await self.aclose()
-
-                # XXX: this causes ``ReceiveChannel.__anext__()`` to
-                # raise a ``StopAsyncIteration`` **and** in our catch
-                # block below it will trigger ``.aclose()``.
-                raise trio.EndOfChannel from err
-
-            # TODO: test that shows stream raising an expected error!!!
-            elif msg.get('error'):
-                # raise the error message
-                raise unpack_error(msg, self._ctx.chan)
-
-            else:
-                raise
+        except KeyError as kerr:
+            _raise_from_no_key_in_msg(
+                ctx=self._ctx,
+                msg=msg,
+                src_err=kerr,
+                log=log,
+                expect_key='yield',
+                stream=self,
+            )

        except (
            trio.ClosedResourceError,  # by self._rx_chan
@ -275,7 +273,7 @@ class MsgStream(trio.abc.Channel):
        # still need to consume msgs that are "in transit" from the far
        # end (eg. for ``Context.result()``).

-    @asynccontextmanager
+    @acm
    async def subscribe(
        self,

@ -335,8 +333,8 @@ class MsgStream(trio.abc.Channel):
        Send a message over this stream to the far end.

        '''
-        if self._ctx._error:
-            raise self._ctx._error  # from None
+        if self._ctx._remote_error:
+            raise self._ctx._remote_error  # from None

        if self._closed:
            raise trio.ClosedResourceError('This stream was already closed')
@ -344,371 +342,11 @@ class MsgStream(trio.abc.Channel):
        await self._ctx.chan.send({'yield': data, 'cid': self._ctx.cid})


-@dataclass
-class Context:
-    '''
-    An inter-actor, ``trio`` task communication context.
-
-    NB: This class should never be instatiated directly, it is delivered
-    by either runtime machinery to a remotely started task or by entering
-    ``Portal.open_context()``.
-
-    Allows maintaining task or protocol specific state between
-    2 communicating actor tasks. A unique context is created on the
-    callee side/end for every request to a remote actor from a portal.
-
-    A context can be cancelled and (possibly eventually restarted) from
-    either side of the underlying IPC channel, open task oriented
-    message streams and acts as an IPC aware inter-actor-task cancel
-    scope.
-
-    '''
-    chan: Channel
-    cid: str
-
-    # these are the "feeder" channels for delivering
-    # message values to the local task from the runtime
-    # msg processing loop.
-    _recv_chan: trio.MemoryReceiveChannel
-    _send_chan: trio.MemorySendChannel
-
-    _remote_func_type: Optional[str] = None
-
-    # only set on the caller side
-    _portal: Optional['Portal'] = None    # type: ignore # noqa
-    _result: Optional[Any] = False
-    _error: Optional[BaseException] = None
-
-    # status flags
-    _cancel_called: bool = False
-    _cancel_msg: Optional[str] = None
-    _enter_debugger_on_cancel: bool = True
-    _started_called: bool = False
-    _started_received: bool = False
-    _stream_opened: bool = False
-
-    # only set on the callee side
-    _scope_nursery: Optional[trio.Nursery] = None
-
-    _backpressure: bool = False
-
-    async def send_yield(self, data: Any) -> None:
-
-        warnings.warn(
-            "`Context.send_yield()` is now deprecated. "
-            "Use ``MessageStream.send()``. ",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-        await self.chan.send({'yield': data, 'cid': self.cid})
-
-    async def send_stop(self) -> None:
-        await self.chan.send({'stop': True, 'cid': self.cid})
-
-    async def _maybe_raise_from_remote_msg(
-        self,
-        msg: dict[str, Any],
-
-    ) -> None:
-        '''
-        (Maybe) unpack and raise a msg error into the local scope
-        nursery for this context.
-
-        Acts as a form of "relay" for a remote error raised
-        in the corresponding remote callee task.
-
-        '''
-        error = msg.get('error')
-        if error:
-            # If this is an error message from a context opened by
-            # ``Portal.open_context()`` we want to interrupt any ongoing
-            # (child) tasks within that context to be notified of the remote
-            # error relayed here.
-            #
-            # The reason we may want to raise the remote error immediately
-            # is that there is no guarantee the associated local task(s)
-            # will attempt to read from any locally opened stream any time
-            # soon.
-            #
-            # NOTE: this only applies when
-            # ``Portal.open_context()`` has been called since it is assumed
-            # (currently) that other portal APIs (``Portal.run()``,
-            # ``.run_in_actor()``) do their own error checking at the point
-            # of the call and result processing.
-            log.error(
-                f'Remote context error for {self.chan.uid}:{self.cid}:\n'
-                f'{msg["error"]["tb_str"]}'
-            )
-            error = unpack_error(msg, self.chan)
-            if (
-                isinstance(error, ContextCancelled) and
-                self._cancel_called
-            ):
-                # this is an expected cancel request response message
-                # and we don't need to raise it in scope since it will
-                # potentially override a real error
-                return
-
-            self._error = error
-
-            # TODO: tempted to **not** do this by-reraising in a
-            # nursery and instead cancel a surrounding scope, detect
-            # the cancellation, then lookup the error that was set?
-            if self._scope_nursery:
-
-                async def raiser():
-                    raise self._error from None
-
-                # from trio.testing import wait_all_tasks_blocked
-                # await wait_all_tasks_blocked()
-                if not self._scope_nursery._closed:  # type: ignore
-                    self._scope_nursery.start_soon(raiser)
-
-    async def cancel(
-        self,
-        msg: Optional[str] = None,
-
-    ) -> None:
-        '''
-        Cancel this inter-actor-task context.
-
-        Request that the far side cancel it's current linked context,
-        Timeout quickly in an attempt to sidestep 2-generals...
-
-        '''
-        side = 'caller' if self._portal else 'callee'
-        if msg:
-            assert side == 'callee', 'Only callee side can provide cancel msg'
-
-        log.cancel(f'Cancelling {side} side of context to {self.chan.uid}')
-
-        self._cancel_called = True
-
-        if side == 'caller':
-            if not self._portal:
-                raise RuntimeError(
-                    "No portal found, this is likely a callee side context"
-                )
-
-            cid = self.cid
-            with trio.move_on_after(0.5) as cs:
-                cs.shield = True
-                log.cancel(
-                    f"Cancelling stream {cid} to "
-                    f"{self._portal.channel.uid}")
-
-                # NOTE: we're telling the far end actor to cancel a task
-                # corresponding to *this actor*. The far end local channel
-                # instance is passed to `Actor._cancel_task()` implicitly.
-                await self._portal.run_from_ns('self', '_cancel_task', cid=cid)
-
-            if cs.cancelled_caught:
-                # XXX: there's no way to know if the remote task was indeed
-                # cancelled in the case where the connection is broken or
-                # some other network error occurred.
-                # if not self._portal.channel.connected():
-                if not self.chan.connected():
-                    log.cancel(
-                        "May have failed to cancel remote task "
-                        f"{cid} for {self._portal.channel.uid}")
-                else:
-                    log.cancel(
-                        "Timed out on cancelling remote task "
-                        f"{cid} for {self._portal.channel.uid}")
-
-        # callee side remote task
-        else:
-            self._cancel_msg = msg
-
-            # TODO: should we have an explicit cancel message
-            # or is relaying the local `trio.Cancelled` as an
-            # {'error': trio.Cancelled, cid: "blah"} enough?
-            # This probably gets into the discussion in
-            # https://github.com/goodboy/tractor/issues/36
-            assert self._scope_nursery
-            self._scope_nursery.cancel_scope.cancel()
-
-        if self._recv_chan:
-            await self._recv_chan.aclose()
-
-    @asynccontextmanager
-    async def open_stream(
-
-        self,
-        backpressure: Optional[bool] = True,
-        msg_buffer_size: Optional[int] = None,
-
-    ) -> AsyncGenerator[MsgStream, None]:
-        '''
-        Open a ``MsgStream``, a bi-directional stream connected to the
-        cross-actor (far end) task for this ``Context``.
-
-        This context manager must be entered on both the caller and
-        callee for the stream to logically be considered "connected".
-
-        A ``MsgStream`` is currently "one-shot" use, meaning if you
-        close it you can not "re-open" it for streaming and instead you
-        must re-establish a new surrounding ``Context`` using
-        ``Portal.open_context()``.  In the future this may change but
-        currently there seems to be no obvious reason to support
-        "re-opening":
-            - pausing a stream can be done with a message.
-            - task errors will normally require a restart of the entire
-              scope of the inter-actor task context due to the nature of
-              ``trio``'s cancellation system.
-
-        '''
-        actor = current_actor()
-
-        # here we create a mem chan that corresponds to the
-        # far end caller / callee.
-
-        # Likewise if the surrounding context has been cancelled we error here
-        # since it likely means the surrounding block was exited or
-        # killed
-
-        if self._cancel_called:
-            task = trio.lowlevel.current_task().name
-            raise ContextCancelled(
-                f'Context around {actor.uid[0]}:{task} was already cancelled!'
-            )
-
-        if not self._portal and not self._started_called:
-            raise RuntimeError(
-                'Context.started()` must be called before opening a stream'
-            )
-
-        # NOTE: in one way streaming this only happens on the
-        # caller side inside `Actor.start_remote_task()` so if you try
-        # to send a stop from the caller to the callee in the
-        # single-direction-stream case you'll get a lookup error
-        # currently.
-        ctx = actor.get_context(
-            self.chan,
-            self.cid,
-            msg_buffer_size=msg_buffer_size,
-        )
-        ctx._backpressure = backpressure
-        assert ctx is self
-
-        # XXX: If the underlying channel feeder receive mem chan has
-        # been closed then likely client code has already exited
-        # a ``.open_stream()`` block prior or there was some other
-        # unanticipated error or cancellation from ``trio``.
-
-        if ctx._recv_chan._closed:
-            raise trio.ClosedResourceError(
-                'The underlying channel for this stream was already closed!?')
-
-        async with MsgStream(
-            ctx=self,
-            rx_chan=ctx._recv_chan,
-        ) as stream:
-
-            if self._portal:
-                self._portal._streams.add(stream)
-
-            try:
-                self._stream_opened = True
-
-                # XXX: do we need this?
-                # ensure we aren't cancelled before yielding the stream
-                # await trio.lowlevel.checkpoint()
-                yield stream
-
-                # NOTE: Make the stream "one-shot use".  On exit, signal
-                # ``trio.EndOfChannel``/``StopAsyncIteration`` to the
-                # far end.
-                await stream.aclose()
-
-            finally:
-                if self._portal:
-                    try:
-                        self._portal._streams.remove(stream)
-                    except KeyError:
-                        log.warning(
-                            f'Stream was already destroyed?\n'
-                            f'actor: {self.chan.uid}\n'
-                            f'ctx id: {self.cid}'
-                        )
-
-    async def result(self) -> Any:
-        '''
-        From a caller side, wait for and return the final result from
-        the callee side task.
-
-        '''
-        assert self._portal, "Context.result() can not be called from callee!"
-        assert self._recv_chan
-
-        if self._result is False:
-
-            if not self._recv_chan._closed:  # type: ignore
-
-                # wait for a final context result consuming
-                # and discarding any bi dir stream msgs still
-                # in transit from the far end.
-                while True:
-
-                    msg = await self._recv_chan.receive()
-                    try:
-                        self._result = msg['return']
-                        break
-                    except KeyError as msgerr:
-
-                        if 'yield' in msg:
-                            # far end task is still streaming to us so discard
-                            log.warning(f'Discarding stream delivered {msg}')
-                            continue
-
-                        elif 'stop' in msg:
-                            log.debug('Remote stream terminated')
-                            continue
-
-                        # internal error should never get here
-                        assert msg.get('cid'), (
-                            "Received internal error at portal?")
-
-                        raise unpack_error(
-                            msg, self._portal.channel
-                        ) from msgerr
-
-        return self._result
-
-    async def started(
-        self,
-        value: Optional[Any] = None
-
-    ) -> None:
-        '''
-        Indicate to calling actor's task that this linked context
-        has started and send ``value`` to the other side.
-
-        On the calling side ``value`` is the second item delivered
-        in the tuple returned by ``Portal.open_context()``.
-
-        '''
-        if self._portal:
-            raise RuntimeError(
-                f"Caller side context {self} can not call started!")
-
-        elif self._started_called:
-            raise RuntimeError(
-                f"called 'started' twice on context with {self.chan.uid}")
-
-        await self.chan.send({'started': value, 'cid': self.cid})
-        self._started_called = True
-
-    # TODO: do we need a restart api?
-    # async def restart(self) -> None:
-    #     pass
-
-
 def stream(func: Callable) -> Callable:
-    """Mark an async function as a streaming routine with ``@stream``.
+    '''
+    Mark an async function as a streaming routine with ``@stream``.

-    """
-    # annotate
+    '''
    # TODO: apply whatever solution ``mypy`` ends up picking for this:
    # https://github.com/python/mypy/issues/2087#issuecomment-769266912
    func._tractor_stream_function = True  # type: ignore
@ -734,22 +372,3 @@ def stream(func: Callable) -> Callable:
            "(Or ``to_trio`` if using ``asyncio`` in guest mode)."
        )
    return func
-
-
-def context(func: Callable) -> Callable:
-    """Mark an async function as a streaming routine with ``@context``.
-
-    """
-    # annotate
-    # TODO: apply whatever solution ``mypy`` ends up picking for this:
-    # https://github.com/python/mypy/issues/2087#issuecomment-769266912
-    func._tractor_context_function = True  # type: ignore
-
-    sig = inspect.signature(func)
-    params = sig.parameters
-    if 'ctx' not in params:
-        raise TypeError(
-            "The first argument to the context function "
-            f"{func.__name__} must be `ctx: tractor.Context`"
-        )
-    return func
--- a/tractor/_supervise.py
+++ b/tractor/_supervise.py
@ -21,17 +21,14 @@
 from contextlib import asynccontextmanager as acm
 from functools import partial
 import inspect
-from typing import (
-    Optional,
-    TYPE_CHECKING,
-)
+from typing import TYPE_CHECKING
 import typing
 import warnings

 from exceptiongroup import BaseExceptionGroup
 import trio

-from ._debug import maybe_wait_for_debugger
+from .devx._debug import maybe_wait_for_debugger
 from ._state import current_actor, is_main_process
 from .log import get_logger, get_loglevel
 from ._runtime import Actor
@ -94,7 +91,7 @@ class ActorNursery:
            tuple[
                Actor,
                trio.Process | mp.Process,
-                Optional[Portal],
+                Portal | None,
            ]
        ] = {}
        # portals spawned with ``run_in_actor()`` are
@ -110,12 +107,12 @@ class ActorNursery:
        self,
        name: str,
        *,
-        bind_addr: tuple[str, int] = _default_bind_addr,
+        bind_addrs: list[tuple[str, int]] = [_default_bind_addr],
        rpc_module_paths: list[str] | None = None,
        enable_modules: list[str] | None = None,
        loglevel: str | None = None,  # set log level per subactor
        nursery: trio.Nursery | None = None,
-        debug_mode: Optional[bool] | None = None,
+        debug_mode: bool | None = None,
        infect_asyncio: bool = False,
    ) -> Portal:
        '''
@ -150,7 +147,9 @@ class ActorNursery:
            # modules allowed to invoked funcs from
            enable_modules=enable_modules,
            loglevel=loglevel,
-            arbiter_addr=current_actor()._arb_addr,
+
+            # verbatim relay this actor's registrar addresses
+            registry_addrs=current_actor().reg_addrs,
        )
        parent_addr = self._actor.accept_addr
        assert parent_addr
@ -167,7 +166,7 @@ class ActorNursery:
                self,
                subactor,
                self.errors,
-                bind_addr,
+                bind_addrs,
                parent_addr,
                _rtv,  # run time vars
                infect_asyncio=infect_asyncio,
@ -180,8 +179,8 @@ class ActorNursery:
        fn: typing.Callable,
        *,

-        name: Optional[str] = None,
-        bind_addr: tuple[str, int] = _default_bind_addr,
+        name: str | None = None,
+        bind_addrs: tuple[str, int] = [_default_bind_addr],
        rpc_module_paths: list[str] | None = None,
        enable_modules: list[str] | None = None,
        loglevel: str | None = None,  # set log level per subactor
@ -208,7 +207,7 @@ class ActorNursery:
            enable_modules=[mod_path] + (
                enable_modules or rpc_module_paths or []
            ),
-            bind_addr=bind_addr,
+            bind_addrs=bind_addrs,
            loglevel=loglevel,
            # use the run_in_actor nursery
            nursery=self._ria_nursery,
--- a/tractor/devx/init.py
+++ b/tractor/devx/init.py
@ -0,0 +1,47 @@
+# tractor: structured concurrent "actors".
+# Copyright 2018-eternity Tyler Goodlet.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+"""
+Runtime "developer experience" utils and addons to aid our
+(advanced) users and core devs in building distributed applications
+and working with/on the actor runtime.
+
+"""
+from ._debug import (
+    maybe_wait_for_debugger,
+    acquire_debug_lock,
+    breakpoint,
+    pause,
+    pause_from_sync,
+    shield_sigint_handler,
+    MultiActorPdb,
+    open_crash_handler,
+    maybe_open_crash_handler,
+    post_mortem,
+)
+
+__all__ = [
+    'maybe_wait_for_debugger',
+    'acquire_debug_lock',
+    'breakpoint',
+    'pause',
+    'pause_from_sync',
+    'shield_sigint_handler',
+    'MultiActorPdb',
+    'open_crash_handler',
+    'maybe_open_crash_handler',
+    'post_mortem',
+]
--- a/tractor/devx/_debug.py
+++ b/tractor/devx/_debug.py
@ -27,10 +27,13 @@ from functools import (
    partial,
    cached_property,
 )
-from contextlib import asynccontextmanager as acm
+from contextlib import (
+    asynccontextmanager as acm,
+    contextmanager as cm,
+    nullcontext,
+)
 from typing import (
    Any,
-    Optional,
    Callable,
    AsyncIterator,
    AsyncGenerator,
@ -40,24 +43,29 @@ from types import FrameType
 import pdbp
 import tractor
 import trio
-from trio_typing import TaskStatus
+from trio_typing import (
+    TaskStatus,
+    # Task,
+)

-from .log import get_logger
-from ._discovery import get_root
-from ._state import (
+from ..log import get_logger
+from .._state import (
    is_root_process,
    debug_mode,
 )
-from ._exceptions import (
+from .._exceptions import (
    is_multi_cancelled,
    ContextCancelled,
 )
-from ._ipc import Channel
+from .._ipc import Channel

 log = get_logger(__name__)


-__all__ = ['breakpoint', 'post_mortem']
+__all__ = [
+    'breakpoint',
+    'post_mortem',
+]


 class Lock:
@ -69,10 +77,10 @@ class Lock:
    '''
    repl: MultiActorPdb | None = None
    # placeholder for function to set a ``trio.Event`` on debugger exit
-    # pdb_release_hook: Optional[Callable] = None
+    # pdb_release_hook: Callable | None = None

    _trio_handler: Callable[
-        [int, Optional[FrameType]], Any
+        [int, FrameType | None], Any
    ] | int | None = None

    # actor-wide variable pointing to current task name using debugger
@ -83,23 +91,23 @@ class Lock:
    # and must be cancelled if this actor is cancelled via IPC
    # request-message otherwise deadlocks with the parent actor may
    # ensure
-    _debugger_request_cs: Optional[trio.CancelScope] = None
+    _debugger_request_cs: trio.CancelScope | None = None

    # NOTE: set only in the root actor for the **local** root spawned task
    # which has acquired the lock (i.e. this is on the callee side of
    # the `lock_tty_for_child()` context entry).
-    _root_local_task_cs_in_debug: Optional[trio.CancelScope] = None
+    _root_local_task_cs_in_debug: trio.CancelScope | None = None

    # actor tree-wide actor uid that supposedly has the tty lock
-    global_actor_in_debug: Optional[tuple[str, str]] = None
+    global_actor_in_debug: tuple[str, str] = None

-    local_pdb_complete: Optional[trio.Event] = None
-    no_remote_has_tty: Optional[trio.Event] = None
+    local_pdb_complete: trio.Event | None = None
+    no_remote_has_tty: trio.Event | None = None

    # lock in root actor preventing multi-access to local tty
    _debug_lock: trio.StrictFIFOLock = trio.StrictFIFOLock()

-    _orig_sigint_handler: Optional[Callable] = None
+    _orig_sigint_handler: Callable | None = None
    _blocked: set[tuple[str, str]] = set()

    @classmethod
@ -110,6 +118,7 @@ class Lock:
        )

    @classmethod
+    @pdbp.hideframe  # XXX NOTE XXX see below in `.pause_from_sync()`
    def unshield_sigint(cls):
        # always restore ``trio``'s sigint handler. see notes below in
        # the pdb factory about the nightmare that is that code swapping
@ -129,10 +138,6 @@ class Lock:
            if owner:
                raise

-        # actor-local state, irrelevant for non-root.
-        cls.global_actor_in_debug = None
-        cls.local_task_in_debug = None
-
        try:
            # sometimes the ``trio`` might already be terminated in
            # which case this call will raise.
@ -143,6 +148,11 @@ class Lock:
            cls.unshield_sigint()
            cls.repl = None

+            # actor-local state, irrelevant for non-root.
+            cls.global_actor_in_debug = None
+            cls.local_task_in_debug = None
+
+

 class TractorConfig(pdbp.DefaultConfig):
    '''
@ -151,7 +161,7 @@ class TractorConfig(pdbp.DefaultConfig):
    '''
    use_pygments: bool = True
    sticky_by_default: bool = False
-    enable_hidden_frames: bool = False
+    enable_hidden_frames: bool = True

    # much thanks @mdmintz for the hot tip!
    # fixes line spacing issue when resizing terminal B)
@ -228,26 +238,23 @@ async def _acquire_debug_lock_from_root_task(
    to the ``pdb`` repl.

    '''
-    task_name = trio.lowlevel.current_task().name
+    task_name: str = trio.lowlevel.current_task().name
+    we_acquired: bool = False

    log.runtime(
        f"Attempting to acquire TTY lock, remote task: {task_name}:{uid}"
    )
-
-    we_acquired = False
-
    try:
        log.runtime(
            f"entering lock checkpoint, remote task: {task_name}:{uid}"
        )
-        we_acquired = True
-
        # NOTE: if the surrounding cancel scope from the
        # `lock_tty_for_child()` caller is cancelled, this line should
        # unblock and NOT leave us in some kind of
        # a "child-locked-TTY-but-child-is-uncontactable-over-IPC"
        # condition.
        await Lock._debug_lock.acquire()
+        we_acquired = True

        if Lock.no_remote_has_tty is None:
            # mark the tty lock as being in use so that the runtime
@ -323,7 +330,7 @@ async def lock_tty_for_child(
            f'Actor {subactor_uid} is blocked from acquiring debug lock\n'
            f"remote task: {task_name}:{subactor_uid}"
        )
-        ctx._enter_debugger_on_cancel = False
+        ctx._enter_debugger_on_cancel: bool = False
        await ctx.cancel(f'Debug lock blocked for {subactor_uid}')
        return 'pdb_lock_blocked'

@ -374,12 +381,14 @@ async def wait_for_parent_stdin_hijack(

    This function is used by any sub-actor to acquire mutex access to
    the ``pdb`` REPL and thus the root's TTY for interactive debugging
-    (see below inside ``_breakpoint()``). It can be used to ensure that
+    (see below inside ``pause()``). It can be used to ensure that
    an intermediate nursery-owning actor does not clobber its children
    if they are in debug (see below inside
    ``maybe_wait_for_debugger()``).

    '''
+    from .._discovery import get_root
+
    with trio.CancelScope(shield=True) as cs:
        Lock._debugger_request_cs = cs

@ -389,7 +398,7 @@ async def wait_for_parent_stdin_hijack(
                # this syncs to child's ``Context.started()`` call.
                async with portal.open_context(

-                    tractor._debug.lock_tty_for_child,
+                    lock_tty_for_child,
                    subactor_uid=actor_uid,

                ) as (ctx, val):
@ -440,38 +449,268 @@ def mk_mpdb() -> tuple[MultiActorPdb, Callable]:
    return pdb, Lock.unshield_sigint


-async def _breakpoint(
-
-    debug_func,
-
-    # TODO:
-    # shield: bool = False
+def shield_sigint_handler(
+    signum: int,
+    frame: 'frame',  # type: ignore # noqa
+    # pdb_obj: MultiActorPdb | None = None,
+    *args,

 ) -> None:
    '''
-    Breakpoint entry for engaging debugger instance sync-interaction,
-    from async code, executing in actor runtime (task).
+    Specialized, debugger-aware SIGINT handler.
+
+    In children we always ignore to avoid deadlocks since cancellation
+    should always be managed by the parent supervising actor. The root
+    is always cancelled on ctrl-c.

    '''
    __tracebackhide__ = True
+
+    uid_in_debug: tuple[str, str] | None = Lock.global_actor_in_debug
+
+    actor = tractor.current_actor()
+    # print(f'{actor.uid} in HANDLER with ')
+
+    def do_cancel():
+        # If we haven't tried to cancel the runtime then do that instead
+        # of raising a KBI (which may non-gracefully destroy
+        # a ``trio.run()``).
+        if not actor._cancel_called:
+            actor.cancel_soon()
+
+        # If the runtime is already cancelled it likely means the user
+        # hit ctrl-c again because teardown didn't fully take place in
+        # which case we do the "hard" raising of a local KBI.
+        else:
+            raise KeyboardInterrupt
+
+    any_connected: bool = False
+
+    if uid_in_debug is not None:
+        # try to see if the supposed (sub)actor in debug still
+        # has an active connection to *this* actor, and if not
+        # it's likely they aren't using the TTY lock / debugger
+        # and we should propagate SIGINT normally.
+        chans: list[tractor.Channel] = actor._peers.get(tuple(uid_in_debug))
+        if chans:
+            any_connected = any(chan.connected() for chan in chans)
+            if not any_connected:
+                log.warning(
+                    'A global actor reported to be in debug '
+                    'but no connection exists for this child:\n'
+                    f'{uid_in_debug}\n'
+                    'Allowing SIGINT propagation..'
+                )
+                return do_cancel()
+
+    # only set in the actor actually running the REPL
+    pdb_obj: MultiActorPdb | None = Lock.repl
+
+    # root actor branch that reports whether or not a child
+    # has locked debugger.
+    if (
+        is_root_process()
+        and uid_in_debug is not None
+
+        # XXX: only if there is an existing connection to the
+        # (sub-)actor in debug do we ignore SIGINT in this
+        # parent! Otherwise we may hang waiting for an actor
+        # which has already terminated to unlock.
+        and any_connected
+    ):
+        # we are root and some actor is in debug mode
+        # if uid_in_debug is not None:
+
+        if pdb_obj:
+            name = uid_in_debug[0]
+            if name != 'root':
+                log.pdb(
+                    f"Ignoring SIGINT, child in debug mode: `{uid_in_debug}`"
+                )
+
+            else:
+                log.pdb(
+                    "Ignoring SIGINT while in debug mode"
+                )
+    elif (
+        is_root_process()
+    ):
+        if pdb_obj:
+            log.pdb(
+                "Ignoring SIGINT since debug mode is enabled"
+            )
+
+        if (
+            Lock._root_local_task_cs_in_debug
+            and not Lock._root_local_task_cs_in_debug.cancel_called
+        ):
+            Lock._root_local_task_cs_in_debug.cancel()
+
+            # revert back to ``trio`` handler asap!
+            Lock.unshield_sigint()
+
+    # child actor that has locked the debugger
+    elif not is_root_process():
+
+        chan: Channel = actor._parent_chan
+        if not chan or not chan.connected():
+            log.warning(
+                'A global actor reported to be in debug '
+                'but no connection exists for its parent:\n'
+                f'{uid_in_debug}\n'
+                'Allowing SIGINT propagation..'
+            )
+            return do_cancel()
+
+        task: str | None = Lock.local_task_in_debug
+        if (
+            task
+            and pdb_obj
+        ):
+            log.pdb(
+                f"Ignoring SIGINT while task in debug mode: `{task}`"
+            )
+
+        # TODO: how to handle the case of an intermediary-child actor
+        # that **is not** marked in debug mode? See outstanding issue:
+        # https://github.com/goodboy/tractor/issues/320
+        # elif debug_mode():
+
+    else:  # XXX: shouldn't ever get here?
+        raise RuntimeError("WTFWTFWTF")
+        # raise KeyboardInterrupt("WTFWTFWTF")
+
+    # NOTE: currently (at least on ``fancycompleter`` 0.9.2)
+    # it looks to be that the last command that was run (eg. ll)
+    # will be repeated by default.
+
+    # maybe redraw/print last REPL output to console since
+    # we want to alert the user that more input is expected since
+    # nothing has been done due to ignoring sigint.
+    if (
+        pdb_obj  # only when this actor has a REPL engaged
+    ):
+        # XXX: yah, mega hack, but how else do we catch this madness XD
+        if pdb_obj.shname == 'xonsh':
+            pdb_obj.stdout.write(pdb_obj.prompt)
+
+        pdb_obj.stdout.flush()
+
+        # TODO: make this work like sticky mode where if there is output
+        # detected as written to the tty we redraw this part underneath
+        # and erase the past draw of this same bit above?
+        # pdb_obj.sticky = True
+        # pdb_obj._print_if_sticky()
+
+        # also see these links for an approach from ``ptk``:
+        # https://github.com/goodboy/tractor/issues/130#issuecomment-663752040
+        # https://github.com/prompt-toolkit/python-prompt-toolkit/blob/c2c6af8a0308f9e5d7c0e28cb8a02963fe0ce07a/prompt_toolkit/patch_stdout.py
+
+
+def _set_trace(
+    actor: tractor.Actor | None = None,
+    pdb: MultiActorPdb | None = None,
+    shield: bool = False,
+):
+    '''
+    Synchronously enter the debugger REPL via `pdb.set_trace()` at
+    a frame above this function.
+
+    - `actor`: the actor to report in the log msg; defaults to
+      `tractor.current_actor()` when not provided.
+    - `pdb`: the REPL instance to use; when the first branch below
+      is not taken a fresh one is created via `mk_mpdb()`.
+    - `shield`: forces the first ("attach") branch, see the review
+      note on the condition below.
+
+    '''
+    __tracebackhide__: bool = True
+    actor: tractor.Actor = actor or tractor.current_actor()
+
+    # start 2 levels up in user code
+    frame: FrameType | None = sys._getframe()
+    if frame:
+        # step from this function's own frame to its caller's.
+        frame: FrameType = frame.f_back  # type: ignore
+
+    # NOTE(review): `and` binds tighter than `or`, so this reads as
+    # `(frame and pdb and actor is not None) or shield`; with
+    # `shield=True` the branch is entered even when `frame` or `pdb`
+    # is `None` - confirm that is intended.
+    if (
+        frame
+        and (
+            pdb
+            and actor is not None
+        ) or shield
+    ):
+        # pdbp.set_trace()
+        log.pdb(f"\nAttaching pdb to actor: {actor.uid}\n")
+        # no f!#$&* idea, but when we're in async land
+        # we need 2x frames up?
+        frame = frame.f_back
+        # frame = frame.f_back
+
+        # if shield:
+        #     frame = frame.f_back
+
+    else:
+        # no usable frame/REPL was passed in: build a new REPL;
+        # the returned `undo_sigint` callable is not used here.
+        pdb, undo_sigint = mk_mpdb()
+
+        # we entered the global ``breakpoint()`` built-in from sync
+        # code?
+        Lock.local_task_in_debug = 'sync'
+
+    pdb.set_trace(frame=frame)
+    # undo_
+
+async def pause(
+
+    debug_func: Callable = _set_trace,
+    release_lock_signal: trio.Event | None = None,
+
+    # TODO: allow caller to pause despite task cancellation,
+    # exactly the same as wrapping with:
+    # with CancelScope(shield=True):
+    #     await pause()
+    # => the REMAINING ISSUE is that the scope's .__exit__() frame
+    # is always shown in the debugger on entry.. and there seems to
+    # be no way to override it?..
+    # shield: bool = False,
+
+    # TODO:
+    # shield: bool = False
+    task_status: TaskStatus[trio.Event] = trio.TASK_STATUS_IGNORED
+
+) -> None:
+    '''
+    A pause point (more commonly known as a "breakpoint") interrupt
+    instruction for engaging a blocking debugger instance to
+    conduct manual console-based-REPL-interaction from within
+    `tractor`'s async runtime, normally from some single-threaded
+    and currently executing actor-hosted-`trio`-task in some
+    (remote) process.
+
+    NOTE: we use the semantics "pause" since it better encompasses
+    the entirety of the necessary global-runtime-state-mutation any
+    actor-task must access and lock in order to get full isolated
+    control over the process tree's root TTY:
+    https://en.wikipedia.org/wiki/Breakpoint
+
+    '''
+    # __tracebackhide__ = True
    actor = tractor.current_actor()
    pdb, undo_sigint = mk_mpdb()
    task_name = trio.lowlevel.current_task().name

-    # TODO: is it possible to debug a trio.Cancelled except block?
-    # right now it seems like we can kinda do with by shielding
-    # around ``tractor.breakpoint()`` but not if we move the shielded
-    # scope here???
-    # with trio.CancelScope(shield=shield):
-    #     await trio.lowlevel.checkpoint()
-
    if (
        not Lock.local_pdb_complete
        or Lock.local_pdb_complete.is_set()
    ):
        Lock.local_pdb_complete = trio.Event()

-    # TODO: need a more robust check for the "root" actor
+    # if shield:
+    debug_func = partial(
+        debug_func,
+        # shield=shield,
+    )
+
+    # def _exit(self, *args, **kwargs):
+    #     __tracebackhide__: bool = True
+    #     super().__exit__(*args, **kwargs)
+
+    # trio.CancelScope.__exit__.__tracebackhide__ = True
+
+    # import types
+    # with trio.CancelScope(shield=shield) as cs:
+        # cs.__exit__ = types.MethodType(_exit, cs)
+        # cs.__exit__.__tracebackhide__ = True
+
+        # TODO: need a more robust check for the "root" actor
    if (
        not is_root_process()
        and actor._parent_chan  # a connected child
@ -559,10 +798,22 @@ async def _breakpoint(
        Lock.repl = pdb

    try:
-        # block here one (at the appropriate frame *up*) where
-        # ``breakpoint()`` was awaited and begin handling stdio.
-        log.debug("Entering the synchronous world of pdb")
-        debug_func(actor, pdb)
+        if debug_func is None:
+            # assert release_lock_signal, (
+            #     'Must pass `release_lock_signal: trio.Event` if no '
+            #     'trace func provided!'
+            # )
+            print(f"{actor.uid} ENTERING WAIT")
+            task_status.started()
+
+            # with trio.CancelScope(shield=True):
+            #     await release_lock_signal.wait()
+
+        else:
+            # block here one (at the appropriate frame *up*) where
+            # ``breakpoint()`` was awaited and begin handling stdio.
+            log.debug("Entering the synchronous world of pdb")
+            debug_func(actor, pdb)

    except bdb.BdbQuit:
        Lock.release()
@ -580,203 +831,80 @@ async def _breakpoint(
    #     # signal.signal = pdbp.hideframe(signal.signal)


-def shield_sigint_handler(
-    signum: int,
-    frame: 'frame',  # type: ignore # noqa
-    # pdb_obj: Optional[MultiActorPdb] = None,
-    *args,
+# TODO: allow pausing from sync code.
+# normally by remapping python's builtin breakpoint() hook to this
+# runtime aware version which takes care of all .
+def pause_from_sync() -> None:
+    print("ENTER SYNC PAUSE")
+    actor: tractor.Actor = tractor.current_actor(
+        err_on_no_runtime=False,
+    )
+    if actor:
+        try:
+            import greenback
+            # __tracebackhide__ = True

-) -> None:
-    '''
-    Specialized, debugger-aware SIGINT handler.

-    In childred we always ignore to avoid deadlocks since cancellation
-    should always be managed by the parent supervising actor. The root
-    is always cancelled on ctrl-c.
+            # task_can_release_tty_lock = trio.Event()

-    '''
-    __tracebackhide__ = True
-
-    uid_in_debug = Lock.global_actor_in_debug
-
-    actor = tractor.current_actor()
-    # print(f'{actor.uid} in HANDLER with ')
-
-    def do_cancel():
-        # If we haven't tried to cancel the runtime then do that instead
-        # of raising a KBI (which may non-gracefully destroy
-        # a ``trio.run()``).
-        if not actor._cancel_called:
-            actor.cancel_soon()
-
-        # If the runtime is already cancelled it likely means the user
-        # hit ctrl-c again because teardown didn't full take place in
-        # which case we do the "hard" raising of a local KBI.
-        else:
-            raise KeyboardInterrupt
-
-    any_connected = False
-
-    if uid_in_debug is not None:
-        # try to see if the supposed (sub)actor in debug still
-        # has an active connection to *this* actor, and if not
-        # it's likely they aren't using the TTY lock / debugger
-        # and we should propagate SIGINT normally.
-        chans = actor._peers.get(tuple(uid_in_debug))
-        if chans:
-            any_connected = any(chan.connected() for chan in chans)
-            if not any_connected:
-                log.warning(
-                    'A global actor reported to be in debug '
-                    'but no connection exists for this child:\n'
-                    f'{uid_in_debug}\n'
-                    'Allowing SIGINT propagation..'
-                )
-                return do_cancel()
-
-    # only set in the actor actually running the REPL
-    pdb_obj = Lock.repl
-
-    # root actor branch that reports whether or not a child
-    # has locked debugger.
-    if (
-        is_root_process()
-        and uid_in_debug is not None
-
-        # XXX: only if there is an existing connection to the
-        # (sub-)actor in debug do we ignore SIGINT in this
-        # parent! Otherwise we may hang waiting for an actor
-        # which has already terminated to unlock.
-        and any_connected
-    ):
-        # we are root and some actor is in debug mode
-        # if uid_in_debug is not None:
-
-        if pdb_obj:
-            name = uid_in_debug[0]
-            if name != 'root':
-                log.pdb(
-                    f"Ignoring SIGINT, child in debug mode: `{uid_in_debug}`"
-                )
-
-            else:
-                log.pdb(
-                    "Ignoring SIGINT while in debug mode"
-                )
-    elif (
-        is_root_process()
-    ):
-        if pdb_obj:
-            log.pdb(
-                "Ignoring SIGINT since debug mode is enabled"
+            # spawn bg task which will lock out the TTY, we poll
+            # just below until the release event is reporting that task as
+            # waiting.. not the most ideal but works for now ;)
+            greenback.await_(
+                actor._service_n.start(partial(
+                    pause,
+                    debug_func=None,
+                    # release_lock_signal=task_can_release_tty_lock,
+                ))
            )

-        if (
-            Lock._root_local_task_cs_in_debug
-            and not Lock._root_local_task_cs_in_debug.cancel_called
-        ):
-            Lock._root_local_task_cs_in_debug.cancel()
-
-            # revert back to ``trio`` handler asap!
-            Lock.unshield_sigint()
-
-    # child actor that has locked the debugger
-    elif not is_root_process():
-
-        chan: Channel = actor._parent_chan
-        if not chan or not chan.connected():
-            log.warning(
-                'A global actor reported to be in debug '
-                'but no connection exists for its parent:\n'
-                f'{uid_in_debug}\n'
-                'Allowing SIGINT propagation..'
-            )
-            return do_cancel()
-
-        task = Lock.local_task_in_debug
-        if (
-            task
-            and pdb_obj
-        ):
-            log.pdb(
-                f"Ignoring SIGINT while task in debug mode: `{task}`"
-            )
-
-        # TODO: how to handle the case of an intermediary-child actor
-        # that **is not** marked in debug mode? See oustanding issue:
-        # https://github.com/goodboy/tractor/issues/320
-        # elif debug_mode():
-
-    else:  # XXX: shouldn't ever get here?
-        print("WTFWTFWTF")
-        raise KeyboardInterrupt
-
-    # NOTE: currently (at least on ``fancycompleter`` 0.9.2)
-    # it looks to be that the last command that was run (eg. ll)
-    # will be repeated by default.
-
-    # maybe redraw/print last REPL output to console since
-    # we want to alert the user that more input is expect since
-    # nothing has been done dur to ignoring sigint.
-    if (
-        pdb_obj  # only when this actor has a REPL engaged
-    ):
-        # XXX: yah, mega hack, but how else do we catch this madness XD
-        if pdb_obj.shname == 'xonsh':
-            pdb_obj.stdout.write(pdb_obj.prompt)
-
-        pdb_obj.stdout.flush()
-
-        # TODO: make this work like sticky mode where if there is output
-        # detected as written to the tty we redraw this part underneath
-        # and erase the past draw of this same bit above?
-        # pdb_obj.sticky = True
-        # pdb_obj._print_if_sticky()
-
-        # also see these links for an approach from ``ptk``:
-        # https://github.com/goodboy/tractor/issues/130#issuecomment-663752040
-        # https://github.com/prompt-toolkit/python-prompt-toolkit/blob/c2c6af8a0308f9e5d7c0e28cb8a02963fe0ce07a/prompt_toolkit/patch_stdout.py
-
-        # XXX LEGACY: lol, see ``pdbpp`` issue:
-        # https://github.com/pdbpp/pdbpp/issues/496
-
-
-def _set_trace(
-    actor: tractor.Actor | None = None,
-    pdb: MultiActorPdb | None = None,
-):
-    __tracebackhide__ = True
-    actor = actor or tractor.current_actor()
-
-    # start 2 levels up in user code
-    frame: Optional[FrameType] = sys._getframe()
-    if frame:
-        frame = frame.f_back  # type: ignore
-
-    if (
-        frame
-        and pdb
-        and actor is not None
-    ):
-        log.pdb(f"\nAttaching pdb to actor: {actor.uid}\n")
-        # no f!#$&* idea, but when we're in async land
-        # we need 2x frames up?
-        frame = frame.f_back
-
+        except ModuleNotFoundError:
+            log.warning('NO GREENBACK FOUND')
    else:
-        pdb, undo_sigint = mk_mpdb()
+        log.warning('Not inside actor-runtime')

-        # we entered the global ``breakpoint()`` built-in from sync
-        # code?
-        Lock.local_task_in_debug = 'sync'
+    db, undo_sigint = mk_mpdb()
+    Lock.local_task_in_debug = 'sync'
+    # db.config.enable_hidden_frames = True

-    pdb.set_trace(frame=frame)
+    # we entered the global ``breakpoint()`` built-in from sync
+    # code?
+    frame: FrameType | None = sys._getframe()
+    # print(f'FRAME: {str(frame)}')
+    # assert not db._is_hidden(frame)
+
+    frame: FrameType = frame.f_back  # type: ignore
+    # print(f'FRAME: {str(frame)}')
+    # if not db._is_hidden(frame):
+    #     pdbp.set_trace()
+    # db._hidden_frames.append(
+    #     (frame, frame.f_lineno)
+    # )
+    db.set_trace(frame=frame)
+    # NOTE XXX: see the `@pdbp.hideframe` decoration
+    # on `Lock.unshield_sigint()`.. I have NO CLUE why
+    # the next instruction's def frame is being shown
+    # in the tb but it seems to be something wonky with
+    # the way `pdb` core works?
+    # undo_sigint()
+
+    # Lock.global_actor_in_debug = actor.uid
+    # Lock.release()
+    # task_can_release_tty_lock.set()


-breakpoint = partial(
-    _breakpoint,
-    _set_trace,
-)
+# using the "pause" semantics instead since
+# that better covers actually somewhat "pausing the runtime"
+# for this particular parallel task to do debugging B)
+# pp = pause  # short-hand for "pause point"
+
+
+async def breakpoint(**kwargs):
+    '''
+    Deprecated alias for `pause()`.
+
+    Logs a deprecation warning and then forwards all keyword
+    arguments, unchanged, to `pause()`.
+
+    '''
+    log.warning(
+        '`tractor.breakpoint()` is deprecated!\n'
+        'Please use `tractor.pause()` instead!\n'
+    )
+    await pause(**kwargs)


 def _post_mortem(
@ -801,7 +929,7 @@ def _post_mortem(


 post_mortem = partial(
-    _breakpoint,
+    pause,
    _post_mortem,
 )

@ -883,8 +1011,7 @@ async def maybe_wait_for_debugger(
        # will make the pdb repl unusable.
        # Instead try to wait for pdb to be released before
        # tearing down.
-
-        sub_in_debug = None
+        sub_in_debug: tuple[str, str] | None = None

        for _ in range(poll_steps):

@ -904,13 +1031,15 @@ async def maybe_wait_for_debugger(

                debug_complete = Lock.no_remote_has_tty
                if (
-                    (debug_complete and
-                     not debug_complete.is_set())
+                    debug_complete
+                    and sub_in_debug is not None
+                    and not debug_complete.is_set()
                ):
-                    log.debug(
+                    log.pdb(
                        'Root has errored but pdb is in use by '
                        f'child {sub_in_debug}\n'
-                        'Waiting on tty lock to release..')
+                        'Waiting on tty lock to release..'
+                    )

                    await debug_complete.wait()

@ -920,3 +1049,56 @@ async def maybe_wait_for_debugger(
            log.debug(
                    'Root acquired TTY LOCK'
            )
+
+
+# TODO: better naming and what additionals?
+# - [ ] optional runtime plugging?
+# - [ ] detection for sync vs. async code?
+# - [ ] specialized REPL entry when in distributed mode?
+# - [x] allow ignoring kbi Bo
+@cm
+def open_crash_handler(
+    catch: set[type[BaseException]] = {
+        Exception,
+        BaseException,
+    },
+    ignore: set[type[BaseException]] = {
+        KeyboardInterrupt,
+    },
+):
+    '''
+    Generic "post mortem" crash handler using `pdbp` REPL debugger.
+
+    We expose this as a CLI framework addon to both `click` and
+    `typer` users so they can quickly wrap cmd endpoints which get
+    automatically wrapped to use the runtime's `debug_mode: bool`
+    AND `pdbp.pm()` around any code that is PRE-runtime entry
+    - any sync code which runs BEFORE the main call to
+      `trio.run()`.
+
+    - `catch`: exception types which are intercepted by this
+      handler.
+    - `ignore`: exception types (subtypes included) for which the
+      REPL is NOT engaged.
+
+    The intercepted error is always re-raised to the caller,
+    whether or not the REPL was entered. The default sets are
+    never mutated in here, only copied into tuples.
+
+    '''
+    try:
+        yield
+    except tuple(catch) as err:
+        # use `isinstance()` so that subtypes of an ignored type
+        # (eg. a custom `KeyboardInterrupt` subclass) are skipped
+        # as well; an exact `type()` match would miss them.
+        if not isinstance(err, tuple(ignore)):
+            pdbp.xpm()
+
+        raise
+
+
+@cm
+def maybe_open_crash_handler(pdb: bool = False):
+    '''
+    Same as `open_crash_handler()` but with bool input flag
+    to allow conditional handling.
+
+    Normally this is used with CLI endpoints such that if the --pdb
+    flag is passed the pdb REPL is engaged on any crashes B)
+    '''
+    # default to a no-op context manager and only swap in the
+    # crash handler when `pdb` is set.
+    rtctx = nullcontext
+    if pdb:
+        rtctx = open_crash_handler
+
+    with rtctx():
+        yield
--- a/tractor/devx/cli.py
+++ b/tractor/devx/cli.py
@ -0,0 +1,136 @@
+# tractor: structured concurrent "actors".
+# Copyright 2018-eternity Tyler Goodlet.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+"""
+CLI framework extensions for hacking on the actor runtime.
+
+Currently popular frameworks supported are:
+
+  - `typer` via the `@callback` API
+
+"""
+from __future__ import annotations
+from contextlib import (
+    # asynccontextmanager as acm,
+    contextmanager as cm,
+)
+from typing import (
+    Any,
+    Callable,
+)
+from typing_extensions import Annotated
+
+import typer
+
+
+from ._debug import open_crash_handler
+
+
+_runtime_vars: dict[str, Any] = {}
+
+
+def load_runtime_vars(
+    ctx: typer.Context,
+    callback: Callable,
+    pdb: bool = False,  # --pdb
+    ll: Annotated[
+        str,
+        typer.Option(
+            '--loglevel',
+            '-l',
+            help='BigD logging level',
+        ),
+    ] = 'cancel',  # -l info
+):
+    '''
+    Maybe engage crash handling with `pdbp` when code inside
+    a `typer` CLI endpoint cmd raises.
+
+    Stores the `pdb` and `ll` (log level) inputs in the module
+    level `_runtime_vars` table, attaches that table to the CLI
+    context as `ctx.runtime_vars` and prints it. When no sub-cmd
+    was invoked, or the invoked sub-cmd's name equals
+    `callback.__name__`, `callback` is wrapped in a `TyperCommand`
+    and invoked with `ctx` as its only parameter.
+
+    To use this callback simply take your `app = typer.Typer()` instance
+    and decorate this function with it like so:
+
+    .. code:: python
+
+        from tractor.devx import cli
+
+        app = typer.Typer()
+
+        # manual decoration to hook into `click`'s context system!
+        cli.load_runtime_vars = app.callback(
+            invoke_without_command=True,
+        )
+
+    NOTE(review): as written the snippet above rebinds the name to
+    the value returned by `app.callback()` instead of applying it
+    to this function; presumably
+    `app.callback(...)(cli.load_runtime_vars)` was meant - confirm.
+
+    And then you can use the now augmented `click` CLI context as so,
+
+    .. code:: python
+
+        @app.command(
+            context_settings={
+                "allow_extra_args": True,
+                "ignore_unknown_options": True,
+            }
+        )
+        def my_cli_cmd(
+            ctx: typer.Context,
+        ):
+            rtvars: dict = ctx.runtime_vars
+            pdb: bool = rtvars['pdb']
+
+            with tractor.devx.cli.maybe_open_crash_handler(pdb=pdb):
+                trio.run(
+                    partial(
+                        my_tractor_main_task_func,
+                        debug_mode=pdb,
+                        loglevel=rtvars['ll'],
+                    )
+                )
+
+    NOTE(review): this module only imports `open_crash_handler`
+    from `._debug`, so `cli.maybe_open_crash_handler` may not
+    resolve as shown - verify the import path.
+
+    which will enable log level and debug mode globally for the entire
+    `tractor` + `trio` runtime thereafter!
+
+    Bo
+
+    '''
+    # update the shared module-level table in place so every reader
+    # of `_runtime_vars` sees the same settings.
+    global _runtime_vars
+    _runtime_vars |= {
+        'pdb': pdb,
+        'll': ll,
+    }
+
+    # expose the (same) table object on the CLI context.
+    ctx.runtime_vars: dict[str, Any] = _runtime_vars
+    print(
+        f'`typer` sub-cmd: {ctx.invoked_subcommand}\n'
+        f'`tractor` runtime vars: {_runtime_vars}'
+    )
+
+    # XXX NOTE XXX: hackzone.. if no sub-cmd is specified (the
+    # default if the user just invokes `bigd`) then we simply
+    # invoke the sole `_bigd()` cmd passing in the "parent"
+    # typer.Context directly to that call since we're treating it
+    # as a "non sub-command" or wtv..
+    # NOTE(review): `bigd` / `_bigd()` look like names from
+    # a downstream project - confirm they are still relevant here.
+    # TODO: ideally typer would have some kinda built-in way to get
+    # this behaviour without having to construct and manually
+    # invoke our own cmd..
+    if (
+        ctx.invoked_subcommand is None
+        or ctx.invoked_subcommand == callback.__name__
+    ):
+        cmd: typer.core.TyperCommand = typer.core.TyperCommand(
+            name='bigd',
+            callback=callback,
+        )
+        # the wrapped callback receives the parent context as its
+        # sole `ctx` argument.
+        ctx.params = {'ctx': ctx}
+        cmd.invoke(ctx)
--- a/tractor/experimental/_pubsub.py
+++ b/tractor/experimental/_pubsub.py
@ -37,7 +37,7 @@ import trio
 import wrapt

 from ..log import get_logger
-from .._streaming import Context
+from .._context import Context


 __all__ = ['pub']
@ -148,7 +148,8 @@ def pub(
    *,
    tasks: set[str] = set(),
 ):
-    """Publisher async generator decorator.
+    '''
+    Publisher async generator decorator.

    A publisher can be called multiple times from different actors but
    will only spawn a finite set of internal tasks to stream values to
@ -227,7 +228,8 @@ def pub(
    running in a single actor to stream data to an arbitrary number of
    subscribers. If you are ok to have a new task running for every call
    to ``pub_service()`` then probably don't need this.
-    """
+
+    '''
    global _pubtask2lock

    # handle the decorator not called with () case
--- a/tractor/log.py
+++ b/tractor/log.py
@ -48,12 +48,15 @@ LOG_FORMAT = (

 DATE_FORMAT = '%b %d %H:%M:%S'

-LEVELS = {
+LEVELS: dict[str, int] = {
    'TRANSPORT': 5,
    'RUNTIME': 15,
    'CANCEL': 16,
    'PDB': 500,
 }
+# _custom_levels: set[str] = {
+#     lvlname.lower for lvlname in LEVELS.keys()
+# }

 STD_PALETTE = {
    'CRITICAL': 'red',
@ -82,6 +85,10 @@ class StackLevelAdapter(logging.LoggerAdapter):
        msg: str,

    ) -> None:
+        '''
+        IPC level msg-ing.
+
+        '''
        return self.log(5, msg)

    def runtime(
@ -94,22 +101,57 @@ class StackLevelAdapter(logging.LoggerAdapter):
        self,
        msg: str,
    ) -> None:
-        return self.log(16, msg)
+        '''
+        Cancellation logging, mostly for runtime reporting.
+
+        '''
+        return self.log(
+            level=16,
+            msg=msg,
+            # stacklevel=4,
+        )

    def pdb(
        self,
        msg: str,
    ) -> None:
+        '''
+        Debugger logging.
+
+        '''
        return self.log(500, msg)

-    def log(self, level, msg, *args, **kwargs):
-        """
+    def log(
+        self,
+        level,
+        msg,
+        *args,
+        **kwargs,
+    ):
+        '''
        Delegate a log call to the underlying logger, after adding
        contextual information from this adapter instance.
-        """
+
+        '''
        if self.isEnabledFor(level):
+            stacklevel: int = 3
+            if (
+                level in LEVELS.values()
+                # or level in _custom_levels
+            ):
+                stacklevel: int = 4
+
            # msg, kwargs = self.process(msg, kwargs)
-            self._log(level, msg, args, **kwargs)
+            self._log(
+                level=level,
+                msg=msg,
+                args=args,
+                # NOTE: not sure how this worked before but, it
+                # seems with our custom level methods defined above
+                # we do indeed (now) require another stack level??
+                stacklevel=stacklevel,
+                **kwargs,
+            )

    # LOL, the stdlib doesn't allow passing through ``stacklevel``..
    def _log(
@ -122,12 +164,15 @@ class StackLevelAdapter(logging.LoggerAdapter):
        stack_info=False,

        # XXX: bit we added to show fileinfo from actual caller.
-        # this level then ``.log()`` then finally the caller's level..
-        stacklevel=3,
+        # - this level
+        # - then ``.log()``
+        # - then finally the caller's level..
+        stacklevel=4,
    ):
-        """
+        '''
        Low-level log implementation, proxied to allow nested logger adapters.
-        """
+
+        '''
        return self.logger._log(
            level,
            msg,
@ -181,15 +226,39 @@ def get_logger(
    '''
    log = rlog = logging.getLogger(_root_name)

-    if name and name != _proj_name:
+    if (
+        name
+        and name != _proj_name
+    ):

-        # handling for modules that use ``get_logger(__name__)`` to
-        # avoid duplicate project-package token in msg output
-        rname, _, tail = name.partition('.')
-        if rname == _root_name:
-            name = tail
+        # NOTE: for handling for modules that use ``get_logger(__name__)``
+        # we make the following stylistic choice:
+        # - always avoid duplicate project-package token
+        #   in msg output: i.e. tractor.tractor _ipc.py in header
+        #   looks ridiculous XD
+        # - never show the leaf module name in the {name} part
+        #   since in python the {filename} is always this same
+        #   module-file.
+
+        sub_name: None | str = None
+        rname, _, sub_name = name.partition('.')
+        pkgpath, _, modfilename = sub_name.rpartition('.')
+
+        # NOTE: for tractor itself never include the last level
+        # module key in the name such that something like: eg.
+        # 'tractor.trionics._broadcast` only includes the first
+        # 2 tokens in the (coloured) name part.
+        if rname == 'tractor':
+            sub_name = pkgpath
+
+        if _root_name in sub_name:
+            duplicate, _, sub_name = sub_name.partition('.')
+
+        if not sub_name:
+            log = rlog
+        else:
+            log = rlog.getChild(sub_name)

-        log = rlog.getChild(name)
        log.level = rlog.level

    # add our actor-task aware adapter which will dynamically look up
@ -242,3 +311,7 @@ def get_console_log(

 def get_loglevel() -> str:
    return _default_loglevel
+
+
+# global module logger for tractor itself
+log = get_logger('tractor')
--- a/tractor/msg/init.py
+++ b/tractor/msg/init.py
@ -0,0 +1,26 @@
+# tractor: structured concurrent "actors".
+# Copyright 2018-eternity Tyler Goodlet.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+'''
+Built-in messaging patterns, types, APIs and helpers.
+
+'''
+from .ptr import (
+    NamespacePath as NamespacePath,
+)
+from .types import (
+    Struct as Struct,
+)
--- a/tractor/msg/ptr.py
+++ b/tractor/msg/ptr.py
@ -15,7 +15,7 @@
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.

 '''
-Built-in messaging patterns, types, APIs and helpers.
+IPC-compat cross-mem-boundary object pointer.

 '''

@ -43,38 +43,62 @@ Built-in messaging patterns, types, APIs and helpers.
 # - https://github.com/msgpack/msgpack-python#packingunpacking-of-custom-data-type

 from __future__ import annotations
+from inspect import isfunction
 from pkgutil import resolve_name


 class NamespacePath(str):
    '''
-    A serializeable description of a (function) Python object location
-    described by the target's module path and namespace key meant as
-    a message-native "packet" to allows actors to point-and-load objects
-    by absolute reference.
+    A serializable description of a (function) Python object
+    location described by the target's module path and namespace
+    key meant as a message-native "packet" to allow actors to
+    point-and-load objects by an absolute ``str`` (and thus
+    serializable) reference.

    '''
-    _ref: object = None
+    # per-instance cache of the object resolved by `.load_ref()`
+    _ref: object | type | None = None

-    def load_ref(self) -> object:
+    def load_ref(self) -> object | type:
+        '''
+        Resolve this path via ``pkgutil.resolve_name()`` on first
+        use and return the (cached) target object.
+
+        '''
        if self._ref is None:
            self._ref = resolve_name(self)
        return self._ref

-    def to_tuple(
-        self,
+    @staticmethod
+    def _mk_fqnp(ref: type | object) -> tuple[str, str]:
+        '''
+        Generate a minimal ``str`` pair which describes a python
+        object's namespace path and object/type name.

-    ) -> tuple[str, str]:
-        ref = self.load_ref()
-        return ref.__module__, getattr(ref, '__name__', '')
+        In more precise terms something like:
+          - 'py.namespace.path:object_name',
+          - eg.'tractor.msg:NamespacePath' will be the ``str`` form
+            of THIS type XD
+
+        Functions and types are named by their own ``__name__``;
+        any other object is named after its type.
+
+        '''
+        if (
+            isinstance(ref, type)
+            or isfunction(ref)
+        ):
+            # NOTE: a plain `isinstance(ref, object)` check is always
+            # true and would name every class ref 'type'.
+            name: str = ref.__name__
+        else:
+            # some other instance: point at its type instead.
+            name: str = type(ref).__name__
+
+        # fully qualified namespace path, tuple.
+        fqnp: tuple[str, str] = (
+            ref.__module__,
+            name,
+        )
+        return fqnp

    @classmethod
    def from_ref(
        cls,
-        ref,
+        ref: type | object,

    ) -> NamespacePath:
-        return cls(':'.join(
-            (ref.__module__,
-             getattr(ref, '__name__', ''))
-        ))
+        '''
+        Build the '<module>:<name>' path ``str`` for ``ref``.
+
+        '''
+        fqnp: tuple[str, str] = cls._mk_fqnp(ref)
+        return cls(':'.join(fqnp))
+
+    def to_tuple(self) -> tuple[str, str]:
+        '''
+        Load the target object and return its ``(module, name)``
+        pair.
+
+        '''
+        return self._mk_fqnp(self.load_ref())
--- a/tractor/msg/types.py
+++ b/tractor/msg/types.py
@ -0,0 +1,251 @@
+# tractor: structured concurrent "actors".
+# Copyright 2018-eternity Tyler Goodlet.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+'''
+Extensions to built-in or (heavily used but 3rd party) friend-lib
+types.
+
+'''
+from __future__ import annotations
+from collections import UserList
+from pprint import (
+    saferepr,
+)
+from typing import (
+    Any,
+    Iterator,
+)
+
+from msgspec import (
+    msgpack,
+    Struct as _Struct,
+    structs,
+)
+
+
+class DiffDump(UserList):
+    '''
+    Thin list wrapper whose ``repr()`` lays out its (presumed)
+    `tuple[str, Any, Any]` entries over several lines so that the
+    two sides of each `Struct` field diff are easy to compare by
+    eye.
+
+    '''
+    def __repr__(self) -> str:
+        # nothing to lay out: defer to the stock list-style repr.
+        if len(self) == 0:
+            return super().__repr__()
+
+        # one multi-line chunk per entry: the key on the first line,
+        # then each side's ``repr()`` tab-indented on its own line.
+        chunks: list[str] = [
+            f'({key},\n'
+            f'\t{ours!r},\n'
+            f'\t{theirs!r},\n'
+            ')\n'
+            for key, ours, theirs in self
+        ]
+        return ''.join(['[\n', *chunks, ']\n'])
+
+
+class Struct(
+    _Struct,
+
+    # https://jcristharif.com/msgspec/structs.html#tagged-unions
+    # tag='pikerstruct',
+    # tag=True,
+):
+    '''
+    A "human friendlier" (aka repl buddy) struct subtype.
+
+    Adds pretty-printing (``.pformat()``, also used as ``repr()``),
+    dict conversion, validated copying, in-place type casting and
+    field-wise diffing (``a - b``) on top of ``msgspec.Struct``.
+
+    '''
+    def _sin_props(self) -> Iterator[
+        tuple[
+            structs.FieldInfo,
+            str,
+            Any,
+        ]
+    ]:
+        '''
+        Iterate over all non-@property fields of this struct.
+
+        Yields ``(field_info, field_name, current_value)`` triples
+        for every entry reported by ``msgspec.structs.fields()``.
+
+        '''
+        fi: structs.FieldInfo
+        for fi in structs.fields(self):
+            key: str = fi.name
+            val: Any = getattr(self, key)
+            yield fi, key, val
+
+    def to_dict(
+        self,
+        include_non_members: bool = True,
+
+    ) -> dict:
+        '''
+        Like it sounds.. direct delegation to:
+        https://jcristharif.com/msgspec/api.html#msgspec.structs.asdict
+
+        By default (``include_non_members=True``) the ``asdict()``
+        output is returned as is; pass ``include_non_members=False``
+        to get only the entries keyed by declared struct fields,
+        NOT anything added as type-defined `@property` methods.
+
+        '''
+        asdict: dict = structs.asdict(self)
+        if include_non_members:
+            return asdict
+
+        # only return a dict of the struct members
+        # which were provided as input.
+        return {
+            k: asdict[k]
+            for _, k, _ in self._sin_props()
+        }
+
+    def pformat(
+        self,
+        field_indent: int = 2,
+        indent: int = 0,
+
+    ) -> str:
+        '''
+        `pprint.pformat()` style formatting of a `msgspec.Struct`
+        for sane reading by a human using a REPL.
+
+        Each field is rendered on its own line as
+        `<name>: <type> = <value>,`; nested `Struct` values recurse
+        into their own `.pformat()` while everything else goes
+        through the recursion-safe `pprint.saferepr()`.
+
+        '''
+        # global whitespace indent
+        ws: str = ' '*indent
+
+        # field whitespace indent
+        field_ws: str = ' '*(field_indent + indent)
+
+        # NOTE: the type name is NOT prefixed with `ws` since for
+        # nested structs it is emitted right after the parent's
+        # `<name>: <type> = ` prefix.
+        qtn: str = self.__class__.__qualname__
+
+        field_lines: list[str] = []  # accumulator
+        fi: structs.FieldInfo
+        k: str
+        v: Any
+        for fi, k, v in self._sin_props():
+
+            # TODO: how can we prefer `Literal['option1',  'option2,
+            # ..]` over .__name__ == `Literal` but still get only the
+            # latter for simple types like `str | int | None` etc..?
+            ft: type = fi.type
+            typ_name: str = getattr(ft, '__name__', str(ft))
+
+            # recurse to get sub-struct's `.pformat()` output Bo
+            if isinstance(v, Struct):
+                val_str: str = v.pformat(
+                    indent=field_indent + indent,
+                    field_indent=indent + field_indent,
+                )
+
+            else:  # the `pprint` recursion-safe format:
+                # https://docs.python.org/3.11/library/pprint.html#pprint.saferepr
+                val_str: str = saferepr(v)
+
+            field_lines.append(
+                f'{field_ws}{k}: {typ_name} = {val_str},\n'
+            )
+
+        obj_str: str = ''.join(field_lines)
+        return (
+            f'{qtn}(\n'
+            f'{obj_str}'
+            f'{ws})'
+        )
+
+    # TODO: use a pprint.PrettyPrinter instance around ONLY rendering
+    # inside a known tty?
+    # def __repr__(self) -> str:
+    #     ...
+
+    # __str__ = __repr__ = pformat
+    __repr__ = pformat
+
+    def copy(
+        self,
+        update: dict | None = None,
+
+    ) -> Struct:
+        '''
+        Validate-typecast all self defined fields, return a copy of
+        us with all such fields.
+
+        Any ``update`` entries override the matching fields on the
+        returned copy only; ``self`` is never mutated.
+
+        NOTE: This is kinda like the default behaviour in
+        `pydantic.BaseModel` except a copy of the object is
+        returned making it compat with `frozen=True`.
+
+        '''
+        # apply overrides to a (shallow) clone instead of `self`:
+        # `setattr()` on `self` would both leak the changes into the
+        # original and raise on frozen structs.
+        src: Struct = (
+            structs.replace(self, **update)
+            if update
+            else self
+        )
+
+        # NOTE: roundtrip serialize to validate
+        # - encode to msgpack binary format,
+        # - decode that back to a struct.
+        return msgpack.Decoder(type=type(self)).decode(
+            msgpack.Encoder().encode(src)
+        )
+
+    def typecast(
+        self,
+
+        # TODO: allow only casting a named subset?
+        # fields: set[str] | None = None,
+
+    ) -> None:
+        '''
+        Cast all fields using their declared type annotations
+        (kinda like what `pydantic` does by default).
+
+        NOTE: this of course won't work on frozen types, use
+        ``.copy()`` above in such cases.
+
+        NOTE(review): each declared type is invoked as a callable
+        on the current value, so annotations which are not callable
+        (eg. `int | None`) will raise here.
+
+        '''
+        # https://jcristharif.com/msgspec/api.html#msgspec.structs.fields
+        fi: structs.FieldInfo
+        for fi in structs.fields(self):
+            setattr(
+                self,
+                fi.name,
+                fi.type(getattr(self, fi.name)),
+            )
+
+    def __sub__(
+        self,
+        other: Struct,
+
+    ) -> DiffDump[tuple[str, Any, Any]]:
+        '''
+        Compare fields/items key-wise and return a ``DiffDump``
+        for easy visual REPL comparison B)
+
+        Only OUR declared fields are compared; each differing one
+        is reported as a ``(name, ours, theirs)`` entry.
+
+        '''
+        diffs: DiffDump[tuple[str, Any, Any]] = DiffDump()
+        for fi in structs.fields(self):
+            attr_name: str = fi.name
+            ours: Any = getattr(self, attr_name)
+            theirs: Any = getattr(other, attr_name)
+            if ours != theirs:
+                diffs.append((
+                    attr_name,
+                    ours,
+                    theirs,
+                ))
+
+        return diffs
--- a/tractor/to_asyncio.py
+++ b/tractor/to_asyncio.py
@ -28,7 +28,6 @@ from typing import (
    Callable,
    AsyncIterator,
    Awaitable,
-    Optional,
 )

 import trio
@ -65,9 +64,9 @@ class LinkedTaskChannel(trio.abc.Channel):
    _trio_exited: bool = False

    # set after ``asyncio.create_task()``
-    _aio_task: Optional[asyncio.Task] = None
-    _aio_err: Optional[BaseException] = None
-    _broadcaster: Optional[BroadcastReceiver] = None
+    _aio_task: asyncio.Task | None = None
+    _aio_err: BaseException | None = None
+    _broadcaster: BroadcastReceiver | None = None

    async def aclose(self) -> None:
        await self._from_aio.aclose()
@ -188,7 +187,7 @@ def _run_asyncio_task(

    cancel_scope = trio.CancelScope()
    aio_task_complete = trio.Event()
-    aio_err: Optional[BaseException] = None
+    aio_err: BaseException | None = None

    chan = LinkedTaskChannel(
        aio_q,  # asyncio.Queue
@ -263,7 +262,7 @@ def _run_asyncio_task(
        '''
        nonlocal chan
        aio_err = chan._aio_err
-        task_err: Optional[BaseException] = None
+        task_err: BaseException | None = None

        # only to avoid ``asyncio`` complaining about uncaptured
        # task exceptions
@ -329,11 +328,11 @@ async def translate_aio_errors(
    '''
    trio_task = trio.lowlevel.current_task()

-    aio_err: Optional[BaseException] = None
+    aio_err: BaseException | None = None

    # TODO: make thisi a channel method?
    def maybe_raise_aio_err(
-        err: Optional[Exception] = None
+        err: Exception | None = None
    ) -> None:
        aio_err = chan._aio_err
        if (
--- a/tractor/trionics/init.py
+++ b/tractor/trionics/init.py
@ -19,22 +19,13 @@ Sugary patterns for trio + tractor designs.

 '''
 from ._mngrs import (
-    gather_contexts,
-    maybe_open_context,
-    maybe_open_nursery,
+    gather_contexts as gather_contexts,
+    maybe_open_context as maybe_open_context,
+    maybe_open_nursery as maybe_open_nursery,
 )
 from ._broadcast import (
-    broadcast_receiver,
-    BroadcastReceiver,
-    Lagged,
+    AsyncReceiver as AsyncReceiver,
+    broadcast_receiver as broadcast_receiver,
+    BroadcastReceiver as BroadcastReceiver,
+    Lagged as Lagged,
 )
-
-
-__all__ = [
-    'gather_contexts',
-    'broadcast_receiver',
-    'BroadcastReceiver',
-    'Lagged',
-    'maybe_open_context',
-    'maybe_open_nursery',
-]
--- a/tractor/trionics/_broadcast.py
+++ b/tractor/trionics/_broadcast.py
@ -25,8 +25,16 @@ from collections import deque
 from contextlib import asynccontextmanager
 from functools import partial
 from operator import ne
-from typing import Optional, Callable, Awaitable, Any, AsyncIterator, Protocol
-from typing import Generic, TypeVar
+from typing import (
+    Optional,
+    Callable,
+    Awaitable,
+    Any,
+    AsyncIterator,
+    Protocol,
+    Generic,
+    TypeVar,
+)

 import trio
 from trio._core._run import Task
--- a/tractor/trionics/_mngrs.py
+++ b/tractor/trionics/_mngrs.py
@ -70,6 +70,7 @@ async def _enter_and_wait(
    unwrapped: dict[int, T],
    all_entered: trio.Event,
    parent_exit: trio.Event,
+    seed: int,

 ) -> None:
    '''
@ -80,7 +81,10 @@ async def _enter_and_wait(
    async with mngr as value:
        unwrapped[id(mngr)] = value

-        if all(unwrapped.values()):
+        if all(
+            val != seed
+            for val in unwrapped.values()
+        ):
            all_entered.set()

        await parent_exit.wait()
@ -91,7 +95,13 @@ async def gather_contexts(

    mngrs: Sequence[AsyncContextManager[T]],

-) -> AsyncGenerator[tuple[Optional[T], ...], None]:
+) -> AsyncGenerator[
+    tuple[
+        T | None,
+        ...
+    ],
+    None,
+]:
    '''
    Concurrently enter a sequence of async context managers, each in
    a separate ``trio`` task and deliver the unwrapped values in the
@ -104,7 +114,11 @@ async def gather_contexts(
    entered and exited, and cancellation just works.

    '''
-    unwrapped: dict[int, Optional[T]] = {}.fromkeys(id(mngr) for mngr in mngrs)
+    seed: int = id(mngrs)
+    unwrapped: dict[int, T | None] = {}.fromkeys(
+        (id(mngr) for mngr in mngrs),
+        seed,
+    )

    all_entered = trio.Event()
    parent_exit = trio.Event()
@ -116,8 +130,9 @@ async def gather_contexts(

    if not mngrs:
        raise ValueError(
-            'input mngrs is empty?\n'
-            'Did try to use inline generator syntax?'
+            '`.trionics.gather_contexts()` input mngrs is empty?\n'
+            'Did try to use inline generator syntax?\n'
+            'Use a non-lazy iterator or sequence type intead!'
        )

    async with trio.open_nursery() as n:
@ -128,6 +143,7 @@ async def gather_contexts(
                unwrapped,
                all_entered,
                parent_exit,
+                seed,
            )

        # deliver control once all managers have started up
@ -209,6 +225,7 @@ async def maybe_open_context(

    # yielded output
    yielded: Any = None
+    lock_registered: bool = False

    # Lock resource acquisition around task racing  / ``trio``'s
    # scheduler protocol.
@ -216,6 +233,7 @@ async def maybe_open_context(
    # to allow re-entrant use cases where one `maybe_open_context()`
    # wrapped factor may want to call into another.
    lock = _Cache.locks.setdefault(fid, trio.Lock())
+    lock_registered: bool = True
    await lock.acquire()

    # XXX: one singleton nursery per actor and we want to
@ -237,7 +255,7 @@ async def maybe_open_context(
        yielded = _Cache.values[ctx_key]

    except KeyError:
-        log.info(f'Allocating new {acm_func} for {ctx_key}')
+        log.debug(f'Allocating new {acm_func} for {ctx_key}')
        mngr = acm_func(**kwargs)
        resources = _Cache.resources
        assert not resources.get(ctx_key), f'Resource exists? {ctx_key}'
@ -265,7 +283,7 @@ async def maybe_open_context(
        if yielded is not None:
            # if no more consumers, teardown the client
            if _Cache.users <= 0:
-                log.info(f'De-allocating resource for {ctx_key}')
+                log.debug(f'De-allocating resource for {ctx_key}')

                # XXX: if we're cancelled we the entry may have never
                # been entered since the nursery task was killed.
@ -275,4 +293,9 @@ async def maybe_open_context(
                    _, no_more_users = entry
                    no_more_users.set()

-                _Cache.locks.pop(fid)
+                if lock_registered:
+                    maybe_lock = _Cache.locks.pop(fid, None)
+                    if maybe_lock is None:
+                        log.error(
+                            f'Resource lock for {fid} ALREADY POPPED?'
+                        )