From cdcc1b42fca46d43886e3acb0e52ac40adfe7187 Mon Sep 17 00:00:00 2001 From: goodboy Date: Wed, 11 Feb 2026 17:43:05 -0500 Subject: [PATCH 1/4] Add test for non-registrar root sub-spawning Ensure non-registrar root actors can spawn children and that those children receive correct parent contact info. This test catches the bug reported in, https://github.com/goodboy/tractor/issues/410 Add new `test_non_registrar_spawns_child()` which spawns a sub-actor from a non-registrar root and verifies the child can manually connect back to its parent using `get_root()` API, auditing `._state._runtime_vars` addr propagation from rent to child. Also, - improve type hints throughout test suites (`subprocess.Popen`, `UnwrappedAddress`, `Aid` etc.) - rename `n` -> `an` for actor nursery vars - use multiline style for function signatures (this commit msg was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code --- tests/test_multi_program.py | 103 ++++++++++++++++++++++++++++++++---- 1 file changed, 92 insertions(+), 11 deletions(-) diff --git a/tests/test_multi_program.py b/tests/test_multi_program.py index b0b145ee..ac80928d 100644 --- a/tests/test_multi_program.py +++ b/tests/test_multi_program.py @@ -1,8 +1,13 @@ """ Multiple python programs invoking the runtime. """ +from __future__ import annotations import platform +import subprocess import time +from typing import ( + TYPE_CHECKING, +) import pytest import trio @@ -10,14 +15,27 @@ import tractor from tractor._testing import ( tractor_test, ) +from tractor import ( + Actor, + Context, + Portal, +) from .conftest import ( sig_prog, _INT_SIGNAL, _INT_RETURN_CODE, ) +if TYPE_CHECKING: + from tractor.msg import Aid + from tractor._addr import ( + UnwrappedAddress, + ) -def test_abort_on_sigint(daemon): + +def test_abort_on_sigint( + daemon: subprocess.Popen, +): assert daemon.returncode is None time.sleep(0.1) sig_prog(daemon, _INT_SIGNAL) @@ -30,7 +48,10 @@ def test_abort_on_sigint(daemon): @tractor_test -async def test_cancel_remote_arbiter(daemon, reg_addr): +async def test_cancel_remote_arbiter( + daemon: subprocess.Popen, + reg_addr: UnwrappedAddress, +): assert not tractor.current_actor().is_arbiter async with tractor.get_registry(reg_addr) as portal: await portal.cancel_actor() @@ -45,24 +66,84 @@ async def test_cancel_remote_arbiter(daemon, reg_addr): pass -def test_register_duplicate_name(daemon, reg_addr): - +def test_register_duplicate_name( + daemon: subprocess.Popen, + reg_addr: UnwrappedAddress, +): async def main(): - async with tractor.open_nursery( registry_addrs=[reg_addr], - ) as n: + ) as an: assert not tractor.current_actor().is_arbiter - p1 = await n.start_actor('doggy') - p2 = await n.start_actor('doggy') + p1 = await an.start_actor('doggy') + p2 = await an.start_actor('doggy') async with tractor.wait_for_actor('doggy') as portal: assert portal.channel.uid in (p2.channel.uid, p1.channel.uid) - await n.cancel() + await an.cancel() - # run it manually since we want to start **after** - # the other "daemon" program + # XXX, run manually since we want to start this root **after** + # the other "daemon" program with it's own root. + trio.run(main) + + +@tractor.context +async def get_root_portal( + ctx: Context, +): + ''' + Connect back to the root actor manually (using `._discovery` API) + and ensure it's contact info is the same as our immediate parent. + + ''' + await ctx.started() + from tractor._discovery import get_root + ptl: Portal + async with get_root() as ptl: + root_aid: Aid = ptl.chan.aid + parent_ptl: Portal = tractor.current_actor().get_parent() + assert ( + root_aid.name == 'root' + and + parent_ptl.chan.aid == root_aid + ) + + +def test_non_registrar_spawns_child( + daemon: subprocess.Popen, + reg_addr: UnwrappedAddress, +): + ''' + Ensure a non-regristar (serving) root actor can spawn a sub and + that sub can connect back (manually) to it's rent that is the + root without issue. + + More or less this audits the global contact info in + `._state._runtime_vars`. + + ''' + async def main(): + async with tractor.open_nursery( + registry_addrs=[reg_addr], + ) as an: + + actor: Actor = tractor.current_actor() + assert not actor.is_registrar + sub_ptl: Portal = await an.start_actor( + name='sub', + enable_modules=[__name__], + ) + + async with sub_ptl.open_context( + get_root_portal, + ) as (ctx, first): + pass + + await an.cancel() + + # XXX, run manually since we want to start this root **after** + # the other "daemon" program with it's own root. trio.run(main) From 8aee24e83fcf430ad087987179466b5f388cb0db Mon Sep 17 00:00:00 2001 From: goodboy Date: Tue, 10 Feb 2026 20:33:19 -0500 Subject: [PATCH 2/4] Fix when root-actor addrs is set as rtvs Move `_root_addrs` assignment to after `async_main()` unblocks (via `.started()`) which now delivers the bind addrs , ensuring correct `UnwrappedAddress` propagation into `._state._runtime_vars` for non-registar root actors.. Previously for non-registrar root actors the `._state._runtime_vars` entries were being set as `Address` values which ofc IPC serialize incorrectly rn vs. the unwrapped versions, (well until we add a msgspec for their structs anyway) and thus are passed in incorrect form to children/subactors during spawning.. This fixes the issue by waiting for the `.ipc.*` stack to bind-and-resolve any randomly allocated addrs (by the OS) until after the initial `Actor` startup is complete. Deats, - primarily, mv `_root_addrs` assignment from before `root_tn.start()` to after, using started(-ed) `accept_addrs` now delivered from `._runtime.async_main()`.. - update `task_status` type hints to match. - unpack and set the `(accept_addrs, reg_addrs)` tuple from `root_tn.start()` call into `._state._runtime_vars` entries. - improve and embolden comments distinguishing registrar vs non-registrar init paths, ensure typing reflects wrapped vs. unwrapped addrs. Also, - add a masked `mk_pdb().set_trace()` for debugging `raddrs` values being "off". - add TODO about using UDS on linux for root mailbox - rename `trans_bind_addrs` -> `tpt_bind_addrs` for clarity. - expand comment about random port allocation for non-registrar case (this commit msg was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code --- tractor/_root.py | 66 ++++++++++++++++++++++++++++++++------------- tractor/_runtime.py | 39 ++++++++++++++++++++++++--- 2 files changed, 82 insertions(+), 23 deletions(-) diff --git a/tractor/_root.py b/tractor/_root.py index 2fb7755d..2330449a 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -88,7 +88,8 @@ async def maybe_block_bp( bp_blocked: bool if ( debug_mode - and maybe_enable_greenback + and + maybe_enable_greenback and ( maybe_mod := await debug.maybe_init_greenback( raise_not_found=False, @@ -385,10 +386,13 @@ async def open_root_actor( addr, ) - trans_bind_addrs: list[UnwrappedAddress] = [] + tpt_bind_addrs: list[ + Address # `Address.get_random()` case + |UnwrappedAddress # registrar case `= uw_reg_addrs` + ] = [] - # Create a new local root-actor instance which IS NOT THE - # REGISTRAR + # ------ NON-REGISTRAR ------ + # create a new root-actor instance. if ponged_addrs: if ensure_registry: raise RuntimeError( @@ -415,12 +419,21 @@ async def open_root_actor( # XXX INSTEAD, bind random addrs using the same tpt # proto. for addr in ponged_addrs: - trans_bind_addrs.append( + tpt_bind_addrs.append( + # XXX, these are `Address` NOT `UnwrappedAddress`. + # + # NOTE, in the case of posix/berkley socket + # protos we allocate port=0 such that the system + # allocates a random value at bind time; this + # happens in the `.ipc.*` stack's backend. addr.get_random( bindspace=addr.bindspace, ) ) + # ------ REGISTRAR ------ + # create a new "registry providing" root-actor instance. + # # Start this local actor as the "registrar", aka a regular # actor who manages the local registry of "mailboxes" of # other process-tree-local sub-actors. @@ -429,7 +442,7 @@ async def open_root_actor( # following init steps are taken: # - the tranport layer server is bound to each addr # pair defined in provided registry_addrs, or the default. - trans_bind_addrs = uw_reg_addrs + tpt_bind_addrs = uw_reg_addrs # - it is normally desirable for any registrar to stay up # indefinitely until either all registered (child/sub) @@ -449,20 +462,10 @@ async def open_root_actor( enable_modules=enable_modules, ) # XXX, in case the root actor runtime was actually run from - # `tractor.to_asyncio.run_as_asyncio_guest()` and NOt + # `tractor.to_asyncio.run_as_asyncio_guest()` and NOT # `.trio.run()`. actor._infected_aio = _state._runtime_vars['_is_infected_aio'] - # NOTE, only set the loopback addr for the - # process-tree-global "root" mailbox since all sub-actors - # should be able to speak to their root actor over that - # channel. - raddrs: list[Address] = _state._runtime_vars['_root_addrs'] - raddrs.extend(trans_bind_addrs) - # TODO, remove once we have also removed all usage; - # eventually all (root-)registry apis should expect > 1 addr. - _state._runtime_vars['_root_mailbox'] = raddrs[0] - # Start up main task set via core actor-runtime nurseries. try: # assign process-local actor @@ -499,14 +502,39 @@ async def open_root_actor( # "actor runtime" primitives are SC-compat and thus all # transitively spawned actors/processes must be as # well. - await root_tn.start( + accept_addrs: list[UnwrappedAddress] + reg_addrs: list[UnwrappedAddress] + ( + accept_addrs, + reg_addrs, + ) = await root_tn.start( partial( _runtime.async_main, actor, - accept_addrs=trans_bind_addrs, + accept_addrs=tpt_bind_addrs, parent_addr=None ) ) + # NOTE, only set a local-host addr (i.e. like + # `lo`-loopback for TCP) for the process-tree-global + # "root"-process (its tree-wide "mailbox") since all + # sub-actors should be able to speak to their root + # actor over that channel. + # + # ?TODO, per-OS non-network-proto alt options? + # -[ ] on linux we should be able to always use UDS? + # + raddrs: list[Address] = _state._runtime_vars['_root_addrs'] + raddrs.extend( + accept_addrs, + ) + # TODO, remove once we have also removed all usage; + # eventually all (root-)registry apis should expect > 1 addr. + _state._runtime_vars['_root_mailbox'] = raddrs[0] + # if 'chart' in actor.aid.name: + # from tractor.devx import mk_pdb + # mk_pdb().set_trace() + try: yield actor except ( diff --git a/tractor/_runtime.py b/tractor/_runtime.py index f18e0d61..c6ff9e4c 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -147,6 +147,8 @@ def get_mod_nsps2fps(mod_ns_paths: list[str]) -> dict[str, str]: return nsp2fp +_bp = False + class Actor: ''' The fundamental "runtime" concurrency primitive. @@ -272,7 +274,9 @@ class Actor: stacklevel=2, ) - registry_addrs: list[Address] = [wrap_address(arbiter_addr)] + registry_addrs: list[Address] = [ + wrap_address(arbiter_addr) + ] # marked by the process spawning backend at startup # will be None for the parent most process started manually @@ -959,6 +963,21 @@ class Actor: rvs['_is_root'] = False # obvi XD + # TODO, remove! left in just while protoing init fix! + # global _bp + # if ( + # 'chart' in self.aid.name + # and + # isinstance( + # rvs['_root_addrs'][0], + # dict, + # ) + # and + # not _bp + # ): + # _bp = True + # breakpoint() + _state._runtime_vars.update(rvs) # `SpawnSpec.reg_addrs` @@ -1455,7 +1474,12 @@ async def async_main( # be False when running as root actor and True when as # a subactor. parent_addr: UnwrappedAddress|None = None, - task_status: TaskStatus[None] = trio.TASK_STATUS_IGNORED, + task_status: TaskStatus[ + tuple[ + list[UnwrappedAddress], # accept_addrs + list[UnwrappedAddress], # reg_addrs + ] + ] = trio.TASK_STATUS_IGNORED, ) -> None: ''' @@ -1634,6 +1658,7 @@ async def async_main( # if addresses point to the same actor.. # So we need a way to detect that? maybe iterate # only on unique actor uids? + addr: UnwrappedAddress for addr in actor.reg_addrs: try: waddr = wrap_address(addr) @@ -1642,7 +1667,9 @@ async def async_main( await debug.pause() # !TODO, get rid of the local-portal crap XD + reg_portal: Portal async with get_registry(addr) as reg_portal: + accept_addr: UnwrappedAddress for accept_addr in accept_addrs: accept_addr = wrap_address(accept_addr) @@ -1658,8 +1685,12 @@ async def async_main( is_registered: bool = True - # init steps complete - task_status.started() + # init steps complete, deliver IPC-server and + # registrar addrs back to caller. + task_status.started(( + accept_addrs, + actor.reg_addrs, + )) # Begin handling our new connection back to our # parent. This is done last since we don't want to From 6cab363c513c5cbbaa075354e7e8b24333a5d7af Mon Sep 17 00:00:00 2001 From: goodboy Date: Wed, 11 Feb 2026 18:20:59 -0500 Subject: [PATCH 3/4] Catch-n-fail on stale `_root_addrs` state.. Turns out we aren't clearing the `._state._runtime_vars` entries in between `open_root_actor` calls.. This test refinement catches that by adding runtime-vars asserts on the expected root-addrs value; ensure `_runtime_vars['_root_addrs'] ONLY match the values provided by the test's CURRENT root actor. This causes a failure when the (just added) `test_non_registrar_spawns_child` is run as part of the module suite, it's fine when run standalone. --- tests/test_multi_program.py | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/tests/test_multi_program.py b/tests/test_multi_program.py index ac80928d..eff9a731 100644 --- a/tests/test_multi_program.py +++ b/tests/test_multi_program.py @@ -16,6 +16,8 @@ from tractor._testing import ( tractor_test, ) from tractor import ( + current_actor, + _state, Actor, Context, Portal, @@ -52,7 +54,7 @@ async def test_cancel_remote_arbiter( daemon: subprocess.Popen, reg_addr: UnwrappedAddress, ): - assert not tractor.current_actor().is_arbiter + assert not current_actor().is_arbiter async with tractor.get_registry(reg_addr) as portal: await portal.cancel_actor() @@ -75,7 +77,7 @@ def test_register_duplicate_name( registry_addrs=[reg_addr], ) as an: - assert not tractor.current_actor().is_arbiter + assert not current_actor().is_arbiter p1 = await an.start_actor('doggy') p2 = await an.start_actor('doggy') @@ -99,22 +101,42 @@ async def get_root_portal( and ensure it's contact info is the same as our immediate parent. ''' - await ctx.started() + sub: Actor = current_actor() + rtvs: dict = _state._runtime_vars + raddrs: list[UnwrappedAddress] = rtvs['_root_addrs'] + + # await tractor.pause() + # XXX, in case the sub->root discovery breaks you might need + # this (i know i did Xp)!! + # from tractor.devx import mk_pdb + # mk_pdb().set_trace() + + assert ( + len(raddrs) == 1 + and + list(sub._parent_chan.raddr.unwrap()) in raddrs + ) + + # connect back to our immediate parent which should also + # be the actor-tree's root. from tractor._discovery import get_root ptl: Portal async with get_root() as ptl: root_aid: Aid = ptl.chan.aid - parent_ptl: Portal = tractor.current_actor().get_parent() + parent_ptl: Portal = current_actor().get_parent() assert ( root_aid.name == 'root' and parent_ptl.chan.aid == root_aid ) + await ctx.started() def test_non_registrar_spawns_child( daemon: subprocess.Popen, reg_addr: UnwrappedAddress, + loglevel: str, + debug_mode: bool, ): ''' Ensure a non-regristar (serving) root actor can spawn a sub and @@ -128,6 +150,8 @@ def test_non_registrar_spawns_child( async def main(): async with tractor.open_nursery( registry_addrs=[reg_addr], + loglevel=loglevel, + debug_mode=debug_mode, ) as an: actor: Actor = tractor.current_actor() @@ -140,7 +164,7 @@ def test_non_registrar_spawns_child( async with sub_ptl.open_context( get_root_portal, ) as (ctx, first): - pass + print('Waiting for `sub` to connect back to us..') await an.cancel() From 2ed9e65530e8ece738b57cb1c8a473404ed45815 Mon Sep 17 00:00:00 2001 From: goodboy Date: Wed, 11 Feb 2026 18:30:07 -0500 Subject: [PATCH 4/4] Clear rtvs state on root shutdown.. Fixes the bug discovered in last test update, not sure how this wasn't caught already XD --- tractor/_root.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tractor/_root.py b/tractor/_root.py index 2330449a..f480a619 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -616,6 +616,13 @@ async def open_root_actor( ): _state._runtime_vars['_debug_mode'] = False + # !XXX, clear ALL prior contact info state, this is MEGA + # important if you are opening the runtime multiple times + # from the same parent process (like in our test + # harness)! + _state._runtime_vars['_root_addrs'].clear() + _state._runtime_vars['_root_mailbox'] = None + _state._current_actor = None _state._last_actor_terminated = actor