From 3b0724eba85b4014170ed95773e1e41a60d5c513 Mon Sep 17 00:00:00 2001 From: goodboy Date: Thu, 7 May 2026 22:18:29 -0400 Subject: [PATCH] Add `wait_for_peer_or_proc_death()` to `_spawn` Race `IPCServer.wait_for_peer(uid)` against the sub-proc's `.wait()` inside a `trio` nursery; whichever completes first cancels the other. Prevents the spawning task from parking forever on an unsignalled `_peer_connected[uid]` event when a sub-actor dies during boot (e.g. crashed on import before reaching `_actor_child_main`). Instead of hanging, raises `ActorFailure` w/ the proc's exit code for clean supervisor error reporting. Also, - use the new racer in `main_thread_forkserver_proc()` spawn path. - keep `proc_wait` generic so each backend passes its own callable (`trio.Process.wait`, `_ForkedProc.wait`, etc.). (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code --- tractor/spawn/_main_thread_forkserver.py | 14 ++++- tractor/spawn/_spawn.py | 66 ++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 1 deletion(-) diff --git a/tractor/spawn/_main_thread_forkserver.py b/tractor/spawn/_main_thread_forkserver.py index f54d1ae7..90d136ba 100644 --- a/tractor/spawn/_main_thread_forkserver.py +++ b/tractor/spawn/_main_thread_forkserver.py @@ -368,6 +368,7 @@ from tractor.runtime._portal import Portal from ._spawn import ( cancel_on_completion, soft_kill, + wait_for_peer_or_proc_death, ) if TYPE_CHECKING: @@ -968,7 +969,18 @@ async def main_thread_forkserver_proc( f' |_{proc}\n' ) - event, chan = await ipc_server.wait_for_peer(uid) + # race the handshake-wait against proc-death so a + # sub that dies during boot (e.g. crashed on import + # before reaching `_actor_child_main`, leaving a + # zombie + no cmdline) surfaces as `ActorFailure` + # instead of parking the spawning task forever on + # an unsignalled `_peer_connected[uid]` event. 
+ event, chan = await wait_for_peer_or_proc_death( + ipc_server, + uid, + proc_wait=proc.wait, + proc_repr=repr(proc), + ) except trio.Cancelled: cancelled_during_spawn = True diff --git a/tractor/spawn/_spawn.py b/tractor/spawn/_spawn.py index df3e928b..1f2203d3 100644 --- a/tractor/spawn/_spawn.py +++ b/tractor/spawn/_spawn.py @@ -43,6 +43,7 @@ from tractor.log import get_logger from tractor.discovery._addr import ( UnwrappedAddress, ) +from .._exceptions import ActorFailure from ._reap import unlink_uds_bind_addrs from tractor.runtime._portal import Portal from tractor.runtime._runtime import Actor @@ -106,6 +107,71 @@ else: await trio.lowlevel.wait_readable(proc.sentinel) +async def wait_for_peer_or_proc_death( + ipc_server, + uid: tuple[str, str], + # TODO? why not real types (vs. str annots)? + proc_wait: 'Callable[[], Awaitable]', + proc_repr: str = '', + +) -> 'tuple[trio.Event, Channel]': + ''' + Race `IPCServer.wait_for_peer(uid)` against the sub-proc's + own `.wait()` coroutine. Whichever completes first cancels + the other. + + Intended for every spawn-backend to detect a sub-actor that + *dies during boot* before completing the parent-handshake- + callback (e.g. crashed on import, exec'd-out, kernel-killed + pre-`_actor_child_main`). Without this race, the + handshake-wait — backed by an unsignalled `trio.Event` — + parks the spawning task forever and leaves the dead child + as a zombie since nobody calls `proc.wait()` to reap. + + On normal handshake-complete: returns `(event, chan)` + identical to a bare `wait_for_peer`. + + On proc-death-first: raises `ActorFailure` carrying the + proc's exit code, allowing the supervisor to surface a + clean error rather than hanging indefinitely. + + `proc_wait` is a 0-arg async callable returning the proc's + exit-status — kept generic so each backend can pass its + own (`trio.Process.wait`, `_ForkedProc.wait`, + `proc_waiter(mp.Process)`, etc.). + + `proc_repr` is an optional string used in the + `ActorFailure` message for diag. 
+ + ''' + result: dict = {} + + async def _await_handshake(): + event, chan = await ipc_server.wait_for_peer(uid) + result['handshake'] = (event, chan) + boot_n.cancel_scope.cancel() + + async def _await_death(): + rc = await proc_wait() + result['died'] = rc + boot_n.cancel_scope.cancel() + + async with trio.open_nursery() as boot_n: + boot_n.start_soon(_await_handshake) + boot_n.start_soon(_await_death) + + if 'handshake' in result: + return result['handshake'] + + # only reached if proc-death won (a recorded handshake returns above) + raise ActorFailure( + f'Sub-actor {uid!r} died during boot ' + f'(rc={result.get("died")!r}) before completing ' + f'parent-handshake.\n' + f' proc: {proc_repr}' + ) + + def try_set_start_method( key: SpawnMethodKey