From 0cd0b633f1793b17f94ed0ba07c6540e44c60606 Mon Sep 17 00:00:00 2001 From: goodboy Date: Thu, 23 Apr 2026 15:30:39 -0400 Subject: [PATCH] Scrub inherited FDs in fork-child prelude MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements fix-direction (1)/blunt-close-all-FDs from b71705bd (`subint_forkserver` nested-cancel hang diag), targeting the multi-level cancel-cascade deadlock in `test_nested_multierrors[subint_forkserver]`. The diagnosis doc voted for surgical FD cleanup via `actor.ipc_server` handle as the cleanest approach, but going blunt is actually the right call: after `os.fork()`, the child immediately enters `_actor_child_main()` which opens its OWN IPC sockets / wakeup-fd / epoll-fd / etc. — none of the parent's FDs are needed. Closing everything except stdio is safe AND defends against future listener/IPC additions to the parent inheriting silently into children. Deats, - new `_close_inherited_fds(keep={0,1,2}) -> int` helper. Linux fast-path enumerates `/proc/self/fd`; POSIX fallback uses `RLIMIT_NOFILE` range. Matches the stdlib `subprocess._posixsubprocess.close_fds` strategy. Returns close-count for sanity logging - wire into `fork_from_worker_thread._worker()`'s post-fork child prelude — runs immediately after the pid-pipe `os.close(rfd/wfd)`, before the user `child_target` callable executes - docstring cross-refs the diagnosis doc + spells out the FD-inheritance-cascade mechanism and why the close-all approach is safe for our spawn shape Validation pending: re-run `test_nested_multierrors[subint_forkserver]` to confirm the deadlock is gone. (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code --- tractor/spawn/_subint_forkserver.py | 69 ++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/tractor/spawn/_subint_forkserver.py b/tractor/spawn/_subint_forkserver.py index 159d1462..34c5dd97 100644 --- a/tractor/spawn/_subint_forkserver.py +++ b/tractor/spawn/_subint_forkserver.py @@ -195,6 +195,69 @@ except ImportError: _has_subints: bool = False +def _close_inherited_fds( + keep: frozenset[int] = frozenset({0, 1, 2}), +) -> int: + ''' + Close every open file descriptor in the current process + EXCEPT those in `keep` (default: stdio only). + + Intended as the first thing a post-`os.fork()` child runs + after closing any communication pipes it knows about. This + is the fork-child FD hygiene discipline that + `subprocess.Popen(close_fds=True)` applies by default for + its exec-based children, but which we have to implement + ourselves because our `fork_from_worker_thread()` primitive + deliberately does NOT exec. + + Why it matters + -------------- + Without this, a forkserver-spawned subactor inherits the + parent actor's IPC listener sockets, trio-epoll fd, trio + wakeup-pipe, peer-channel sockets, etc. If that subactor + then itself forkserver-spawns a grandchild, the grandchild + inherits the FDs transitively from *both* its direct + parent AND the root actor — IPC message routing becomes + ambiguous and the cancel cascade deadlocks. See + `ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md` + for the full diagnosis + the empirical repro. + + Fresh children will open their own IPC sockets via + `_actor_child_main()`, so they don't need any of the + parent's FDs. + + Returns the count of fds that were successfully closed — + useful for sanity-check logging at callsites. + + ''' + # Enumerate open fds via `/proc/self/fd` on Linux (the fast + + # precise path); fall back to `RLIMIT_NOFILE` range close on + # other platforms. Matches stdlib + # `subprocess._posixsubprocess.close_fds` strategy. + try: + fd_names: list[str] = os.listdir('/proc/self/fd') + candidates: list[int] = [ + int(n) for n in fd_names if n.isdigit() + ] + except (FileNotFoundError, PermissionError): + import resource + soft, _ = resource.getrlimit(resource.RLIMIT_NOFILE) + candidates = list(range(3, soft)) + + closed: int = 0 + for fd in candidates: + if fd in keep: + continue + try: + os.close(fd) + closed += 1 + except OSError: + # fd was already closed (race with listdir) or + # otherwise unclosable — either is fine. + pass + return closed + + def _format_child_exit( status: int, ) -> str: @@ -302,9 +365,13 @@ def fork_from_worker_thread( pid: int = os.fork() if pid == 0: # CHILD: close the pid-pipe ends (we don't use - # them here), run the user callable if any, exit. + # them here), then scrub ALL other inherited FDs + # so the child starts with a clean slate + # (stdio-only). Critical for multi-level spawn + # trees — see `_close_inherited_fds()` docstring. os.close(rfd) os.close(wfd) + _close_inherited_fds() rc: int = 0 if child_target is not None: try: