diff --git a/tractor/_testing/_reap.py b/tractor/_testing/_reap.py index 42588c3d..d4620461 100644 --- a/tractor/_testing/_reap.py +++ b/tractor/_testing/_reap.py @@ -231,7 +231,8 @@ def find_runaway_subactors( `cpu_threshold` (default 95%) — the smoking-gun signature of a runaway tight-loop bug (e.g. a C-level `recvfrom` loop on a closed socket that missed EOF - detection; #452-class issue). + detection — see + `ai/conc-anal/trio_wakeup_socketpair_busy_loop_under_fork_issue.md`). `cpu_percent(interval=sample_interval)` is the canonical psutil API for a "what %CPU is this proc @@ -737,7 +738,7 @@ def _track_orphaned_uds_per_test(): `wait_for_actor`/`find_actor` discovery probes can accidentally hit (FileExistsError on rebind, or epoll register on a half-closed peer-FIN'd fd → see - issue #452). Catching the leak the test that caused + issue #454). Catching the leak the test that caused it (vs. blanket session-end sweep) makes blame obvious + prevents cascade flakiness. @@ -813,10 +814,12 @@ def _detect_runaway_subactors_per_test(): **Does NOT kill the runaway** — by design. The point of this fixture is to make tight-loop bugs (e.g. C-level `recvfrom` loop on a closed socket - that missed EOF detection — issue #452-class) loudly - visible AT the test that triggers, while keeping - the live pid available for hands-on diagnosis. The - session-end `_reap_orphaned_subactors` fixture will + that missed EOF detection — see + `ai/conc-anal/trio_wakeup_socketpair_busy_loop_under_fork_issue.md`) + loudly visible AT the test that triggers, while + keeping the live pid available for hands-on + diagnosis. The session-end + `_reap_orphaned_subactors` fixture will SIGINT-then-SIGKILL any survivors when the test session completes normally; if the user Ctrl-C's pytest mid-warning, the pid stays alive for as long @@ -892,8 +895,10 @@ def _detect_runaway_subactors_per_test(): # by THIS test that survived a normal teardown # (i.e. parent's `hard_kill` SIGKILL didn't actually # stop the runaway because it was in C tight-loop - # somewhere unreachable to signals — see issue #452 - # forkserver-worker post-fork-close gap). + # somewhere unreachable to signals — see + # `ai/conc-anal/trio_wakeup_socketpair_busy_loop_under_fork_issue.md` + # for the canonical fork-spawn forkserver-worker + # post-fork-close gap). post_runaways: list[tuple[int, float, str]] = ( find_runaway_subactors( parent_pid, diff --git a/tractor/spawn/_reap.py b/tractor/spawn/_reap.py new file mode 100644 index 00000000..bf2c104d --- /dev/null +++ b/tractor/spawn/_reap.py @@ -0,0 +1,183 @@ +# tractor: structured concurrent "actors". +# Copyright 2018-eternity Tyler Goodlet. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +''' +Post-mortem subactor cleanup primitives — things the parent +runtime has to clean up because the dead-or-SIGKILL'd child +couldn't. + +Sibling of `tractor._testing._reap` which is the test-harness +equivalent (orphan-pid + leaked-shm + leaked-UDS-sock sweeper +fixtures). This module is the spawn-layer counterpart, called +inline from `hard_kill` and the broader subactor reap path. + +Today this is just `unlink_uds_bind_addrs()`. As future +post-mortem cleanup needs surface (e.g. `/dev/shm` segment +unlink for hard-crashed actors, leaked-pidfile cleanup), they +land here too. + +Future-work TODO — authoritative UDS bind-addr tracking +------------------------------------------------------- + +`unlink_uds_bind_addrs()` currently has two cleanup paths: + +1. Explicit `bind_addrs` (when parent set them at spawn time) +2. **Convention-based reconstruction** — + `/tractor/@.sock` — for the + common case where the subactor self-assigned a random sock + via `UDSAddress.get_random()`. + +Path (2) hardcodes the `@.sock` convention from +`tractor.ipc._uds.UDSAddress`. If that convention ever +changes — or the subactor binds to a non-default +`bindspace`/`filedir` — we'll silently fail to unlink. + +A more authoritative approach would be: + +- Subactors register their bound UDS sockpaths in a + per-process registry inside `tractor.ipc._uds` at + `start_listener()` time. +- The subactor reports its bound sockpath(s) back to the + parent over IPC immediately post-bind (extension to + `SpawnSpec` reply / a new handshake msg). +- Parent caches the subactor's authoritative sockpaths. +- `unlink_uds_bind_addrs()` checks the cache FIRST, falls + back to convention-reconstruction if the subactor died + before reporting (which is the SIGKILL case this fn + primarily exists for). + +Tracked as future work in #454 (the parent UDS-leak +issue this module addresses); a separate issue may be +filed if/when the registry impl is scoped. + +See also #452 — the discovery-client `CLOSE_WAIT` TCP +fd leak. Different bug class but same broader theme of +"fork-spawn unmasked latent cleanup gaps". + +''' +from __future__ import annotations + +import os +from typing import TYPE_CHECKING + +import trio + +from tractor.discovery._addr import ( + UnwrappedAddress, + wrap_address, +) +from tractor.ipc._uds import UDSAddress +from tractor.log import get_logger + + +if TYPE_CHECKING: + from tractor.runtime._runtime import Actor + + +log = get_logger('tractor') + + +def unlink_uds_bind_addrs( + proc: trio.Process, + *, + bind_addrs: list[UnwrappedAddress] | None = None, + subactor: Actor | None = None, +) -> None: + ''' + Best-effort post-mortem cleanup of any UDS sock-files + a hard-killed subactor was bound to. + + SIGKILL bypasses Python execution → the subactor's + `_serve_ipc_eps` `finally:` block (which normally calls + `os.unlink(addr.sockpath)`) never runs. Without this + parent-side cleanup, the dead subactor's + `${XDG_RUNTIME_DIR}/tractor/@.sock` file + accumulates on the filesystem (see issue #454 + the + autouse `_track_orphaned_uds_per_test` fixture). + + Two cleanup paths, in order: + + 1. **Explicit `bind_addrs`** — when the parent set the + subactor's bind addrs at spawn time, unlink each + UDS-flavored sockpath directly. + 2. **Self-assigned reconstruction** — when + `bind_addrs` is empty (the common case: subactor + picked its own random sock via + `UDSAddress.get_random()`), reconstruct the path + from `(subactor.aid.name, proc.pid)` using the + same `@.sock` convention. We can do this + because the subactor uses its OWN `os.getpid()` at + bind time, which equals `proc.pid` from the + parent's view. + + Idempotent: `FileNotFoundError` (graceful exit + already-unlinked, or sock never bound under early- + spawn cancel) is silenced; other `OSError`s log a + warning but never raise. TCP / non-UDS bind addrs are + skipped. + + ''' + sockpaths: list[str] = [] + + # path 1: explicit bind_addrs set at spawn time + for unwrapped in (bind_addrs or ()): + try: + addr = wrap_address(unwrapped) + except Exception: + log.exception( + f'Failed to wrap addr for UDS post-kill cleanup ' + f'— skipping {unwrapped!r}\n' + ) + continue + if isinstance(addr, UDSAddress): + sockpaths.append(str(addr.sockpath)) + + # path 2: reconstruct from subactor name + proc pid + # for the random-self-assign case (bind_addrs=None) + # + # TODO authoritative tracking — see module docstring. + if ( + not sockpaths + and subactor is not None + and proc.pid is not None + ): + sockname: str = f'{subactor.aid.name}@{proc.pid}.sock' + sockpath: str = str( + UDSAddress.def_bindspace / sockname + ) + sockpaths.append(sockpath) + + for sockpath in sockpaths: + try: + os.unlink(sockpath) + log.runtime( + f'Unlinked orphaned UDS sock-file post-SIGKILL\n' + f' |_{proc}\n' + f' |_{sockpath}\n' + ) + except FileNotFoundError: + # raced — subactor cleaned up before SIGKILL, + # OR sockfile never bound (early-spawn cancel), + # OR transport wasn't UDS this run. + pass + except OSError as exc: + log.warning( + f'Failed to unlink subactor UDS sock-file ' + f'post-SIGKILL\n' + f' |_{proc}\n' + f' |_{sockpath}\n' + f' |_{exc!r}\n' + ) diff --git a/tractor/spawn/_spawn.py b/tractor/spawn/_spawn.py index c22eeb70..df3e928b 100644 --- a/tractor/spawn/_spawn.py +++ b/tractor/spawn/_spawn.py @@ -40,7 +40,10 @@ from tractor.runtime._state import ( _runtime_vars, ) from tractor.log import get_logger -from tractor.discovery._addr import UnwrappedAddress +from tractor.discovery._addr import ( + UnwrappedAddress, +) +from ._reap import unlink_uds_bind_addrs from tractor.runtime._portal import Portal from tractor.runtime._runtime import Actor from tractor.msg import types as msgtypes @@ -279,6 +282,16 @@ async def hard_kill( # whilst also hacking on it XD # terminate_after: int = 99999, + *, + # Subactor's bind addresses + subactor record, used + # for post-SIGKILL UDS sockpath cleanup. Optional for + # legacy callers; new call sites should pass at least + # `subactor` (which lets us reconstruct the sock path + # from `aid.name + proc.pid` when `bind_addrs` is + # empty/self-assigned). See `._reap.unlink_uds_bind_addrs()`. + bind_addrs: list[UnwrappedAddress] | None = None, + subactor: Actor | None = None, + ) -> None: ''' Un-gracefully terminate an OS level `trio.Process` after timeout. @@ -366,6 +379,21 @@ async def hard_kill( ) proc.kill() + # Post-mortem UDS sockpath cleanup. SIGKILL bypassed + # the subactor's normal `os.unlink(addr.sockpath)` in + # `_serve_ipc_eps`'s `finally:`; the parent has the + # bind addrs (or can reconstruct from name + pid) so + # we do it here. Runs UNCONDITIONALLY (graceful-exit + # case is a no-op via `FileNotFoundError` skip in the + # helper) so the cleanup also covers the "cancelled + # during spawn" path where the subactor never reached + # its IPC server finally block. + unlink_uds_bind_addrs( + proc, + bind_addrs=bind_addrs, + subactor=subactor, + ) + async def soft_kill( proc: ProcessType, diff --git a/tractor/spawn/_trio.py b/tractor/spawn/_trio.py index 3b425256..c2e3ba47 100644 --- a/tractor/spawn/_trio.py +++ b/tractor/spawn/_trio.py @@ -39,7 +39,7 @@ from tractor.runtime._state import ( current_actor, is_root_process, debug_mode, - get_runtime_vars, + # get_runtime_vars, ) from tractor.log import get_logger from tractor.discovery._addr import UnwrappedAddress @@ -282,7 +282,23 @@ async def trio_proc( if proc.poll() is None: log.cancel(f"Attempting to hard kill {proc}") - await hard_kill(proc) + await hard_kill( + proc, + # NOTE, pass through so post-SIGKILL we + # can `os.unlink()` the subactor's + # orphaned UDS sock-file(s) — the + # subactor's own + # `_serve_ipc_eps`-`finally:` cleanup + # never runs under SIGKILL. `subactor` + # lets the helper reconstruct the + # sock path via `aid.name + proc.pid` + # when `bind_addrs` is the common + # self-assigned-random case + # (bind_addrs=None at spawn). See + # `_unlink_uds_bind_addrs()` in `_spawn`. + bind_addrs=bind_addrs, + subactor=subactor, + ) log.debug(f"Joined {proc}") else: