diff --git a/tractor/_testing/_reap.py b/tractor/_testing/_reap.py
index 42588c3d..d4620461 100644
--- a/tractor/_testing/_reap.py
+++ b/tractor/_testing/_reap.py
@@ -231,7 +231,8 @@ def find_runaway_subactors(
`cpu_threshold` (default 95%) — the smoking-gun
signature of a runaway tight-loop bug (e.g. a C-level
`recvfrom` loop on a closed socket that missed EOF
- detection; #452-class issue).
+ detection — see
+ `ai/conc-anal/trio_wakeup_socketpair_busy_loop_under_fork_issue.md`).
`cpu_percent(interval=sample_interval)` is the
canonical psutil API for a "what %CPU is this proc
@@ -737,7 +738,7 @@ def _track_orphaned_uds_per_test():
`wait_for_actor`/`find_actor` discovery probes can
accidentally hit (FileExistsError on rebind, or
epoll register on a half-closed peer-FIN'd fd → see
- issue #452). Catching the leak the test that caused
+ issue #454). Catching the leak the test that caused
it (vs. blanket session-end sweep) makes blame
obvious + prevents cascade flakiness.
@@ -813,10 +814,12 @@ def _detect_runaway_subactors_per_test():
**Does NOT kill the runaway** — by design.
The point of this fixture is to make tight-loop bugs
(e.g. C-level `recvfrom` loop on a closed socket
- that missed EOF detection — issue #452-class) loudly
- visible AT the test that triggers, while keeping
- the live pid available for hands-on diagnosis. The
- session-end `_reap_orphaned_subactors` fixture will
+ that missed EOF detection — see
+ `ai/conc-anal/trio_wakeup_socketpair_busy_loop_under_fork_issue.md`)
+ loudly visible AT the test that triggers, while
+ keeping the live pid available for hands-on
+ diagnosis. The session-end
+ `_reap_orphaned_subactors` fixture will
SIGINT-then-SIGKILL any survivors when the test
session completes normally; if the user Ctrl-C's
pytest mid-warning, the pid stays alive for as long
@@ -892,8 +895,10 @@ def _detect_runaway_subactors_per_test():
# by THIS test that survived a normal teardown
# (i.e. parent's `hard_kill` SIGKILL didn't actually
# stop the runaway because it was in C tight-loop
- # somewhere unreachable to signals — see issue #452
- # forkserver-worker post-fork-close gap).
+ # somewhere unreachable to signals — see
+ # `ai/conc-anal/trio_wakeup_socketpair_busy_loop_under_fork_issue.md`
+ # for the canonical fork-spawn forkserver-worker
+ # post-fork-close gap).
post_runaways: list[tuple[int, float, str]] = (
find_runaway_subactors(
parent_pid,
diff --git a/tractor/spawn/_reap.py b/tractor/spawn/_reap.py
new file mode 100644
index 00000000..bf2c104d
--- /dev/null
+++ b/tractor/spawn/_reap.py
@@ -0,0 +1,183 @@
+# tractor: structured concurrent "actors".
+# Copyright 2018-eternity Tyler Goodlet.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see .
+
+'''
+Post-mortem subactor cleanup primitives — things the parent
+runtime has to clean up because the dead-or-SIGKILL'd child
+couldn't.
+
+Sibling of `tractor._testing._reap` which is the test-harness
+equivalent (orphan-pid + leaked-shm + leaked-UDS-sock sweeper
+fixtures). This module is the spawn-layer counterpart, called
+inline from `hard_kill` and the broader subactor reap path.
+
+Today this is just `unlink_uds_bind_addrs()`. As future
+post-mortem cleanup needs surface (e.g. `/dev/shm` segment
+unlink for hard-crashed actors, leaked-pidfile cleanup), they
+land here too.
+
+Future-work TODO — authoritative UDS bind-addr tracking
+-------------------------------------------------------
+
+`unlink_uds_bind_addrs()` currently has two cleanup paths:
+
+1. Explicit `bind_addrs` (when parent set them at spawn time)
+2. **Convention-based reconstruction** —
+ `/tractor/@.sock` — for the
+ common case where the subactor self-assigned a random sock
+ via `UDSAddress.get_random()`.
+
+Path (2) hardcodes the `@.sock` convention from
+`tractor.ipc._uds.UDSAddress`. If that convention ever
+changes — or the subactor binds to a non-default
+`bindspace`/`filedir` — we'll silently fail to unlink.
+
+A more authoritative approach would be:
+
+- Subactors register their bound UDS sockpaths in a
+ per-process registry inside `tractor.ipc._uds` at
+ `start_listener()` time.
+- The subactor reports its bound sockpath(s) back to the
+ parent over IPC immediately post-bind (extension to
+ `SpawnSpec` reply / a new handshake msg).
+- Parent caches the subactor's authoritative sockpaths.
+- `unlink_uds_bind_addrs()` checks the cache FIRST, falls
+ back to convention-reconstruction if the subactor died
+ before reporting (which is the SIGKILL case this fn
+ primarily exists for).
+
+Tracked as future work in #454 (the parent UDS-leak
+issue this module addresses); a separate issue may be
+filed if/when the registry impl is scoped.
+
+See also #452 — the discovery-client `CLOSE_WAIT` TCP
+fd leak. Different bug class but same broader theme of
+"fork-spawn unmasked latent cleanup gaps".
+
+'''
+from __future__ import annotations
+
+import os
+from typing import TYPE_CHECKING
+
+import trio
+
+from tractor.discovery._addr import (
+ UnwrappedAddress,
+ wrap_address,
+)
+from tractor.ipc._uds import UDSAddress
+from tractor.log import get_logger
+
+
+if TYPE_CHECKING:
+ from tractor.runtime._runtime import Actor
+
+
+log = get_logger('tractor')
+
+
+def unlink_uds_bind_addrs(
+ proc: trio.Process,
+ *,
+ bind_addrs: list[UnwrappedAddress] | None = None,
+ subactor: Actor | None = None,
+) -> None:
+ '''
+ Best-effort post-mortem cleanup of any UDS sock-files
+ a hard-killed subactor was bound to.
+
+ SIGKILL bypasses Python execution → the subactor's
+ `_serve_ipc_eps` `finally:` block (which normally calls
+ `os.unlink(addr.sockpath)`) never runs. Without this
+ parent-side cleanup, the dead subactor's
+ `${XDG_RUNTIME_DIR}/tractor/@.sock` file
+ accumulates on the filesystem (see issue #454 + the
+ autouse `_track_orphaned_uds_per_test` fixture).
+
+ Two cleanup paths, in order:
+
+ 1. **Explicit `bind_addrs`** — when the parent set the
+ subactor's bind addrs at spawn time, unlink each
+ UDS-flavored sockpath directly.
+ 2. **Self-assigned reconstruction** — when
+ `bind_addrs` is empty (the common case: subactor
+ picked its own random sock via
+ `UDSAddress.get_random()`), reconstruct the path
+ from `(subactor.aid.name, proc.pid)` using the
+ same `@.sock` convention. We can do this
+ because the subactor uses its OWN `os.getpid()` at
+ bind time, which equals `proc.pid` from the
+ parent's view.
+
+ Idempotent: `FileNotFoundError` (graceful exit
+ already-unlinked, or sock never bound under early-
+ spawn cancel) is silenced; other `OSError`s log a
+ warning but never raise. TCP / non-UDS bind addrs are
+ skipped.
+
+ '''
+ sockpaths: list[str] = []
+
+ # path 1: explicit bind_addrs set at spawn time
+ for unwrapped in (bind_addrs or ()):
+ try:
+ addr = wrap_address(unwrapped)
+ except Exception:
+ log.exception(
+ f'Failed to wrap addr for UDS post-kill cleanup '
+ f'— skipping {unwrapped!r}\n'
+ )
+ continue
+ if isinstance(addr, UDSAddress):
+ sockpaths.append(str(addr.sockpath))
+
+ # path 2: reconstruct from subactor name + proc pid
+ # for the random-self-assign case (bind_addrs=None)
+ #
+ # TODO authoritative tracking — see module docstring.
+ if (
+ not sockpaths
+ and subactor is not None
+ and proc.pid is not None
+ ):
+ sockname: str = f'{subactor.aid.name}@{proc.pid}.sock'
+ sockpath: str = str(
+ UDSAddress.def_bindspace / sockname
+ )
+ sockpaths.append(sockpath)
+
+ for sockpath in sockpaths:
+ try:
+ os.unlink(sockpath)
+ log.runtime(
+ f'Unlinked orphaned UDS sock-file post-SIGKILL\n'
+ f' |_{proc}\n'
+ f' |_{sockpath}\n'
+ )
+ except FileNotFoundError:
+ # raced — subactor cleaned up before SIGKILL,
+ # OR sockfile never bound (early-spawn cancel),
+ # OR transport wasn't UDS this run.
+ pass
+ except OSError as exc:
+ log.warning(
+ f'Failed to unlink subactor UDS sock-file '
+ f'post-SIGKILL\n'
+ f' |_{proc}\n'
+ f' |_{sockpath}\n'
+ f' |_{exc!r}\n'
+ )
diff --git a/tractor/spawn/_spawn.py b/tractor/spawn/_spawn.py
index c22eeb70..df3e928b 100644
--- a/tractor/spawn/_spawn.py
+++ b/tractor/spawn/_spawn.py
@@ -40,7 +40,10 @@ from tractor.runtime._state import (
_runtime_vars,
)
from tractor.log import get_logger
-from tractor.discovery._addr import UnwrappedAddress
+from tractor.discovery._addr import (
+ UnwrappedAddress,
+)
+from ._reap import unlink_uds_bind_addrs
from tractor.runtime._portal import Portal
from tractor.runtime._runtime import Actor
from tractor.msg import types as msgtypes
@@ -279,6 +282,16 @@ async def hard_kill(
# whilst also hacking on it XD
# terminate_after: int = 99999,
+ *,
+ # Subactor's bind addresses + subactor record, used
+ # for post-SIGKILL UDS sockpath cleanup. Optional for
+ # legacy callers; new call sites should pass at least
+ # `subactor` (which lets us reconstruct the sock path
+ # from `aid.name + proc.pid` when `bind_addrs` is
+ # empty/self-assigned). See `._reap.unlink_uds_bind_addrs()`.
+ bind_addrs: list[UnwrappedAddress] | None = None,
+ subactor: Actor | None = None,
+
) -> None:
'''
Un-gracefully terminate an OS level `trio.Process` after timeout.
@@ -366,6 +379,21 @@ async def hard_kill(
)
proc.kill()
+ # Post-mortem UDS sockpath cleanup. SIGKILL bypassed
+ # the subactor's normal `os.unlink(addr.sockpath)` in
+ # `_serve_ipc_eps`'s `finally:`; the parent has the
+ # bind addrs (or can reconstruct from name + pid) so
+ # we do it here. Runs UNCONDITIONALLY (graceful-exit
+ # case is a no-op via `FileNotFoundError` skip in the
+ # helper) so the cleanup also covers the "cancelled
+ # during spawn" path where the subactor never reached
+ # its IPC server finally block.
+ unlink_uds_bind_addrs(
+ proc,
+ bind_addrs=bind_addrs,
+ subactor=subactor,
+ )
+
async def soft_kill(
proc: ProcessType,
diff --git a/tractor/spawn/_trio.py b/tractor/spawn/_trio.py
index 3b425256..c2e3ba47 100644
--- a/tractor/spawn/_trio.py
+++ b/tractor/spawn/_trio.py
@@ -39,7 +39,7 @@ from tractor.runtime._state import (
current_actor,
is_root_process,
debug_mode,
- get_runtime_vars,
+ # get_runtime_vars,
)
from tractor.log import get_logger
from tractor.discovery._addr import UnwrappedAddress
@@ -282,7 +282,23 @@ async def trio_proc(
if proc.poll() is None:
log.cancel(f"Attempting to hard kill {proc}")
- await hard_kill(proc)
+ await hard_kill(
+ proc,
+ # NOTE, pass through so post-SIGKILL we
+ # can `os.unlink()` the subactor's
+ # orphaned UDS sock-file(s) — the
+ # subactor's own
+ # `_serve_ipc_eps`-`finally:` cleanup
+ # never runs under SIGKILL. `subactor`
+ # lets the helper reconstruct the
+ # sock path via `aid.name + proc.pid`
+ # when `bind_addrs` is the common
+ # self-assigned-random case
+ # (bind_addrs=None at spawn). See
+ # `_unlink_uds_bind_addrs()` in `_spawn`.
+ bind_addrs=bind_addrs,
+ subactor=subactor,
+ )
log.debug(f"Joined {proc}")
else: