From 2ee44a6fdd11b1750ae1363489b59e7bab4c47ba Mon Sep 17 00:00:00 2001 From: goodboy Date: Wed, 6 May 2026 14:11:51 -0400 Subject: [PATCH] Fix shutdown deadlock on UDS unlink race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wrap `os.unlink()` in `close_listener()` with a `FileNotFoundError` guard — under concurrent pytest sessions the sock-file can already be reaped. Without this the raise aborts `_serve_ipc_eps`'s finally before `_shutdown.set()`, deadlocking `wait_for_shutdown()` on `actor.cancel()`. Also, - close each endpoint independently in the finally so one raise doesn't strand the rest. - always signal `_shutdown.set()` regardless of remaining ep count. (this commit msg was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code --- tractor/ipc/_server.py | 32 ++++++++++++++++++++++---------- tractor/ipc/_uds.py | 13 ++++++++++++- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/tractor/ipc/_server.py b/tractor/ipc/_server.py index 3fd965c5..9701ec6d 100644 --- a/tractor/ipc/_server.py +++ b/tractor/ipc/_server.py @@ -1122,20 +1122,32 @@ async def _serve_ipc_eps( ) finally: + # close every endpoint INDEPENDENTLY: a close raising + # mid-iter (e.g. UDS `os.unlink` racing concurrent reap) must + # not strand the rest of the eps + must not skip the + # `_shutdown.set()` below. if eps: addr: Address ep: Endpoint - for addr, ep in server.epsdict().items(): - ep.close_listener() - server._endpoints.remove(ep) + for addr, ep in list(server.epsdict().items()): + try: + ep.close_listener() + except Exception as ep_close_err: + log.exception( + f'Endpoint close raised, continuing teardown\n' + f' |_{ep!r}\n' + f' |_{ep_close_err!r}\n' + ) + finally: + try: + server._endpoints.remove(ep) + except ValueError: + pass - # actor = _state.current_actor() - # if actor.is_arbiter: - # import pdbp; pdbp.set_trace() - - # signal the server is "shutdown"/"terminated" - # since no more active endpoints are active. - if not server._endpoints: + # always signal "shutdown" so `actor.cancel()` → + # `ipc_server.wait_for_shutdown()` doesn't deadlock when an + # endpoint close raised above. + if server._shutdown is not None: server._shutdown.set() @acm diff --git a/tractor/ipc/_uds.py b/tractor/ipc/_uds.py index 3b214f6a..8c57664d 100644 --- a/tractor/ipc/_uds.py +++ b/tractor/ipc/_uds.py @@ -344,7 +344,18 @@ def close_listener( ''' lstnr.socket.close() - os.unlink(addr.sockpath) + # tolerate the sock-file being already gone — under concurrent + # pytest sessions sharing the bindspace dir, another session's + # reap path can unlink it first; raising here aborts the + # `_serve_ipc_eps` finally before `_shutdown.set()`, deadlocking + # `wait_for_shutdown()` on `actor.cancel()`. + try: + os.unlink(addr.sockpath) + except FileNotFoundError: + log.warning( + f'UDS sock-file already unlinked, skipping\n' + f' |_{addr.sockpath}\n' + ) async def open_unix_socket_w_passcred(