Fix shutdown deadlock on UDS unlink race
Wrap `os.unlink()` in `close_listener()` with a `FileNotFoundError` guard — under concurrent pytest sessions the sock-file can already be reaped. Without this guard, the raised error aborts the `finally` block of `_serve_ipc_eps` before `_shutdown.set()` runs, deadlocking `wait_for_shutdown()` on `actor.cancel()`.

Also,
- close each endpoint independently in the `finally` so one raise doesn't strand the rest.
- always signal `_shutdown.set()` regardless of the remaining ep count.

(this commit msg was generated in some part by [`claude-code`][claude-code-gh])

[claude-code-gh]: https://github.com/anthropics/claude-code

subint_forkserver_backend
parent
7b14fdcd96
commit
2ee44a6fdd
|
|
@ -1122,20 +1122,32 @@ async def _serve_ipc_eps(
|
|||
)
|
||||
|
||||
finally:
|
||||
# close every endpoint INDEPENDENTLY: a close raising
|
||||
# mid-iter (e.g. UDS `os.unlink` racing concurrent reap) must
|
||||
# not strand the rest of the eps + must not skip the
|
||||
# `_shutdown.set()` below.
|
||||
if eps:
|
||||
addr: Address
|
||||
ep: Endpoint
|
||||
for addr, ep in server.epsdict().items():
|
||||
ep.close_listener()
|
||||
server._endpoints.remove(ep)
|
||||
for addr, ep in list(server.epsdict().items()):
|
||||
try:
|
||||
ep.close_listener()
|
||||
except Exception as ep_close_err:
|
||||
log.exception(
|
||||
f'Endpoint close raised, continuing teardown\n'
|
||||
f' |_{ep!r}\n'
|
||||
f' |_{ep_close_err!r}\n'
|
||||
)
|
||||
finally:
|
||||
try:
|
||||
server._endpoints.remove(ep)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# actor = _state.current_actor()
|
||||
# if actor.is_arbiter:
|
||||
# import pdbp; pdbp.set_trace()
|
||||
|
||||
# signal the server is "shutdown"/"terminated"
|
||||
# since no more active endpoints are active.
|
||||
if not server._endpoints:
|
||||
# always signal "shutdown" so `actor.cancel()` →
|
||||
# `ipc_server.wait_for_shutdown()` doesn't deadlock when an
|
||||
# endpoint close raised above.
|
||||
if server._shutdown is not None:
|
||||
server._shutdown.set()
|
||||
|
||||
@acm
|
||||
|
|
|
|||
|
|
@ -344,7 +344,18 @@ def close_listener(
|
|||
|
||||
'''
|
||||
lstnr.socket.close()
|
||||
os.unlink(addr.sockpath)
|
||||
# tolerate the sock-file being already gone — under concurrent
|
||||
# pytest sessions sharing the bindspace dir, another session's
|
||||
# reap path can unlink it first; raising here aborts the
|
||||
# `_serve_ipc_eps` finally before `_shutdown.set()`, deadlocking
|
||||
# `wait_for_shutdown()` on `actor.cancel()`.
|
||||
try:
|
||||
os.unlink(addr.sockpath)
|
||||
except FileNotFoundError:
|
||||
log.warning(
|
||||
f'UDS sock-file already unlinked, skipping\n'
|
||||
f' |_{addr.sockpath}\n'
|
||||
)
|
||||
|
||||
|
||||
async def open_unix_socket_w_passcred(
|
||||
|
|
|
|||
Loading…
Reference in New Issue