Fix shutdown deadlock on UDS unlink race
Wrap `os.unlink()` in `close_listener()` with a `FileNotFoundError` guard — under concurrent pytest sessions the sock-file can already have been reaped. Without this guard the raise aborts the `finally` block of `_serve_ipc_eps` before `_shutdown.set()` runs, deadlocking `wait_for_shutdown()` on `actor.cancel()`.

Also,
- close each endpoint independently in the `finally` so one raise doesn't strand the rest.
- always signal `_shutdown.set()` regardless of the remaining ep count.

(this commit msg was generated in some part by [`claude-code`][claude-code-gh])
[claude-code-gh]: https://github.com/anthropics/claude-code

subint_forkserver_backend
parent
7b14fdcd96
commit
2ee44a6fdd
|
|
@ -1122,20 +1122,32 @@ async def _serve_ipc_eps(
|
||||||
)
|
)
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
|
# close every endpoint INDEPENDENTLY: a close raising
|
||||||
|
# mid-iter (e.g. UDS `os.unlink` racing concurrent reap) must
|
||||||
|
# not strand the rest of the eps + must not skip the
|
||||||
|
# `_shutdown.set()` below.
|
||||||
if eps:
|
if eps:
|
||||||
addr: Address
|
addr: Address
|
||||||
ep: Endpoint
|
ep: Endpoint
|
||||||
for addr, ep in server.epsdict().items():
|
for addr, ep in list(server.epsdict().items()):
|
||||||
ep.close_listener()
|
try:
|
||||||
server._endpoints.remove(ep)
|
ep.close_listener()
|
||||||
|
except Exception as ep_close_err:
|
||||||
|
log.exception(
|
||||||
|
f'Endpoint close raised, continuing teardown\n'
|
||||||
|
f' |_{ep!r}\n'
|
||||||
|
f' |_{ep_close_err!r}\n'
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
try:
|
||||||
|
server._endpoints.remove(ep)
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
# actor = _state.current_actor()
|
# always signal "shutdown" so `actor.cancel()` →
|
||||||
# if actor.is_arbiter:
|
# `ipc_server.wait_for_shutdown()` doesn't deadlock when an
|
||||||
# import pdbp; pdbp.set_trace()
|
# endpoint close raised above.
|
||||||
|
if server._shutdown is not None:
|
||||||
# signal the server is "shutdown"/"terminated"
|
|
||||||
# since no more active endpoints are active.
|
|
||||||
if not server._endpoints:
|
|
||||||
server._shutdown.set()
|
server._shutdown.set()
|
||||||
|
|
||||||
@acm
|
@acm
|
||||||
|
|
|
||||||
|
|
@ -344,7 +344,18 @@ def close_listener(
|
||||||
|
|
||||||
'''
|
'''
|
||||||
lstnr.socket.close()
|
lstnr.socket.close()
|
||||||
os.unlink(addr.sockpath)
|
# tolerate the sock-file being already gone — under concurrent
|
||||||
|
# pytest sessions sharing the bindspace dir, another session's
|
||||||
|
# reap path can unlink it first; raising here aborts the
|
||||||
|
# `_serve_ipc_eps` finally before `_shutdown.set()`, deadlocking
|
||||||
|
# `wait_for_shutdown()` on `actor.cancel()`.
|
||||||
|
try:
|
||||||
|
os.unlink(addr.sockpath)
|
||||||
|
except FileNotFoundError:
|
||||||
|
log.warning(
|
||||||
|
f'UDS sock-file already unlinked, skipping\n'
|
||||||
|
f' |_{addr.sockpath}\n'
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
async def open_unix_socket_w_passcred(
|
async def open_unix_socket_w_passcred(
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue