Replace sleep with active poll in `daemon` fixture
First draft at resolving, https://github.com/goodboy/tractor/issues/424 `tests.conftest.py.daemon()` previously used a blind `time.sleep(_PROC_SPAWN_WAIT + uds_bonus + ci_bonus)` to "wait for the daemon to come up" before yielding the proc to the test. Two problems: 1. **Racy under load** — sleep is fixed at design time; loaded boxes / cold starts / fork-spawn cost spikes blow past it, leading to `ConnectionRefusedError` /`OSError: connect failed` flakes in `test_register_duplicate_name`. 2. **Wasteful when daemon comes up fast** — happy-path pays the FULL sleep regardless. ~3s of dead time per fixture invocation, ~10-20s per full suite run. Replace with `_wait_for_daemon_ready()` — active poll via stdlib `socket.create_connection` (TCP) or `socket.connect` (UDS) on the daemon's bind addr, with 50ms backoff and a 10s/15s deadline (CI gets extra headroom). Daemon-died-during-startup early-exit catches the case where `_PROC_SPAWN_WAIT` was silently masking daemon startup crashes. Why stdlib `socket` (Option 2 from the conc-anal doc) instead of `tractor`'s own `_root.ping_tpt_socket` closure or trio? - `tractor.run_daemon()` doesn't return from bootstrap until the runtime is fully ready to handle IPC, so probing listen-side acceptance is sufficient. - no need to do the full IPC handshake just to validate readiness. Sidesteps the `trio.run()` bootstrap cost (~50ms) per fixture too. `claude`'s verification: 10/10 runs of `tests/test_multi_program.py` pass on both `--tpt-proto=tcp` and `--tpt-proto=uds`. Per-test wall-time `test_register_duplicate_name`: 4.31s → 1.10s. Full file: ~12s → 3.27s per transport. Doc-tracked at: `ai/conc-anal/test_register_duplicate_name_daemon_connect_race_issue.md` Future work — session-scoped trio runtime in a bg thread to share fixture-side trio operations across many fixtures (currently overkill for the one fixture that needs it). (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-codesubint_forkserver_backend
parent
29f9928524
commit
ec8c4659c4
|
|
@ -7,6 +7,7 @@ import sys
|
||||||
import subprocess
|
import subprocess
|
||||||
import os
|
import os
|
||||||
import signal
|
import signal
|
||||||
|
import socket
|
||||||
import platform
|
import platform
|
||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
@ -244,6 +245,86 @@ def sig_prog(
|
||||||
assert ret
|
assert ret
|
||||||
|
|
||||||
|
|
||||||
|
def _wait_for_daemon_ready(
|
||||||
|
reg_addr: tuple,
|
||||||
|
tpt_proto: str,
|
||||||
|
*,
|
||||||
|
deadline: float = 10.0,
|
||||||
|
poll_interval: float = 0.05,
|
||||||
|
proc: subprocess.Popen|None = None,
|
||||||
|
) -> None:
|
||||||
|
'''
|
||||||
|
Active-poll the daemon's bind address until it
|
||||||
|
accepts a connection (proving it has called
|
||||||
|
`bind() + listen()` and is ready to handle IPC).
|
||||||
|
|
||||||
|
Replaces the historical blind `time.sleep()` in the
|
||||||
|
`daemon` fixture which was racy under load — see
|
||||||
|
`ai/conc-anal/test_register_duplicate_name_daemon_connect_race_issue.md`.
|
||||||
|
|
||||||
|
Uses stdlib `socket` directly (no trio runtime
|
||||||
|
bootstrap cost) — sufficient because
|
||||||
|
`tractor.run_daemon()` doesn't return from
|
||||||
|
bootstrap until the runtime is fully ready to
|
||||||
|
accept IPC.
|
||||||
|
|
||||||
|
Raises `TimeoutError` on `deadline` exceeded. If
|
||||||
|
`proc` is given, ALSO raises early if the daemon
|
||||||
|
process exits non-zero before the deadline (catches
|
||||||
|
daemon-startup-crash that the blind sleep used to
|
||||||
|
silently mask).
|
||||||
|
|
||||||
|
'''
|
||||||
|
end: float = time.monotonic() + deadline
|
||||||
|
last_exc: Exception|None = None
|
||||||
|
while time.monotonic() < end:
|
||||||
|
# Daemon-died-during-startup early-exit. Without
|
||||||
|
# this, a crashed-on-import daemon would just
|
||||||
|
# eat the full deadline before raising opaque
|
||||||
|
# TimeoutError.
|
||||||
|
if proc is not None and proc.poll() is not None:
|
||||||
|
raise RuntimeError(
|
||||||
|
f'Daemon proc exited (rc={proc.returncode}) '
|
||||||
|
f'before becoming ready to accept on '
|
||||||
|
f'{reg_addr!r}'
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
if tpt_proto == 'tcp':
|
||||||
|
# `socket.create_connection` does the
|
||||||
|
# `socket() + connect()` dance with a
|
||||||
|
# builtin timeout — perfect primitive
|
||||||
|
# for a one-shot probe.
|
||||||
|
with socket.create_connection(
|
||||||
|
reg_addr,
|
||||||
|
timeout=poll_interval,
|
||||||
|
):
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
# UDS — `reg_addr` is a `(filedir, sockname)`
|
||||||
|
# tuple per `tractor.ipc._uds.UDSAddress.unwrap`.
|
||||||
|
sockpath: str = os.path.join(*reg_addr)
|
||||||
|
sock = socket.socket(socket.AF_UNIX)
|
||||||
|
try:
|
||||||
|
sock.settimeout(poll_interval)
|
||||||
|
sock.connect(sockpath)
|
||||||
|
return
|
||||||
|
finally:
|
||||||
|
sock.close()
|
||||||
|
except (
|
||||||
|
ConnectionRefusedError,
|
||||||
|
FileNotFoundError,
|
||||||
|
OSError,
|
||||||
|
socket.timeout,
|
||||||
|
) as exc:
|
||||||
|
last_exc = exc
|
||||||
|
time.sleep(poll_interval)
|
||||||
|
raise TimeoutError(
|
||||||
|
f'Daemon never accepted on {reg_addr!r} within '
|
||||||
|
f'{deadline}s (last connect-attempt exc: '
|
||||||
|
f'{last_exc!r})'
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# TODO: factor into @cm and move to `._testing`?
|
# TODO: factor into @cm and move to `._testing`?
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def daemon(
|
def daemon(
|
||||||
|
|
@ -298,26 +379,25 @@ def daemon(
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
# TODO! we should poll for the registry socket-bind to take place
|
# Active-poll the daemon's bind address until it's
|
||||||
# and only once that's done yield to the requester!
|
# ready to accept connections — replaces the legacy
|
||||||
# -[ ] TCP: use the `._root.open_root_actor()`::`ping_tpt_socket()`
|
# blind `time.sleep(_PROC_SPAWN_WAIT + uds_bonus)`
|
||||||
# closure!
|
# which was racy under load (see
|
||||||
# -[ ] UDS: can we do something similar for 'pinging" the
|
# `ai/conc-anal/test_register_duplicate_name_daemon_connect_race_issue.md`).
|
||||||
# file-socket?
|
|
||||||
#
|
#
|
||||||
bg_daemon_spawn_delay: float = _PROC_SPAWN_WAIT
|
# Per-test deadline scales with platform: macOS/CI
|
||||||
# UDS sockets are **really** fast to bind()/listen()/connect()
|
# gets extra headroom; Linux dev boxes need very
|
||||||
# so it's often required that we delay a bit more starting
|
# little.
|
||||||
# the first actor-tree..
|
deadline: float = (
|
||||||
if tpt_proto == 'uds':
|
15.0 if (_non_linux and ci_env)
|
||||||
bg_daemon_spawn_delay += 1.6
|
else 10.0
|
||||||
|
)
|
||||||
if _non_linux and ci_env:
|
_wait_for_daemon_ready(
|
||||||
bg_daemon_spawn_delay += 1
|
reg_addr=reg_addr,
|
||||||
|
tpt_proto=tpt_proto,
|
||||||
# XXX, allow time for the sub-py-proc to boot up.
|
deadline=deadline,
|
||||||
# !TODO, see ping-polling ideas above!
|
proc=proc,
|
||||||
time.sleep(bg_daemon_spawn_delay)
|
)
|
||||||
|
|
||||||
assert not proc.returncode
|
assert not proc.returncode
|
||||||
yield proc
|
yield proc
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue