From ec8c4659c429353a15374df2ecb7c559d7bda741 Mon Sep 17 00:00:00 2001 From: goodboy Date: Mon, 4 May 2026 20:03:41 -0400 Subject: [PATCH] Replace sleep with active poll in `daemon` fixture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First draft at resolving, https://github.com/goodboy/tractor/issues/424 `tests.conftest.py.daemon()` previously used a blind `time.sleep(_PROC_SPAWN_WAIT + uds_bonus + ci_bonus)` to "wait for the daemon to come up" before yielding the proc to the test. Two problems: 1. **Racy under load** — sleep is fixed at design time; loaded boxes / cold starts / fork-spawn cost spikes blow past it, leading to `ConnectionRefusedError` /`OSError: connect failed` flakes in `test_register_duplicate_name`. 2. **Wasteful when daemon comes up fast** — happy-path pays the FULL sleep regardless. ~3s of dead time per fixture invocation, ~10-20s per full suite run. Replace with `_wait_for_daemon_ready()` — active poll via stdlib `socket.create_connection` (TCP) or `socket.connect` (UDS) on the daemon's bind addr, with 50ms backoff and a 10s/15s deadline (CI gets extra headroom). Daemon-died-during-startup early-exit catches the case where `_PROC_SPAWN_WAIT` was silently masking daemon startup crashes. Why stdlib `socket` (Option 2 from the conc-anal doc) instead of `tractor`'s own `_root.ping_tpt_socket` closure or trio? - `tractor.run_daemon()` doesn't return from bootstrap until the runtime is fully ready to handle IPC, so probing listen-side acceptance is sufficient. - no need to do the full IPC handshake just to validate readiness. Sidesteps the `trio.run()` bootstrap cost (~50ms) per fixture too. `claude`'s verification: 10/10 runs of `tests/test_multi_program.py` pass on both `--tpt-proto=tcp` and `--tpt-proto=uds`. Per-test wall-time `test_register_duplicate_name`: 4.31s → 1.10s. Full file: ~12s → 3.27s per transport. Doc-tracked at: `ai/conc-anal/test_register_duplicate_name_daemon_connect_race_issue.md` Future work — session-scoped trio runtime in a bg thread to share fixture-side trio operations across many fixtures (currently overkill for the one fixture that needs it). (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code --- tests/conftest.py | 118 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 99 insertions(+), 19 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 7e246db2..aadb1fb6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,6 +7,7 @@ import sys import subprocess import os import signal +import socket import platform import time from pathlib import Path @@ -244,6 +245,86 @@ def sig_prog( assert ret +def _wait_for_daemon_ready( + reg_addr: tuple, + tpt_proto: str, + *, + deadline: float = 10.0, + poll_interval: float = 0.05, + proc: subprocess.Popen|None = None, +) -> None: + ''' + Active-poll the daemon's bind address until it + accepts a connection (proving it has called + `bind() + listen()` and is ready to handle IPC). + + Replaces the historical blind `time.sleep()` in the + `daemon` fixture which was racy under load — see + `ai/conc-anal/test_register_duplicate_name_daemon_connect_race_issue.md`. + + Uses stdlib `socket` directly (no trio runtime + bootstrap cost) — sufficient because + `tractor.run_daemon()` doesn't return from + bootstrap until the runtime is fully ready to + accept IPC. + + Raises `TimeoutError` on `deadline` exceeded. If + `proc` is given, ALSO raises early if the daemon + process exits non-zero before the deadline (catches + daemon-startup-crash that the blind sleep used to + silently mask). + + ''' + end: float = time.monotonic() + deadline + last_exc: Exception|None = None + while time.monotonic() < end: + # Daemon-died-during-startup early-exit. Without + # this, a crashed-on-import daemon would just + # eat the full deadline before raising opaque + # TimeoutError. + if proc is not None and proc.poll() is not None: + raise RuntimeError( + f'Daemon proc exited (rc={proc.returncode}) ' + f'before becoming ready to accept on ' + f'{reg_addr!r}' + ) + try: + if tpt_proto == 'tcp': + # `socket.create_connection` does the + # `socket() + connect()` dance with a + # builtin timeout — perfect primitive + # for a one-shot probe. + with socket.create_connection( + reg_addr, + timeout=poll_interval, + ): + return + else: + # UDS — `reg_addr` is a `(filedir, sockname)` + # tuple per `tractor.ipc._uds.UDSAddress.unwrap`. + sockpath: str = os.path.join(*reg_addr) + sock = socket.socket(socket.AF_UNIX) + try: + sock.settimeout(poll_interval) + sock.connect(sockpath) + return + finally: + sock.close() + except ( + ConnectionRefusedError, + FileNotFoundError, + OSError, + socket.timeout, + ) as exc: + last_exc = exc + time.sleep(poll_interval) + raise TimeoutError( + f'Daemon never accepted on {reg_addr!r} within ' + f'{deadline}s (last connect-attempt exc: ' + f'{last_exc!r})' + ) + + # TODO: factor into @cm and move to `._testing`? @pytest.fixture def daemon( @@ -298,26 +379,25 @@ def daemon( **kwargs, ) - # TODO! we should poll for the registry socket-bind to take place - # and only once that's done yield to the requester! - # -[ ] TCP: use the `._root.open_root_actor()`::`ping_tpt_socket()` - # closure! - # -[ ] UDS: can we do something similar for 'pinging" the - # file-socket? + # Active-poll the daemon's bind address until it's + # ready to accept connections — replaces the legacy + # blind `time.sleep(_PROC_SPAWN_WAIT + uds_bonus)` + # which was racy under load (see + # `ai/conc-anal/test_register_duplicate_name_daemon_connect_race_issue.md`). # - bg_daemon_spawn_delay: float = _PROC_SPAWN_WAIT - # UDS sockets are **really** fast to bind()/listen()/connect() - # so it's often required that we delay a bit more starting - # the first actor-tree.. - if tpt_proto == 'uds': - bg_daemon_spawn_delay += 1.6 - - if _non_linux and ci_env: - bg_daemon_spawn_delay += 1 - - # XXX, allow time for the sub-py-proc to boot up. - # !TODO, see ping-polling ideas above! - time.sleep(bg_daemon_spawn_delay) + # Per-test deadline scales with platform: macOS/CI + # gets extra headroom; Linux dev boxes need very + # little. + deadline: float = ( + 15.0 if (_non_linux and ci_env) + else 10.0 + ) + _wait_for_daemon_ready( + reg_addr=reg_addr, + tpt_proto=tpt_proto, + deadline=deadline, + proc=proc, + ) assert not proc.returncode yield proc