From 54561959e6cd69876f31a573642777b5df9d4ff7 Mon Sep 17 00:00:00 2001 From: goodboy Date: Mon, 27 Apr 2026 15:57:55 -0400 Subject: [PATCH] Log subint bootstrap excs + cancel-leak state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two diagnostic gaps in `tractor.spawn._subint.subint_proc()` that hid otherwise-silent failures, plus tracking-issue links on the two open `subint_forkserver` follow-ups. Deats, - bootstrap-exc visibility: wrap the call to `_interpreters.exec(interp_id, bootstrap)` with `try/except BaseException` + `log.exception(...)`. * Without it, an `ImportError` / `SyntaxError` raised inside the dedicated driver thread goes only to Python's default thread excepthook — invisible to the parent, which then waits forever on `subint_exited.wait()`. * `?TODO` notes `anyio`'s `to_interpreter._interp_call` + `(retval, is_exception)` pattern as the next step for re-raising; skipped now bc it must coordinate with the `trio.Cancelled` paths around the existing `.wait()` calls. - cancel-leak disambiguation: when the driver thread doesn't exit within `_HARD_KILL_TIMEOUT`, also log `_interpreters.is_running(interp_id)` as `subint_still_running=...` so the operator can tell "thread leaked, subint already done" apart from "thread alive bc subint is wedged". * pattern borrowed from `trio-parallel`'s `_sint.SintWorker.is_alive()`. - `?TODO` near the `bootstrap` literal: future switch to `_interpreters.set___main___attrs()` — same API `anyio` uses in `to_interpreter._Worker.call()` — for passing non-`repr()`-roundtrippable values (`SpawnSpec` struct, callables, etc). * add cross-refs tracking issue `#379`. Also, - `Tracked at: [#449]` link on `subint_forkserver_test_cancellation_leak_issue.md`. - `Tracked at: [#450]` link on `subint_forkserver_thread_constraints_on_pep684_issue.md`. (this commit msg was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code --- ...forkserver_test_cancellation_leak_issue.md | 2 + ...rver_thread_constraints_on_pep684_issue.md | 2 + tractor/spawn/_subint.py | 41 +++++++++++++++++++ 3 files changed, 45 insertions(+) diff --git a/ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md b/ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md index 8cf22119..a685f14f 100644 --- a/ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md +++ b/ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md @@ -1,5 +1,7 @@ # `subint_forkserver` backend: `test_cancellation.py` multi-level cancel cascade hang +> **Tracked at:** [#449](https://github.com/goodboy/tractor/issues/449) + Follow-up tracker: surfaced while wiring the new `subint_forkserver` spawn backend into the full tractor test matrix (step 2 of the post-backend-lands plan). diff --git a/ai/conc-anal/subint_forkserver_thread_constraints_on_pep684_issue.md b/ai/conc-anal/subint_forkserver_thread_constraints_on_pep684_issue.md index a1042451..b3c4563d 100644 --- a/ai/conc-anal/subint_forkserver_thread_constraints_on_pep684_issue.md +++ b/ai/conc-anal/subint_forkserver_thread_constraints_on_pep684_issue.md @@ -1,5 +1,7 @@ # Revisit `subint_forkserver` thread-cache constraints once msgspec PEP 684 support lands +> **Tracked at:** [#450](https://github.com/goodboy/tractor/issues/450) + Follow-up tracker for cleanup work gated on the msgspec PEP 684 adoption upstream ([jcrist/msgspec#563](https://github.com/jcrist/msgspec/issues/563)). diff --git a/tractor/spawn/_subint.py b/tractor/spawn/_subint.py index de0325b4..d3e4e38f 100644 --- a/tractor/spawn/_subint.py +++ b/tractor/spawn/_subint.py @@ -201,6 +201,15 @@ async def subint_proc( # `parent_addr` (`tuple[str, int|str]` — see `UnwrappedAddress`) # and `infect_asyncio` (`bool`) `repr()` to valid Python # literals, so we can embed them directly. + # + # ?TODO, future SpawnSpec enrichment: if we ever want to pass + # non-`repr()`-roundtrippable values (a pre-built `SpawnSpec` + # struct, a credential token, a callable) we should switch to + # `_interpreters.set___main___attrs(interp_id, {...})` — the + # API anyio uses in `to_interpreter._Worker.call()`. Pattern: + # https://github.com/agronholm/anyio/blob/master/src/anyio/to_interpreter.py + # (the `set___main___attrs` site). See also tracking issue + # `#379`. bootstrap: str = ( 'from tractor._child import _actor_child_main\n' '_actor_child_main(\n' @@ -237,6 +246,30 @@ async def subint_proc( ''' try: _interpreters.exec(interp_id, bootstrap) + # XXX without this catch, a hard subint-bootstrap + # failure (e.g. `ImportError` because the actor module + # isn't importable inside the subint, or a syntax error + # in the bootstrap str) goes only to Python's default + # thread-excepthook and is INVISIBLE to the parent + # task. At minimum, log via `log.exception` so the + # operator sees what failed. + # ?TODO, surface the captured exc to the parent task + # via a `nonlocal err` slot inspected after + # `subint_exited.wait()` — see anyio's + # `to_interpreter._interp_call` `(retval, is_exception)` + # tuple pattern + `_subint_forkserver.py:480-494`'s + # equivalent which already does this. Skipped here for + # now: re-raise from the parent must coordinate with + # the existing `trio.Cancelled` paths around the + # `subint_exited.wait()` calls (lines 327, 362). + # NOTE: this whole dedicated-thread machinery may go + # away under #450 (PEP 684 isolated mode), in which + # case `trio.to_thread.run_sync(Interpreter.exec, ...)` + # would handle exception propagation natively. + except BaseException: + log.exception( + f'subint bootstrap raised — interp_id={interp_id}' + ) finally: try: trio.from_thread.run_sync( @@ -397,11 +430,19 @@ async def subint_proc( abandon_on_cancel=True, ) if cs.cancelled_caught: + # Disambiguate "thread leaked but subint already + # done" from "thread alive because subint is + # genuinely wedged" — pattern borrowed from + # trio-parallel's `_sint.SintWorker.is_alive()`. + still_running: bool = _interpreters.is_running( + interp_id, + ) log.warning( f'Subint driver thread did not exit within ' f'{_HARD_KILL_TIMEOUT}s — abandoning.\n' f' |_interp_id={interp_id}\n' f' |_thread={driver_thread.name}\n' + f' |_subint_still_running={still_running}\n' f'(This usually means portal-cancel could ' f'not be delivered — e.g., IPC channel was ' f'already broken. The subint will continue '