From 34f333a0266d34d666296d311cc63072c8521423 Mon Sep 17 00:00:00 2001
From: goodboy
Date: Thu, 7 May 2026 18:01:59 -0400
Subject: [PATCH] Escalate cancel-ack timeouts to `proc.terminate()`

Wires SC-discipline cancel-then-escalate into `ActorNursery.cancel()`:

  graceful cancel-req -> bounded wait -> hard-kill

Deats,
- add `raise_on_timeout: bool = False` kwarg to `Portal.cancel_actor()`.
  When `True`, bounded-wait expiry raises `ActorTooSlowError` instead
  of the legacy DEBUG-log + return-`False` path. Default stays `False`
  for callers that handle their own escalation (e.g. `_spawn.soft_kill()`
  polling `proc.poll()`).
- add `_try_cancel_then_kill()` helper in `_supervise` used by per-child
  cancel tasks. On `ActorTooSlowError`, escalates via `proc.terminate()`
  (SIGTERM) so a non-acking sub doesn't park `soft_kill()` forever
  waiting on `proc.poll()`.
- replace `tn.start_soon(portal.cancel_actor)` in `ActorNursery.cancel()`
  with the helper.

Debug-mode bypass:
-----------------
skip escalation (fall back to legacy fire-and-forget cancel) when ANY of:
- `Lock.ctx_in_debug is not None` (some actor is currently REPL-locked)
- `_runtime_vars['_debug_mode']` (root opened with `debug_mode=True`).
- `ActorNursery._at_least_one_child_in_debug` (per-child `debug_mode=`
  opt-in).

ORing covers root-debug, child-debug, and active-REPL-lock cases
without false-positively SIGTERM-ing a sub-tree proxying stdio for a
REPL session.

Motivated by the `subint_forkserver` dup-name hang where a same-named
sibling subactor's cancel-RPC failed to ack within
`Portal.cancel_timeout` (TCP+forkserver register-RPC contention) and
the nursery `__aexit__` deadlocked.

(this patch was generated in some part by [`claude-code`][claude-code-gh])

[claude-code-gh]: https://github.com/anthropics/claude-code
---
 tractor/runtime/_portal.py    |  48 +++++++++++----
 tractor/runtime/_supervise.py | 110 ++++++++++++++++++++++++++++++++--
 2 files changed, 141 insertions(+), 17 deletions(-)

diff --git a/tractor/runtime/_portal.py b/tractor/runtime/_portal.py
index 8b0948c8..5ef82659 100644
--- a/tractor/runtime/_portal.py
+++ b/tractor/runtime/_portal.py
@@ -55,6 +55,7 @@ from ..msg import (
     Return,
 )
 from .._exceptions import (
+    ActorTooSlowError,
     NoResult,
     TransportClosed,
 )
@@ -268,6 +269,7 @@ class Portal:
     async def cancel_actor(
         self,
         timeout: float | None = None,
+        raise_on_timeout: bool = False,
 
     ) -> bool:
         '''
@@ -281,6 +283,17 @@
         `._context.Context.cancel()` which CAN be used for this
         purpose.
 
+        `raise_on_timeout` (default `False`):
+
+        - `False` (legacy): on bounded-wait expiry, log at DEBUG
+          and return `False`. Used by callers that issue cancel
+          fire-and-forget and have their own escalation
+          (e.g. `_spawn.soft_kill()` checks `proc.poll()` after).
+        - `True`: on bounded-wait expiry, raise `ActorTooSlowError`
+          so the caller MUST handle the failure explicitly.
+          `ActorNursery.cancel()` opts in so it can escalate via
+          `proc.terminate()` per SC-discipline.
+
         '''
         __runtimeframe__: int = 1  # noqa
@@ -301,15 +314,16 @@
         # XXX the one spot we set it?
         chan._cancel_called: bool = True
+        cancel_timeout: float = (
+            timeout
+            or
+            self.cancel_timeout
+        )
         try:
             # send cancel cmd - might not get response
             # XXX: sure would be nice to make this work with
             # a proper shield
-            with trio.move_on_after(
-                timeout
-                or
-                self.cancel_timeout
-            ) as cs:
+            with trio.move_on_after(cancel_timeout) as cs:
                 cs.shield: bool = True
                 await self.run_from_ns(
                     'self',
                 )
                 return True
 
-            if cs.cancelled_caught:
-                # may timeout and we never get an ack (obvi racy)
-                # but that doesn't mean it wasn't cancelled.
-                log.debug(
-                    f'May have failed to cancel peer?\n'
-                    f'\n'
-                    f'c)=?> {peer_id}\n'
+            # `move_on_after` fired — peer didn't ack within
+            # bounded window. Behaviour depends on
+            # `raise_on_timeout`:
+            assert cs.cancelled_caught
+            if raise_on_timeout:
+                raise ActorTooSlowError(
+                    f'Peer {peer_id} did not ack `Actor.cancel()`'
+                    f'-RPC within bounded wait of '
+                    f'{cancel_timeout!r}s'
                 )
 
-            # if we get here some weird cancellation case happened
+            # legacy fire-and-forget path: log + return False so
+            # the caller can decide whether to escalate.
+            log.debug(
+                f'May have failed to cancel peer?\n'
+                f'\n'
+                f'c)=?> {peer_id}\n'
+            )
             return False
 
         except TransportClosed as tpt_err:
diff --git a/tractor/runtime/_supervise.py b/tractor/runtime/_supervise.py
index 6d2d573f..5e66fa8f 100644
--- a/tractor/runtime/_supervise.py
+++ b/tractor/runtime/_supervise.py
@@ -38,8 +38,14 @@ from ..discovery._addr import (
     UnwrappedAddress,
     mk_uuid,
 )
-from ._state import current_actor, is_main_process
-from ..log import get_logger, get_loglevel
+from ._state import (
+    current_actor,
+    is_main_process,
+)
+from ..log import (
+    get_logger,
+    get_loglevel,
+)
 from ._runtime import Actor
 from ._portal import Portal
 from ..trionics import (
@@ -47,6 +53,7 @@
     collapse_eg,
 )
 from .._exceptions import (
+    ActorTooSlowError,
     ContextCancelled,
 )
 from .._root import (
@@ -60,11 +67,93 @@ if TYPE_CHECKING:
     import multiprocessing as mp
     # from ..ipc._server import IPCServer
     from ..ipc import IPCServer
+    from ..spawn._spawn import ProcessType
 
 
 log = get_logger()
 
 
+async def _try_cancel_then_kill(
+    portal: Portal,
+    # `ProcessType` is `TYPE_CHECKING`-only (defined under that
+    # guard in `..spawn._spawn`) so we stringify here to avoid
+    # eager runtime eval of the annotation at function-def time
+    # (this module has no `from __future__ import annotations`).
+    proc: 'ProcessType',
+    subactor: Actor,
+    debug_mode_active: bool = False,
+) -> None:
+    '''
+    Per-child cancel-then-escalate helper used by
+    `ActorNursery.cancel()`.
+
+    Sends a graceful actor-runtime cancel-RPC via
+    `Portal.cancel_actor(raise_on_timeout=True)`. If the bounded-wait
+    expires before the peer ack's, `ActorTooSlowError` is raised and
+    we escalate via `proc.terminate()` (SIGTERM) per SC-discipline:
+
+      graceful cancel-req -> bounded wait -> hard-kill
+
+    Without this escalation, a same-name sibling subactor whose
+    cancel-RPC failed to ack within `Portal.cancel_timeout` (e.g.
+    under TCP+forkserver register-RPC contention) would park the
+    parent's `soft_kill()` watcher forever waiting on `proc.poll()`,
+    deadlocking nursery `__aexit__`. See `ActorTooSlowError` for
+    the wider write-up.
+
+    '''
+    # XXX, do NOT escalate to `proc.terminate()` while ANY of
+    # the following are true — SIGTERM-ing a sub would tear
+    # down its sub-tree including any descendant proxying
+    # stdio to/from a REPL-locked actor, clobbering the user's
+    # debug session:
+    #
+    # - `Lock.ctx_in_debug is not None`: most precise — some
+    #   actor in the tree is currently REPL-locked. Set in the
+    #   root actor for the lifetime of the lock. Raceable
+    #   (false negative if SIGINT arrives before lock-acquire
+    #   RPC completes).
+    #
+    # - `_runtime_vars['_debug_mode']`: root-actor was opened
+    #   with `debug_mode=True` (via `open_root_actor` /
+    #   `open_nursery`). Set once at root boot, never cleared.
+    #   Catches deep-descendant REPL sessions even when the
+    #   intermediate nurseries didn't pass `debug_mode=` per-
+    #   child.
+    #
+    # - `debug_mode_active`: this nursery has at least one
+    #   child started with an explicit `debug_mode=` arg
+    #   (`ActorNursery._at_least_one_child_in_debug`). Catches
+    #   the case where root is NOT in debug-mode but a
+    #   nursery-direct child opted in.
+    #
+    # Independent because root may NOT be in debug-mode even
+    # when a child is (only the child's `_runtime_vars` is
+    # mutated by per-child `debug_mode=True`). ORing covers
+    # every flavor without false-positively skipping
+    # legitimate hard-kill paths in non-debug trees.
+    if (
+        debug.Lock.ctx_in_debug is not None
+        or
+        _state._runtime_vars.get('_debug_mode', False)
+        or
+        debug_mode_active
+    ):
+        await portal.cancel_actor()
+        return
+
+    try:
+        await portal.cancel_actor(raise_on_timeout=True)
+    except ActorTooSlowError as too_slow:
+        log.error(
+            f'Cancel-ack TIMED OUT for sub-actor\n'
+            f' uid: {subactor.aid.reprol()!r}\n'
+            f' reason: {too_slow}\n'
+            f'-> escalating to `proc.terminate()` (hard-kill)\n'
+        )
+        proc.terminate()
+
+
 class ActorNursery:
     '''
     The fundamental actor supervision construct: spawn and manage
@@ -428,10 +517,23 @@
                             else:
                                 # there's no other choice left
                                 proc.terminate()
                 # spawn per-child cancel tasks; the helper
-                # spawn cancel tasks for each sub-actor
+                # escalates to hard-kill on
+                # `ActorTooSlowError` rather than silently
+                # swallowing the cancel-ack timeout, EXCEPT
+                # when this nursery has any debug-eligible
+                # child (in which case we keep legacy
+                # fire-and-forget semantics to avoid
+                # clobbering an active REPL).
                 assert portal
                 if portal.channel.connected():
-                    tn.start_soon(portal.cancel_actor)
+                    tn.start_soon(
+                        _try_cancel_then_kill,
+                        portal,
+                        proc,
+                        subactor,
+                        self._at_least_one_child_in_debug,
+                    )
 
         log.cancel(msg)

        # if we cancelled the cancel (we hung cancelling remote actors)
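
Standalone sketch of the escalation pattern:
--------------------------------------------
The cancel-then-escalate discipline wired in above (graceful
cancel-req -> bounded wait -> hard-kill) can be illustrated outside
of tractor entirely. Below is a minimal, hypothetical `trio` +
`subprocess` sketch: the SIGINT stand-in for the cancel-RPC, the
`proc.poll()` "ack" check, the 1s bounded wait and the POSIX-only
signalling are all assumptions made for illustration, not tractor
APIs.

    # cancel_then_kill_sketch.py
    import signal
    import subprocess
    import sys

    import trio


    def spawn_child() -> subprocess.Popen:
        # stand-in for a spawned sub-actor: a child which ignores
        # SIGINT so the "graceful" request below is never ack'd.
        return subprocess.Popen([
            sys.executable, '-c',
            'import signal, time; '
            'signal.signal(signal.SIGINT, signal.SIG_IGN); '
            'time.sleep(30)',
        ])


    async def cancel_then_kill(
        proc: subprocess.Popen,
        ack_timeout: float = 1.0,
    ) -> None:
        # graceful cancel-req: SIGINT here; in tractor it's the
        # `Actor.cancel()` RPC sent via `Portal.cancel_actor()`.
        proc.send_signal(signal.SIGINT)

        # bounded wait for the "ack" (child exit in this sketch).
        with trio.move_on_after(ack_timeout) as cs:
            while proc.poll() is None:
                await trio.sleep(0.1)

        if cs.cancelled_caught:
            # no ack within the bounded window -> escalate to a
            # hard-kill, mirroring the `ActorTooSlowError` ->
            # `proc.terminate()` path added above.
            proc.terminate()
            # blocking wait kept for brevity in this sketch
            proc.wait()


    async def main() -> None:
        proc = spawn_child()
        await cancel_then_kill(proc)
        print(f'child exited with {proc.returncode}')


    if __name__ == '__main__':
        trio.run(main)

Escalating with SIGTERM (`proc.terminate()`) rather than SIGKILL
still gives the child a chance to run its own teardown handlers,
while the bounded wait keeps the parent's nursery `__aexit__` from
hanging forever on a non-acking child.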