From fdac157d3d74f2e641468289f522e8b95470f3e6 Mon Sep 17 00:00:00 2001 From: goodboy Date: Wed, 17 Jun 2026 19:46:04 -0400 Subject: [PATCH] Harden cancel-ack hard-kill escalation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two defensive fixes around the `Portal.cancel_actor()` + `_try_cancel_then_kill()` escalation from `34f333a0` "Escalate cancel-ack timeouts to `proc.terminate()`" (the `trionics.start_or_cancel` follow-up); surfaced by `/code-review high` on #462, - guard `proc.terminate()` for backends whose `proc` slot isn't a `Process` — the future `subint` backend stores an `int` interp-id, so escalation would `AttributeError` instead of hard-killing; now it logs + no-ops. - swap `assert cs.cancelled_caught` for an `if cs.cancelled_caught and raise_on_timeout:` guard so an unexpected shielded-scope exit returns a soft `False` rather than crashing `cancel_actor()` mid-teardown. (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code --- tractor/runtime/_portal.py | 12 ++++++++++-- tractor/runtime/_supervise.py | 15 ++++++++++++++- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/tractor/runtime/_portal.py b/tractor/runtime/_portal.py index 5ef82659..e0577d69 100644 --- a/tractor/runtime/_portal.py +++ b/tractor/runtime/_portal.py @@ -334,8 +334,11 @@ class Portal: # `move_on_after` fired — peer didn't ack within # bounded window. Behaviour depends on # `raise_on_timeout`: - assert cs.cancelled_caught - if raise_on_timeout: + if ( + cs.cancelled_caught + and + raise_on_timeout + ): raise ActorTooSlowError( f'Peer {peer_id} did not ack `Actor.cancel()`' f'-RPC within bounded wait of ' @@ -344,6 +347,11 @@ class Portal: # legacy fire-and-forget path: log + return False so # the caller can decide whether to escalate. + # + # NOTE, we also land here in the (unexpected) case where + # the shielded `move_on_after` block exits WITHOUT + # `return True` and WITHOUT the deadline firing — prefer + # a soft `False` over an `assert`-crash mid-teardown. log.debug( f'May have failed to cancel peer?\n' f'\n' diff --git a/tractor/runtime/_supervise.py b/tractor/runtime/_supervise.py index 5e66fa8f..d488ca0c 100644 --- a/tractor/runtime/_supervise.py +++ b/tractor/runtime/_supervise.py @@ -151,7 +151,20 @@ async def _try_cancel_then_kill( f' reason: {too_slow}\n' f'-> escalating to `proc.terminate()` (hard-kill)\n' ) - proc.terminate() + # XXX, the `subint` backend stores an `int` interp-id in the + # `proc` slot (not a `Process`), so it has no `.terminate()`. + # Guard here so a cancel-ack timeout doesn't `AttributeError` + # once that backend lands; its hard-kill path is a TODO. + if hasattr(proc, 'terminate'): + proc.terminate() + else: + log.error( + f'Cannot hard-kill sub-actor — backend proc-handle ' + f'{proc!r} ({type(proc).__name__!r}) has no ' + f'`.terminate()`!\n' + f' uid: {subactor.aid.reprol()!r}\n' + f'TODO: per-backend cancel-escalation.\n' + ) class ActorNursery: