Support debug-lock blocking, use on no-more IPC
This is a lingering debugger locking race case we needed to handle: - child crashes acquires TTY lock in root and attaches to `pdb` - child IPC goes down such that all channels to the root are broken / non-functional. - root is stuck thinking the child is still in debug even though it can't be contacted and the child actor machinery hasn't been cancelled by its parent. - root get's stuck in deadlock with child since it won't send a cancel request until the child is finished debugging, but the child can't unlock the debugger bc IPC is down. To avoid this scenario add debug lock blocking list via `._debug.Lock._blocked: set[tuple]` which holds actor uids for any actor that is detected by the root as having no transport channel connections with said root (of which at least one should exist if this sub-actor at some point acquired the debug lock). The root consequently checks this list for any actor that tries to (re)acquire the lock and blocks with a `ContextCancelled`. When a debug condition is tested in `._runtime._invoke` the context's `._enter_debugger_on_cancel` which is set to `False` if the actor is on the block list in which case the post-mortem entry is skipped. Further this adds a root-locking-task side cancel scope to `Lock._root_local_task_cs_in_debug` which can be cancelled by the root runtime when a stale lock is detected after all IPC channels for the actor have been torn down. NOTE: right now we're NOT doing this since it seems to cause test failures likely due because it may cause pre-mature cancellation and maybe needs a bit more experimenting?debug_lock_blocking
parent
734d8dd663
commit
fb721f36ef
|
@ -38,8 +38,14 @@ from trio_typing import TaskStatus
|
||||||
|
|
||||||
from .log import get_logger
|
from .log import get_logger
|
||||||
from ._discovery import get_root
|
from ._discovery import get_root
|
||||||
from ._state import is_root_process, debug_mode
|
from ._state import (
|
||||||
from ._exceptions import is_multi_cancelled
|
is_root_process,
|
||||||
|
debug_mode,
|
||||||
|
)
|
||||||
|
from ._exceptions import (
|
||||||
|
is_multi_cancelled,
|
||||||
|
ContextCancelled,
|
||||||
|
)
|
||||||
from ._ipc import Channel
|
from ._ipc import Channel
|
||||||
|
|
||||||
|
|
||||||
|
@ -72,6 +78,18 @@ class Lock:
|
||||||
# actor-wide variable pointing to current task name using debugger
|
# actor-wide variable pointing to current task name using debugger
|
||||||
local_task_in_debug: Optional[str] = None
|
local_task_in_debug: Optional[str] = None
|
||||||
|
|
||||||
|
# NOTE: set by the current task waiting on the root tty lock from
|
||||||
|
# the CALLER side of the `lock_tty_for_child()` context entry-call
|
||||||
|
# and must be cancelled if this actor is cancelled via IPC
|
||||||
|
# request-message otherwise deadlocks with the parent actor may
|
||||||
|
# ensure
|
||||||
|
_debugger_request_cs: Optional[trio.CancelScope] = None
|
||||||
|
|
||||||
|
# NOTE: set only in the root actor for the **local** root spawned task
|
||||||
|
# which has acquired the lock (i.e. this is on the callee side of
|
||||||
|
# the `lock_tty_for_child()` context entry).
|
||||||
|
_root_local_task_cs_in_debug: Optional[trio.CancelScope] = None
|
||||||
|
|
||||||
# actor tree-wide actor uid that supposedly has the tty lock
|
# actor tree-wide actor uid that supposedly has the tty lock
|
||||||
global_actor_in_debug: Optional[tuple[str, str]] = None
|
global_actor_in_debug: Optional[tuple[str, str]] = None
|
||||||
|
|
||||||
|
@ -81,12 +99,8 @@ class Lock:
|
||||||
# lock in root actor preventing multi-access to local tty
|
# lock in root actor preventing multi-access to local tty
|
||||||
_debug_lock: trio.StrictFIFOLock = trio.StrictFIFOLock()
|
_debug_lock: trio.StrictFIFOLock = trio.StrictFIFOLock()
|
||||||
|
|
||||||
# XXX: set by the current task waiting on the root tty lock
|
|
||||||
# and must be cancelled if this actor is cancelled via message
|
|
||||||
# otherwise deadlocks with the parent actor may ensure
|
|
||||||
_debugger_request_cs: Optional[trio.CancelScope] = None
|
|
||||||
|
|
||||||
_orig_sigint_handler: Optional[Callable] = None
|
_orig_sigint_handler: Optional[Callable] = None
|
||||||
|
_blocked: set[tuple[str, str]] = set()
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def shield_sigint(cls):
|
def shield_sigint(cls):
|
||||||
|
@ -196,6 +210,12 @@ async def _acquire_debug_lock_from_root_task(
|
||||||
f"entering lock checkpoint, remote task: {task_name}:{uid}"
|
f"entering lock checkpoint, remote task: {task_name}:{uid}"
|
||||||
)
|
)
|
||||||
we_acquired = True
|
we_acquired = True
|
||||||
|
|
||||||
|
# NOTE: if the surrounding cancel scope from the
|
||||||
|
# `lock_tty_for_child()` caller is cancelled, this line should
|
||||||
|
# unblock and NOT leave us in some kind of
|
||||||
|
# a "child-locked-TTY-but-child-is-uncontactable-over-IPC"
|
||||||
|
# condition.
|
||||||
await Lock._debug_lock.acquire()
|
await Lock._debug_lock.acquire()
|
||||||
|
|
||||||
if Lock.no_remote_has_tty is None:
|
if Lock.no_remote_has_tty is None:
|
||||||
|
@ -267,6 +287,15 @@ async def lock_tty_for_child(
|
||||||
'''
|
'''
|
||||||
task_name = trio.lowlevel.current_task().name
|
task_name = trio.lowlevel.current_task().name
|
||||||
|
|
||||||
|
if tuple(subactor_uid) in Lock._blocked:
|
||||||
|
log.warning(
|
||||||
|
f'Actor {subactor_uid} is blocked from acquiring debug lock\n'
|
||||||
|
f"remote task: {task_name}:{subactor_uid}"
|
||||||
|
)
|
||||||
|
ctx._enter_debugger_on_cancel = False
|
||||||
|
await ctx.cancel(f'Debug lock blocked for {subactor_uid}')
|
||||||
|
return 'pdb_lock_blocked'
|
||||||
|
|
||||||
# TODO: when we get to true remote debugging
|
# TODO: when we get to true remote debugging
|
||||||
# this will deliver stdin data?
|
# this will deliver stdin data?
|
||||||
|
|
||||||
|
@ -280,8 +309,9 @@ async def lock_tty_for_child(
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with (
|
with (
|
||||||
trio.CancelScope(shield=True),
|
trio.CancelScope(shield=True) as debug_lock_cs,
|
||||||
):
|
):
|
||||||
|
Lock._root_local_task_cs_in_debug = debug_lock_cs
|
||||||
async with _acquire_debug_lock_from_root_task(subactor_uid):
|
async with _acquire_debug_lock_from_root_task(subactor_uid):
|
||||||
|
|
||||||
# indicate to child that we've locked stdio
|
# indicate to child that we've locked stdio
|
||||||
|
@ -297,6 +327,7 @@ async def lock_tty_for_child(
|
||||||
return "pdb_unlock_complete"
|
return "pdb_unlock_complete"
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
|
Lock._root_local_task_cs_in_debug = None
|
||||||
Lock.unshield_sigint()
|
Lock.unshield_sigint()
|
||||||
|
|
||||||
|
|
||||||
|
@ -353,7 +384,7 @@ async def wait_for_parent_stdin_hijack(
|
||||||
|
|
||||||
log.pdb('unlocked context')
|
log.pdb('unlocked context')
|
||||||
|
|
||||||
except tractor.ContextCancelled:
|
except ContextCancelled:
|
||||||
log.warning('Root actor cancelled debug lock')
|
log.warning('Root actor cancelled debug lock')
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
|
@ -721,7 +752,9 @@ async def _maybe_enter_pm(err):
|
||||||
and not is_multi_cancelled(err)
|
and not is_multi_cancelled(err)
|
||||||
):
|
):
|
||||||
log.debug("Actor crashed, entering debug mode")
|
log.debug("Actor crashed, entering debug mode")
|
||||||
|
try:
|
||||||
await post_mortem()
|
await post_mortem()
|
||||||
|
finally:
|
||||||
Lock.release()
|
Lock.release()
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
|
@ -234,6 +234,9 @@ async def _invoke(
|
||||||
f'{ctx.chan.uid}'
|
f'{ctx.chan.uid}'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if ctx._cancel_msg:
|
||||||
|
msg += f' with msg:\n{ctx._cancel_msg}'
|
||||||
|
|
||||||
# task-contex was cancelled so relay to the cancel to caller
|
# task-contex was cancelled so relay to the cancel to caller
|
||||||
raise ContextCancelled(
|
raise ContextCancelled(
|
||||||
msg,
|
msg,
|
||||||
|
@ -275,8 +278,16 @@ async def _invoke(
|
||||||
# if not is_multi_cancelled(err) and (
|
# if not is_multi_cancelled(err) and (
|
||||||
|
|
||||||
entered_debug: bool = False
|
entered_debug: bool = False
|
||||||
if not isinstance(err, ContextCancelled) or (
|
if (
|
||||||
isinstance(err, ContextCancelled) and ctx._cancel_called
|
not isinstance(err, ContextCancelled)
|
||||||
|
or (
|
||||||
|
isinstance(err, ContextCancelled)
|
||||||
|
and ctx._cancel_called
|
||||||
|
|
||||||
|
# if the root blocks the debugger lock request from a child
|
||||||
|
# we will get a remote-cancelled condition.
|
||||||
|
and ctx._enter_debugger_on_cancel
|
||||||
|
)
|
||||||
):
|
):
|
||||||
# XXX: is there any case where we'll want to debug IPC
|
# XXX: is there any case where we'll want to debug IPC
|
||||||
# disconnects as a default?
|
# disconnects as a default?
|
||||||
|
@ -286,7 +297,6 @@ async def _invoke(
|
||||||
# recovery logic - the only case is some kind of strange bug
|
# recovery logic - the only case is some kind of strange bug
|
||||||
# in our transport layer itself? Going to keep this
|
# in our transport layer itself? Going to keep this
|
||||||
# open ended for now.
|
# open ended for now.
|
||||||
|
|
||||||
entered_debug = await _debug._maybe_enter_pm(err)
|
entered_debug = await _debug._maybe_enter_pm(err)
|
||||||
|
|
||||||
if not entered_debug:
|
if not entered_debug:
|
||||||
|
@ -702,6 +712,34 @@ class Actor:
|
||||||
# if chan.uid == uid:
|
# if chan.uid == uid:
|
||||||
# self._contexts.pop((uid, cid))
|
# self._contexts.pop((uid, cid))
|
||||||
|
|
||||||
|
# NOTE: block this actor from acquiring the
|
||||||
|
# debugger-TTY-lock since we have no way to know if we
|
||||||
|
# cancelled it and further there is no way to ensure the
|
||||||
|
# lock will be released if acquired due to having no
|
||||||
|
# more active IPC channels.
|
||||||
|
if (
|
||||||
|
_state.is_root_process()
|
||||||
|
):
|
||||||
|
pdb_lock = _debug.Lock
|
||||||
|
log.cancel(
|
||||||
|
f'{uid} is now blocked from debugger lock'
|
||||||
|
)
|
||||||
|
log.runtime(f"{uid} blocked from pdb locking")
|
||||||
|
pdb_lock._blocked.add(uid)
|
||||||
|
|
||||||
|
# if a now stale local task has the TTY lock still
|
||||||
|
# we cancel it to allow servicing other requests for
|
||||||
|
# the lock.
|
||||||
|
if (
|
||||||
|
pdb_lock._root_local_task_cs_in_debug
|
||||||
|
and not pdb_lock._root_local_task_cs_in_debug.cancel_called
|
||||||
|
):
|
||||||
|
log.warning(
|
||||||
|
f'STALE DEBUG LOCK DETECTED FOR {uid}'
|
||||||
|
)
|
||||||
|
# TODO: figure out why this breaks tests..
|
||||||
|
# pdb_lock._root_local_task_cs_in_debug.cancel()
|
||||||
|
|
||||||
log.runtime(f"Peers is {self._peers}")
|
log.runtime(f"Peers is {self._peers}")
|
||||||
|
|
||||||
# No more channels to other actors (at all) registered
|
# No more channels to other actors (at all) registered
|
||||||
|
|
|
@ -372,7 +372,7 @@ class Context:
|
||||||
# status flags
|
# status flags
|
||||||
_cancel_called: bool = False
|
_cancel_called: bool = False
|
||||||
_cancel_msg: Optional[str] = None
|
_cancel_msg: Optional[str] = None
|
||||||
_trigger_debugger_on_cancel: bool = True
|
_enter_debugger_on_cancel: bool = True
|
||||||
_started_called: bool = False
|
_started_called: bool = False
|
||||||
_started_received: bool = False
|
_started_received: bool = False
|
||||||
_stream_opened: bool = False
|
_stream_opened: bool = False
|
||||||
|
|
Loading…
Reference in New Issue