Merge pull request #337 from goodboy/debug_lock_blocking
Debug lock blocking
dun_unset_current_actor
commit 247d3448ae
@@ -0,0 +1,41 @@
Add support for debug-lock blocking using a ``._debug.Lock._blocked:
set[tuple]`` set, adding actor ids to it when no more IPC connections
with the root actor are detected.

This is an enhancement which (mostly) solves a lingering debugger
locking race case we needed to handle:

- child crashes, acquires the TTY lock in the root and attaches to ``pdb``.
- child IPC goes down such that all channels to the root are broken
  / non-functional.
- root is stuck thinking the child is still in debug even though it
  can't be contacted and the child actor machinery hasn't been
  cancelled by its parent.
- root gets stuck in a deadlock with the child since it won't send a cancel
  request until the child is finished debugging (to avoid clobbering
  a child that is actually using the debugger), but the child can't
  unlock the debugger because IPC is down and it can't contact the root.

To avoid this scenario, add a debug-lock blocking list via
``._debug.Lock._blocked: set[tuple]`` which holds actor uids for any actor
that is detected by the root as having no transport channel connections
(of which at least one should exist if this sub-actor at some point
acquired the debug lock). The root consequently checks this list for any
actor that tries to (re)acquire the lock and blocks it with
a ``ContextCancelled``. Further, when a debug condition is tested in
``._runtime._invoke``, the context's ``._enter_debugger_on_cancel`` is
set to ``False`` if the actor was put on the block list, and all
post-mortem / crash handling will then be bypassed for that task.
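
As a rough, self-contained sketch of the idea (the helper names
``record_channel_loss()`` and ``request_debug_lock()`` are illustrative
stand-ins only, not the real API; the actual change lives in
``lock_tty_for_child()`` and the runtime's channel-teardown path shown
in the diffs below)::

    import trio


    class Lock:
        '''
        Stand-in for ``tractor._debug.Lock`` (sketch only).

        '''
        # uids ``(name, uuid)`` the root has marked as having no IPC
        # channels left and thus blocked from (re)acquiring the TTY lock.
        _blocked: set[tuple[str, str]] = set()
        _debug_lock: trio.StrictFIFOLock = trio.StrictFIFOLock()


    def record_channel_loss(uid: tuple[str, str]) -> None:
        # root side: all channels to this peer are gone, so block it
        # since we can't tell it to release a lock it may be holding.
        Lock._blocked.add(uid)


    async def request_debug_lock(uid: tuple[str, str]) -> str:
        # root-side lock endpoint: refuse blocked peers up front; in the
        # real code this is where the requesting context is cancelled
        # with a ``ContextCancelled`` and ``._enter_debugger_on_cancel``
        # is flipped off for that task.
        if uid in Lock._blocked:
            return 'pdb_lock_blocked'
        async with Lock._debug_lock:
            return 'pdb_lock_granted'


    async def main() -> None:
        child = ('child', 'some-uuid')
        assert await request_debug_lock(child) == 'pdb_lock_granted'
        record_channel_loss(child)  # all IPC channels to ``child`` dropped
        assert await request_debug_lock(child) == 'pdb_lock_blocked'


    trio.run(main)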

In theory this approach to block-list management may cause problems
where some nested child actor acquires and releases the lock multiple
times and then gets stuck on the block list after the first use? If this
turns out to be an issue we can try changing the strategy so blocks are
only added when the root has zero IPC peers left?

Further, this adds a root-locking-task-side cancel scope,
``Lock._root_local_task_cs_in_debug``, which can be ``.cancel()``-ed by the
root runtime when a stale lock is detected during the IPC channel testing.
However, right now we're NOT using this since it seems to cause test
failures, likely due to premature cancellation, and it maybe needs a bit
more experimenting?
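
A minimal runnable sketch of that mechanism (``hold_lock_for_child()``
and ``cancel_stale_lock()`` are illustrative names, not the actual API;
the real hook points are ``lock_tty_for_child()`` and the root's
channel-teardown handler in the diff below), where the final
``.cancel()`` call is exactly what the change currently leaves
commented out::

    from typing import Optional

    import trio


    class Lock:
        # cancel scope of the root-local task currently holding the TTY lock
        _root_local_task_cs_in_debug: Optional[trio.CancelScope] = None


    async def hold_lock_for_child() -> None:
        # the root task locking the TTY for a child records its cancel
        # scope so the runtime can cancel it if the child goes missing.
        with trio.CancelScope(shield=True) as debug_lock_cs:
            Lock._root_local_task_cs_in_debug = debug_lock_cs
            try:
                await trio.sleep_forever()  # stand-in for the locked period
            finally:
                Lock._root_local_task_cs_in_debug = None


    def cancel_stale_lock() -> None:
        # called when the last IPC channel to the locking child drops;
        # cancelling the (shielded) scope directly still works since the
        # shield only blocks cancellation from *enclosing* scopes.
        cs = Lock._root_local_task_cs_in_debug
        if cs is not None and not cs.cancel_called:
            cs.cancel()


    async def main() -> None:
        async with trio.open_nursery() as n:
            n.start_soon(hold_lock_for_child)
            await trio.sleep(0.1)  # let the locking task start
            cancel_stale_lock()    # simulate stale-lock detection


    trio.run(main)
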
@@ -2,7 +2,7 @@
package = "tractor"
filename = "NEWS.rst"
directory = "nooz/"
version = "0.1.0a5"
version = "0.1.0a6"
title_format = "tractor {version} ({project_date})"
template = "nooz/_template.rst"
all_bullets = true

@@ -170,11 +170,11 @@ async def trio_ctx(
    # message.
    with trio.fail_after(2):
        async with (
            trio.open_nursery() as n,

            tractor.to_asyncio.open_channel_from(
                sleep_and_err,
            ) as (first, chan),

            trio.open_nursery() as n,
        ):

            assert first == 'start'
@@ -203,24 +203,25 @@ def test_context_spawns_aio_task_that_errors(
    '''
    async def main():

        async with tractor.open_nursery() as n:
            p = await n.start_actor(
                'aio_daemon',
                enable_modules=[__name__],
                infect_asyncio=True,
                # debug_mode=True,
                loglevel='cancel',
            )
            async with p.open_context(
                trio_ctx,
            ) as (ctx, first):
        with trio.fail_after(2):
            async with tractor.open_nursery() as n:
                p = await n.start_actor(
                    'aio_daemon',
                    enable_modules=[__name__],
                    infect_asyncio=True,
                    # debug_mode=True,
                    loglevel='cancel',
                )
                async with p.open_context(
                    trio_ctx,
                ) as (ctx, first):

                assert first == 'start'
                    assert first == 'start'

                if parent_cancels:
                    await p.cancel_actor()
                    if parent_cancels:
                        await p.cancel_actor()

                await trio.sleep_forever()
                    await trio.sleep_forever()

    with pytest.raises(RemoteActorError) as excinfo:
        trio.run(main)

@@ -38,8 +38,14 @@ from trio_typing import TaskStatus

from .log import get_logger
from ._discovery import get_root
from ._state import is_root_process, debug_mode
from ._exceptions import is_multi_cancelled
from ._state import (
    is_root_process,
    debug_mode,
)
from ._exceptions import (
    is_multi_cancelled,
    ContextCancelled,
)
from ._ipc import Channel


@@ -72,6 +78,18 @@ class Lock:
    # actor-wide variable pointing to current task name using debugger
    local_task_in_debug: Optional[str] = None

    # NOTE: set by the current task waiting on the root tty lock from
    # the CALLER side of the `lock_tty_for_child()` context entry-call
    # and must be cancelled if this actor is cancelled via IPC
    # request-message otherwise deadlocks with the parent actor may
    # ensue
    _debugger_request_cs: Optional[trio.CancelScope] = None

    # NOTE: set only in the root actor for the **local** root spawned task
    # which has acquired the lock (i.e. this is on the callee side of
    # the `lock_tty_for_child()` context entry).
    _root_local_task_cs_in_debug: Optional[trio.CancelScope] = None

    # actor tree-wide actor uid that supposedly has the tty lock
    global_actor_in_debug: Optional[tuple[str, str]] = None

@@ -81,12 +99,8 @@ class Lock:
    # lock in root actor preventing multi-access to local tty
    _debug_lock: trio.StrictFIFOLock = trio.StrictFIFOLock()

    # XXX: set by the current task waiting on the root tty lock
    # and must be cancelled if this actor is cancelled via message
    # otherwise deadlocks with the parent actor may ensure
    _debugger_request_cs: Optional[trio.CancelScope] = None

    _orig_sigint_handler: Optional[Callable] = None
    _blocked: set[tuple[str, str]] = set()

    @classmethod
    def shield_sigint(cls):
@@ -196,6 +210,12 @@ async def _acquire_debug_lock_from_root_task(
            f"entering lock checkpoint, remote task: {task_name}:{uid}"
        )
        we_acquired = True

        # NOTE: if the surrounding cancel scope from the
        # `lock_tty_for_child()` caller is cancelled, this line should
        # unblock and NOT leave us in some kind of
        # a "child-locked-TTY-but-child-is-uncontactable-over-IPC"
        # condition.
        await Lock._debug_lock.acquire()

        if Lock.no_remote_has_tty is None:
@@ -267,6 +287,15 @@ async def lock_tty_for_child(
    '''
    task_name = trio.lowlevel.current_task().name

    if tuple(subactor_uid) in Lock._blocked:
        log.warning(
            f'Actor {subactor_uid} is blocked from acquiring debug lock\n'
            f"remote task: {task_name}:{subactor_uid}"
        )
        ctx._enter_debugger_on_cancel = False
        await ctx.cancel(f'Debug lock blocked for {subactor_uid}')
        return 'pdb_lock_blocked'

    # TODO: when we get to true remote debugging
    # this will deliver stdin data?

@@ -280,8 +309,9 @@ async def lock_tty_for_child(

    try:
        with (
            trio.CancelScope(shield=True),
            trio.CancelScope(shield=True) as debug_lock_cs,
        ):
            Lock._root_local_task_cs_in_debug = debug_lock_cs
            async with _acquire_debug_lock_from_root_task(subactor_uid):

                # indicate to child that we've locked stdio
@@ -297,6 +327,7 @@ async def lock_tty_for_child(
        return "pdb_unlock_complete"

    finally:
        Lock._root_local_task_cs_in_debug = None
        Lock.unshield_sigint()

@@ -353,7 +384,7 @@ async def wait_for_parent_stdin_hijack(

                log.pdb('unlocked context')

        except tractor.ContextCancelled:
        except ContextCancelled:
            log.warning('Root actor cancelled debug lock')

        finally:
@@ -721,9 +752,11 @@ async def _maybe_enter_pm(err):
        and not is_multi_cancelled(err)
    ):
        log.debug("Actor crashed, entering debug mode")
        await post_mortem()
        Lock.release()
        return True
        try:
            await post_mortem()
        finally:
            Lock.release()
            return True

    else:
        return False

@@ -234,6 +234,9 @@ async def _invoke(
                        f'{ctx.chan.uid}'
                    )

                if ctx._cancel_msg:
                    msg += f' with msg:\n{ctx._cancel_msg}'

                # task-context was cancelled so relay the cancel to the caller
                raise ContextCancelled(
                    msg,
@@ -275,8 +278,16 @@ async def _invoke(
            # if not is_multi_cancelled(err) and (

            entered_debug: bool = False
            if not isinstance(err, ContextCancelled) or (
                isinstance(err, ContextCancelled) and ctx._cancel_called
            if (
                not isinstance(err, ContextCancelled)
                or (
                    isinstance(err, ContextCancelled)
                    and ctx._cancel_called

                    # if the root blocks the debugger lock request from a child
                    # we will get a remote-cancelled condition.
                    and ctx._enter_debugger_on_cancel
                )
            ):
                # XXX: is there any case where we'll want to debug IPC
                # disconnects as a default?
@@ -286,7 +297,6 @@ async def _invoke(
                # recovery logic - the only case is some kind of strange bug
                # in our transport layer itself? Going to keep this
                # open ended for now.

                entered_debug = await _debug._maybe_enter_pm(err)

                if not entered_debug:
@@ -698,16 +708,35 @@ class Actor:
                log.runtime(f"No more channels for {chan.uid}")
                self._peers.pop(uid, None)

                # for (uid, cid) in self._contexts.copy():
                #     if chan.uid == uid:
                #         self._contexts.pop((uid, cid))
                # NOTE: block this actor from acquiring the
                # debugger-TTY-lock since we have no way to know if we
                # cancelled it and further there is no way to ensure the
                # lock will be released if acquired due to having no
                # more active IPC channels.
                if _state.is_root_process():
                    pdb_lock = _debug.Lock
                    pdb_lock._blocked.add(uid)
                    log.runtime(f"{uid} blocked from pdb locking")

                    # if a now stale local task has the TTY lock still
                    # we cancel it to allow servicing other requests for
                    # the lock.
                    if (
                        pdb_lock._root_local_task_cs_in_debug
                        and not pdb_lock._root_local_task_cs_in_debug.cancel_called
                    ):
                        log.warning(
                            f'STALE DEBUG LOCK DETECTED FOR {uid}'
                        )
                        # TODO: figure out why this breaks tests..
                        # pdb_lock._root_local_task_cs_in_debug.cancel()

            log.runtime(f"Peers is {self._peers}")

            # No more channels to other actors (at all) registered
            # as connected.
            if not self._peers:
                log.runtime("Signalling no more peer channels")
                log.runtime("Signalling no more peer channel connections")
                self._no_more_peers.set()

            # XXX: is this necessary (GC should do it)?

@@ -371,6 +371,8 @@ class Context:

    # status flags
    _cancel_called: bool = False
    _cancel_msg: Optional[str] = None
    _enter_debugger_on_cancel: bool = True
    _started_called: bool = False
    _started_received: bool = False
    _stream_opened: bool = False
@@ -452,7 +454,11 @@ class Context:
                if not self._scope_nursery._closed:  # type: ignore
                    self._scope_nursery.start_soon(raiser)

    async def cancel(self) -> None:
    async def cancel(
        self,
        msg: Optional[str] = None,

    ) -> None:
        '''
        Cancel this inter-actor-task context.

@@ -461,6 +467,8 @@

        '''
        side = 'caller' if self._portal else 'callee'
        if msg:
            assert side == 'callee', 'Only callee side can provide cancel msg'

        log.cancel(f'Cancelling {side} side of context to {self.chan.uid}')

@@ -497,8 +505,10 @@
                    log.cancel(
                        "Timed out on cancelling remote task "
                        f"{cid} for {self._portal.channel.uid}")

        # callee side remote task
        else:
            # callee side remote task
            self._cancel_msg = msg

            # TODO: should we have an explicit cancel message
            # or is relaying the local ``trio.Cancelled`` as an

@@ -466,11 +466,11 @@ async def open_channel_from(
        ):
            # sync to a "started()"-like first delivered value from the
            # ``asyncio`` task.
            first = await chan.receive()

            # deliver stream handle upward
            try:
                with chan._trio_cs:
                    first = await chan.receive()

                    # deliver stream handle upward
                    yield first, chan
            finally:
                chan._trio_exited = True
@@ -491,16 +491,18 @@ def run_as_asyncio_guest(
    SC semantics.

    '''
    # Uh, oh. :o
    # Uh, oh.
    #
    # :o

    # It looks like your event loop has caught a case of the ``trio``s.

    # :()

    # Don't worry, we've heard you'll barely notice. You might hallucinate
    # a few more propagating errors and feel like your digestion has
    # slowed but if anything get's too bad your parents will know about
    # it.
    # Don't worry, we've heard you'll barely notice. You might
    # hallucinate a few more propagating errors and feel like your
    # digestion has slowed but if anything gets too bad your parents
    # will know about it.

    # :)