From 8c97f7bbb3e1cebbf11cad94e2bfbcc8b14c5d1b Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 23 Jul 2020 13:14:15 -0400 Subject: [PATCH 01/61] Create runtime variables --- tractor/_state.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tractor/_state.py b/tractor/_state.py index d624fc9..2a18a4c 100644 --- a/tractor/_state.py +++ b/tractor/_state.py @@ -3,10 +3,15 @@ Per process state """ from typing import Optional from collections import Mapping +import multiprocessing as mp import trio _current_actor: Optional['Actor'] = None # type: ignore +_runtime_vars = { + '_debug_mode': False, + '_is_root': False, +} def current_actor() -> 'Actor': # type: ignore @@ -36,3 +41,20 @@ class ActorContextInfo(Mapping): except RuntimeError: # no local actor/task context initialized yet return f'no {key} context' + + +def is_main_process() -> bool: + """Bool determining if this actor is running in the top-most process. + """ + return mp.current_process().name == 'MainProcess' + + +def debug_mode() -> bool: + """Bool determining if "debug mode" is on which enables + remote subactor pdb entry on crashes. + """ + return _runtime_vars['_debug_mode'] + + +def is_root_process() -> bool: + return _runtime_vars['_is_root'] From b11e91375cebeabb1f320a592b29c128cf940a01 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 23 Jul 2020 13:23:55 -0400 Subject: [PATCH 02/61] Initial attempt at multi-actor debugging Allow entering and attaching to a `pdb` instance in a child process. The current hackery is to have the child make an rpc to the parent and ask it to hijack stdin, once complete the child enters a `pdb` blocking method. The parent then relays all stdin input to the child thus controlling the "remote" debugger. A few things were added to accomplish this: - tracking the mapping of subactors to their parent nurseries - in the root actor, cancelling all nurseries under the root `trio` task on cancellation (i.e. `Actor.cancel()`) - pass a "runtime vars" map down the actor tree for propagating global state --- tractor/_actor.py | 12 +++- tractor/_debug.py | 134 +++++++++++++++++++++++++++++++++++++++++++ tractor/_entry.py | 3 + tractor/_spawn.py | 17 +++--- tractor/_trionics.py | 8 ++- 5 files changed, 164 insertions(+), 10 deletions(-) create mode 100644 tractor/_debug.py diff --git a/tractor/_actor.py b/tractor/_actor.py index d59bd82..9053d6f 100644 --- a/tractor/_actor.py +++ b/tractor/_actor.py @@ -7,6 +7,7 @@ from itertools import chain import importlib import importlib.util import inspect +import bdb import uuid import typing from typing import Dict, List, Tuple, Any, Optional @@ -125,8 +126,15 @@ async def _invoke( task_status.started(cs) await chan.send({'return': await coro, 'cid': cid}) except (Exception, trio.MultiError) as err: - # always ship errors back to caller log.exception("Actor errored:") + + # NOTE: don't enter debug mode recursively after quitting pdb + if _state.debug_mode() and not isinstance(err, bdb.BdbQuit): + # Allow for pdb control in parent + from ._debug import post_mortem + await post_mortem() + + # always ship errors back to caller err_msg = pack_error(err) err_msg['cid'] = cid try: @@ -178,6 +186,7 @@ class Actor: def __init__( self, name: str, + *, rpc_module_paths: List[str] = [], statespace: Optional[Dict[str, Any]] = None, uid: str = None, @@ -237,6 +246,7 @@ class Actor: self._parent_chan: Optional[Channel] = None self._forkserver_info: Optional[ Tuple[Any, Any, Any, Any, Any]] = None + self._actoruid2nursery: Dict[str, 'ActorNursery'] = {} async def wait_for_peer( self, uid: Tuple[str, str] diff --git a/tractor/_debug.py b/tractor/_debug.py new file mode 100644 index 0000000..e462b76 --- /dev/null +++ b/tractor/_debug.py @@ -0,0 +1,134 @@ +""" +Multi-core debugging for da peeps! +""" +import pdb +import sys +import tty +from functools import partial +from typing import Awaitable, Tuple + +from async_generator import aclosing +import tractor +import trio + +from .log import get_logger + +log = get_logger(__name__) + + +__all__ = ['breakpoint', 'post_mortem'] + + +# TODO: is there some way to determine this programatically? +_pdb_exit_patterns = tuple( + str.encode(patt + "\n") for patt in ('c', 'cont', 'continue', 'q', 'quit') +) + + +def subactoruid2proc( + actor: 'Actor', # noqa + uid: Tuple[str, str] +) -> trio.Process: + n = actor._actoruid2nursery[uid] + _, proc, _ = n._children[uid] + return proc + + +async def _hijack_stdin_relay_to_child( + subactor_uid: Tuple[str, str] +) -> None: + actor = tractor.current_actor() + proc = subactoruid2proc(actor, subactor_uid) + + # nlb = [] + + async def hijack_stdin(): + log.info(f"Hijacking stdin from {actor.uid}") + # try: + # # disable cooked mode + # fd = sys.stdin.fileno() + # old = tty.tcgetattr(fd) + # tty.setcbreak(fd) + + # trap std in and relay to subproc + async_stdin = trio.wrap_file(sys.stdin) + + async with aclosing(async_stdin): + # while True: + async for msg in async_stdin: + log.trace(f"Stdin input:\n{msg}") + # nlb.append(msg) + # encode to bytes + bmsg = str.encode(msg) + + # relay bytes to subproc over pipe + await proc.stdin.send_all(bmsg) + + # line = str.encode(''.join(nlb)) + # print(line) + + if bmsg in _pdb_exit_patterns: + log.info("Closing stdin hijack") + break + # finally: + # tty.tcsetattr(fd, tty.TCSAFLUSH, old) + + # schedule hijacking in root scope + actor._root_nursery.start_soon(hijack_stdin) + + +# XXX: We only make this sync in case someone wants to +# overload the ``breakpoint()`` built-in. +def _breakpoint(debug_func) -> Awaitable[None]: + """``tractor`` breakpoint entry for engaging pdb machinery + in subactors. + """ + actor = tractor.current_actor() + + async def wait_for_parent_stdin_hijack(): + log.debug('Breakpoint engaged!') + + # TODO: need a more robust check for the "root" actor + if actor._parent_chan: + async with tractor._portal.open_portal( + actor._parent_chan, + start_msg_loop=False, + ) as portal: + # with trio.fail_after(1): + await portal.run( + 'tractor._debug', + '_hijack_stdin_relay_to_child', + subactor_uid=actor.uid, + ) + + # block here one frame up where ``breakpoint()`` + # was awaited and begin handling stdin + debug_func(actor) + + # this must be awaited by caller + return wait_for_parent_stdin_hijack() + + +def _set_trace(actor): + pdb.set_trace( + header=f"\nAttaching pdb to actor: {actor.uid}\n", + # start 2 levels up + frame=sys._getframe().f_back.f_back, + ) + + +breakpoint = partial( + _breakpoint, + _set_trace, +) + + +def _post_mortem(actor): + log.error(f"\nAttaching to pdb in crashed actor: {actor.uid}\n") + pdb.post_mortem() + + +post_mortem = partial( + _breakpoint, + _post_mortem, +) diff --git a/tractor/_entry.py b/tractor/_entry.py index 883cc6b..71dfdec 100644 --- a/tractor/_entry.py +++ b/tractor/_entry.py @@ -57,6 +57,9 @@ def _trio_main( ) -> None: """Entry point for a `trio_run_in_process` subactor. """ + # TODO: make a global func to set this or is it too hacky? + # os.environ['PYTHONBREAKPOINT'] = 'tractor._debug.breakpoint' + if actor.loglevel is not None: log.info( f"Setting loglevel for {actor.uid} to {actor.loglevel}") diff --git a/tractor/_spawn.py b/tractor/_spawn.py index f0eb012..6d19614 100644 --- a/tractor/_spawn.py +++ b/tractor/_spawn.py @@ -23,7 +23,7 @@ from multiprocessing import forkserver # type: ignore from typing import Tuple from . import _forkserver_override -from ._state import current_actor +from ._state import current_actor, is_main_process from .log import get_logger from ._portal import Portal from ._actor import Actor, ActorFailure @@ -87,12 +87,6 @@ def try_set_start_method(name: str) -> Optional[mp.context.BaseContext]: return _ctx -def is_main_process() -> bool: - """Bool determining if this actor is running in the top-most process. - """ - return mp.current_process().name == 'MainProcess' - - async def exhaust_portal( portal: Portal, actor: Actor @@ -206,6 +200,8 @@ async def new_proc( # passed through to actor main bind_addr: Tuple[str, int], parent_addr: Tuple[str, int], + _runtime_vars: Dict[str, Any], # serialized and sent to _child + *, use_trio_run_in_process: bool = False, task_status: TaskStatus[Portal] = trio.TASK_STATUS_IGNORED ) -> None: @@ -241,9 +237,14 @@ async def new_proc( "statespace": subactor.statespace, "_arb_addr": subactor._arb_addr, "bind_host": bind_addr[0], - "bind_port": bind_addr[1] + "bind_port": bind_addr[1], + "_runtime_vars": _runtime_vars, }) + # track subactor in current nursery + curr_actor = current_actor() + curr_actor._actoruid2nursery[subactor.uid] = actor_nursery + # resume caller at next checkpoint now that child is up task_status.started(portal) diff --git a/tractor/_trionics.py b/tractor/_trionics.py index e846144..d51a59b 100644 --- a/tractor/_trionics.py +++ b/tractor/_trionics.py @@ -13,10 +13,11 @@ from ._state import current_actor from .log import get_logger, get_loglevel from ._actor import Actor from ._portal import Portal +from . import _state from . import _spawn -log = get_logger('tractor') +log = get_logger(__name__) _default_bind_addr: Tuple[str, int] = ('127.0.0.1', 0) @@ -58,6 +59,10 @@ class ActorNursery: ) -> Portal: loglevel = loglevel or self._actor.loglevel or get_loglevel() + # configure and pass runtime state + _rtv = _state._runtime_vars.copy() + _rtv['_is_root'] = False + subactor = Actor( name, # modules allowed to invoked funcs from @@ -83,6 +88,7 @@ class ActorNursery: self.errors, bind_addr, parent_addr, + _rtv, # run time vars ) ) From b06d4b023e6bc9b27b1553b3d4505ce9d6707853 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 23 Jul 2020 13:32:29 -0400 Subject: [PATCH 03/61] Add support for "debug mode" When enabled a crashed actor will connect to the parent with `pdb` in post mortem mode. --- tractor/__init__.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tractor/__init__.py b/tractor/__init__.py index ea61d88..5450cbc 100644 --- a/tractor/__init__.py +++ b/tractor/__init__.py @@ -18,6 +18,7 @@ from ._actor import Actor, _start_actor, Arbiter from ._trionics import open_nursery from ._state import current_actor from ._exceptions import RemoteActorError, ModuleNotExposed +from ._debug import breakpoint, post_mortem from . import msg from . import _spawn @@ -103,14 +104,26 @@ def run( # https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods # OR `trio_run_in_process` (the new default). start_method: Optional[str] = None, + debug_mode: bool = False, **kwargs, ) -> Any: """Run a trio-actor async function in process. This is tractor's main entry and the start point for any async actor. """ + # mark top most level process as root actor + _state._runtime_vars['_is_root'] = True + if start_method is not None: _spawn.try_set_start_method(start_method) + + if debug_mode: + _state._runtime_vars['_debug_mode'] = True + + # expose internal debug module to every actor allowing + # for use of ``await tractor.breakpoint()`` + kwargs.setdefault('rpc_module_paths', []).append('tractor._debug') + return trio.run(_main, async_fn, args, kwargs, arbiter_addr, name) From 1d1c881fd737ec9f3057a13599d1f9eed45e8176 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 23 Jul 2020 13:34:03 -0400 Subject: [PATCH 04/61] WIP debugging test script --- debugging/mp_debug.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 debugging/mp_debug.py diff --git a/debugging/mp_debug.py b/debugging/mp_debug.py new file mode 100644 index 0000000..c43a810 --- /dev/null +++ b/debugging/mp_debug.py @@ -0,0 +1,28 @@ +import tractor +import trio + + +async def bubble(): + print('IN BUBBLE') + await trio.sleep(.1) + await tractor.breakpoint() + + +async def bail(): + getattr(doggy) + + +async def main(): + """The main ``tractor`` routine. + """ + async with tractor.open_nursery() as n: + + # portal = await n.run_in_actor('future_self', bubble) + portal = await n.run_in_actor('future_self', bail) + + # The ``async with`` will unblock here since the 'some_linguist' + # actor has completed its main task ``cellar_door``. + + +if __name__ == '__main__': + tractor.run(main, loglevel='info', debug_mode=True) From e7ee0fec345bb5686c53bf7d40e60942af5e9745 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 26 Jul 2020 15:33:24 -0400 Subject: [PATCH 05/61] Pass a copy of the expected exposed modules --- tests/test_rpc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_rpc.py b/tests/test_rpc.py index b3fa1df..8ac9b09 100644 --- a/tests/test_rpc.py +++ b/tests/test_rpc.py @@ -63,7 +63,7 @@ def test_rpc_errors(arb_addr, to_call, testdir): # module should raise a ModuleNotFoundError at import testdir.makefile('.py', tmp_mod=funcname) - # no need to exposed module to the subactor + # no need to expose module to the subactor subactor_exposed_mods = exposed_mods exposed_mods = [] func_defined = False @@ -95,7 +95,7 @@ def test_rpc_errors(arb_addr, to_call, testdir): tractor.run( main, arbiter_addr=arb_addr, - rpc_module_paths=exposed_mods, + rpc_module_paths=exposed_mods.copy(), ) # handle both parameterized cases From 8eb9a742dd9448394a2ea27152c627ac599e0d71 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 26 Jul 2020 17:46:55 -0400 Subject: [PATCH 06/61] Add multi-process debugging support using `pdbpp` This is the first step in addressing #113 and the initial support of #130. Basically this allows (sub)processes to engage the `pdbpp` debug machinery which read/writes the root actor's tty but only in a FIFO semaphored way such that no two processes are using it simultaneously. That means you can have multiple actors enter a trace or crash and run the debugger in a sensible way without clobbering each other's access to stdio. It required adding some "tear down hooks" to a custom `pdbpp.Pdb` type such that we release a child's lock on the parent on debugger exit (in this case when either of the "continue" or "quit" commands are issued to the debugger console). There's some code left commented in anticipation of full support for issue #130 where we're need to actually capture and feed stdin to the target (remote) actor which won't necessarily being running on the same host. --- debugging/mp_debug.py | 8 +- tractor/_actor.py | 8 +- tractor/_debug.py | 174 ++++++++++++++++++++++++++++++------------ 3 files changed, 135 insertions(+), 55 deletions(-) diff --git a/debugging/mp_debug.py b/debugging/mp_debug.py index c43a810..dfbfa91 100644 --- a/debugging/mp_debug.py +++ b/debugging/mp_debug.py @@ -17,12 +17,14 @@ async def main(): """ async with tractor.open_nursery() as n: - # portal = await n.run_in_actor('future_self', bubble) - portal = await n.run_in_actor('future_self', bail) + portal1 = await n.run_in_actor('bubble', bubble) + portal = await n.run_in_actor('bail', bail) + # await portal.result() + # await portal1.result() # The ``async with`` will unblock here since the 'some_linguist' # actor has completed its main task ``cellar_door``. if __name__ == '__main__': - tractor.run(main, loglevel='info', debug_mode=True) + tractor.run(main, loglevel='critical', debug_mode=True) diff --git a/tractor/_actor.py b/tractor/_actor.py index 9053d6f..5cfbb7b 100644 --- a/tractor/_actor.py +++ b/tractor/_actor.py @@ -126,13 +126,14 @@ async def _invoke( task_status.started(cs) await chan.send({'return': await coro, 'cid': cid}) except (Exception, trio.MultiError) as err: - log.exception("Actor errored:") - # NOTE: don't enter debug mode recursively after quitting pdb if _state.debug_mode() and not isinstance(err, bdb.BdbQuit): # Allow for pdb control in parent from ._debug import post_mortem + log.exception("Actor crashed, entering debug mode:") await post_mortem() + else: + log.exception("Actor crashed:") # always ship errors back to caller err_msg = pack_error(err) @@ -182,6 +183,7 @@ class Actor: # Information about `__main__` from parent _parent_main_data: Dict[str, str] + _parent_chan_cs: Optional[trio.CancelScope] = None def __init__( self, @@ -840,6 +842,8 @@ class Actor: # for n in root.child_nurseries: # n.cancel_scope.cancel() + self._parent_chan_cs.cancel() + async def _cancel_task(self, cid, chan): """Cancel a local task by call-id / channel. diff --git a/tractor/_debug.py b/tractor/_debug.py index e462b76..5d3626b 100644 --- a/tractor/_debug.py +++ b/tractor/_debug.py @@ -1,9 +1,7 @@ """ Multi-core debugging for da peeps! """ -import pdb import sys -import tty from functools import partial from typing import Awaitable, Tuple @@ -13,6 +11,16 @@ import trio from .log import get_logger +try: + # wtf: only exported when installed in dev mode? + import pdbpp +except ImportError: + # pdbpp is installed in regular mode... + import pdb + assert pdb.xpm, "pdbpp is not installed?" + pdbpp = pdb + + log = get_logger(__name__) @@ -25,56 +33,96 @@ _pdb_exit_patterns = tuple( ) -def subactoruid2proc( - actor: 'Actor', # noqa - uid: Tuple[str, str] -) -> trio.Process: - n = actor._actoruid2nursery[uid] - _, proc, _ = n._children[uid] - return proc +_pdb_release_hook = None + + +class PdbwTeardown(pdbpp.Pdb): + """Add teardown hooks to the regular ``pdbpp.Pdb``. + """ + # TODO: figure out how to dissallow recursive .set_trace() entry + # since that'll cause deadlock for us. + def set_continue(self): + super().set_continue() + self.config.teardown(self) + + def set_quit(self): + super().set_quit() + self.config.teardown(self) + + +class TractorConfig(pdbpp.DefaultConfig): + """Custom ``pdbpp`` goodness. + """ + sticky_by_default = True + + def teardown(self, _pdb): + _pdb_release_hook(_pdb) + + +# override the pdbpp config with our coolio one +pdbpp.Pdb.DefaultConfig = TractorConfig + + +# TODO: will be needed whenever we get to true remote debugging. +# XXX see https://github.com/goodboy/tractor/issues/130 + +# def subactoruid2proc( +# actor: 'Actor', # noqa +# uid: Tuple[str, str] +# ) -> trio.Process: +# n = actor._actoruid2nursery[uid] +# _, proc, _ = n._children[uid] +# return proc + +# async def hijack_stdin(): +# log.info(f"Hijacking stdin from {actor.uid}") + +# trap std in and relay to subproc +# async_stdin = trio.wrap_file(sys.stdin) + +# async with aclosing(async_stdin): +# async for msg in async_stdin: +# log.trace(f"Stdin input:\n{msg}") +# # encode to bytes +# bmsg = str.encode(msg) + +# # relay bytes to subproc over pipe +# # await proc.stdin.send_all(bmsg) + +# if bmsg in _pdb_exit_patterns: +# log.info("Closing stdin hijack") +# break async def _hijack_stdin_relay_to_child( subactor_uid: Tuple[str, str] ) -> None: actor = tractor.current_actor() - proc = subactoruid2proc(actor, subactor_uid) + debug_lock = actor.statespace.setdefault( + '_debug_lock', trio.StrictFIFOLock() + ) - # nlb = [] + log.debug(f"Actor {subactor_uid} is waiting on stdin hijack lock") + await debug_lock.acquire() + log.warning(f"Actor {subactor_uid} acquired stdin hijack lock") - async def hijack_stdin(): - log.info(f"Hijacking stdin from {actor.uid}") - # try: - # # disable cooked mode - # fd = sys.stdin.fileno() - # old = tty.tcgetattr(fd) - # tty.setcbreak(fd) + # TODO: when we get to true remote debugging + # this will deliver stdin data + try: + # indicate to child that we've locked stdio + yield 'Locked' - # trap std in and relay to subproc - async_stdin = trio.wrap_file(sys.stdin) + # wait for cancellation of stream by child + await trio.sleep_forever() - async with aclosing(async_stdin): - # while True: - async for msg in async_stdin: - log.trace(f"Stdin input:\n{msg}") - # nlb.append(msg) - # encode to bytes - bmsg = str.encode(msg) + # TODO: for remote debugging schedule hijacking in root scope + # (see above) + # actor._root_nursery.start_soon(hijack_stdin) - # relay bytes to subproc over pipe - await proc.stdin.send_all(bmsg) - - # line = str.encode(''.join(nlb)) - # print(line) - - if bmsg in _pdb_exit_patterns: - log.info("Closing stdin hijack") - break - # finally: - # tty.tcsetattr(fd, tty.TCSAFLUSH, old) - - # schedule hijacking in root scope - actor._root_nursery.start_soon(hijack_stdin) + finally: + if debug_lock.locked(): + debug_lock.release() + log.debug(f"Actor {subactor_uid} released stdin hijack lock") # XXX: We only make this sync in case someone wants to @@ -84,35 +132,61 @@ def _breakpoint(debug_func) -> Awaitable[None]: in subactors. """ actor = tractor.current_actor() + do_unlock = trio.Event() - async def wait_for_parent_stdin_hijack(): - log.debug('Breakpoint engaged!') + async def wait_for_parent_stdin_hijack( + task_status=trio.TASK_STATUS_IGNORED + ): # TODO: need a more robust check for the "root" actor if actor._parent_chan: async with tractor._portal.open_portal( actor._parent_chan, start_msg_loop=False, + shield=True, ) as portal: # with trio.fail_after(1): - await portal.run( + agen = await portal.run( 'tractor._debug', '_hijack_stdin_relay_to_child', subactor_uid=actor.uid, ) + async with aclosing(agen): + async for val in agen: + assert val == 'Locked' + task_status.started() + with trio.CancelScope(shield=True): + await do_unlock.wait() + + # trigger cancellation of remote stream + break + + log.debug(f"Child {actor} released parent stdio lock") + + def unlock(_pdb): + do_unlock.set() + + global _pdb_release_hook + _pdb_release_hook = unlock + + async def _bp(): + # this must be awaited by caller + await actor._root_nursery.start( + wait_for_parent_stdin_hijack + ) # block here one frame up where ``breakpoint()`` # was awaited and begin handling stdin debug_func(actor) - # this must be awaited by caller - return wait_for_parent_stdin_hijack() + # return wait_for_parent_stdin_hijack() + return _bp() def _set_trace(actor): - pdb.set_trace( - header=f"\nAttaching pdb to actor: {actor.uid}\n", - # start 2 levels up + log.critical(f"\nAttaching pdb to actor: {actor.uid}\n") + PdbwTeardown().set_trace( + # start 2 levels up in user code frame=sys._getframe().f_back.f_back, ) @@ -125,7 +199,7 @@ breakpoint = partial( def _post_mortem(actor): log.error(f"\nAttaching to pdb in crashed actor: {actor.uid}\n") - pdb.post_mortem() + pdbpp.xpm(Pdb=PdbwTeardown) post_mortem = partial( From f7cd2be039c490897d40c1d8af214320116b0675 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 29 Jul 2020 20:48:15 -0400 Subject: [PATCH 07/61] Play with re-entrant trace --- debugging/mp_debug.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/debugging/mp_debug.py b/debugging/mp_debug.py index dfbfa91..a279715 100644 --- a/debugging/mp_debug.py +++ b/debugging/mp_debug.py @@ -4,8 +4,9 @@ import trio async def bubble(): print('IN BUBBLE') - await trio.sleep(.1) - await tractor.breakpoint() + while True: + await trio.sleep(.1) + await tractor.breakpoint() async def bail(): @@ -27,4 +28,4 @@ async def main(): if __name__ == '__main__': - tractor.run(main, loglevel='critical', debug_mode=True) + tractor.run(main, loglevel='error', debug_mode=True) From efd7095cf850e5b14b28767521f48eeb12a30eb0 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 29 Jul 2020 20:57:10 -0400 Subject: [PATCH 08/61] Add pdbpp as dep --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 24d4479..e31e359 100755 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ setup( ], install_requires=[ 'msgpack', 'trio>0.8', 'async_generator', 'colorlog', 'wrapt', - 'trio_typing' + 'trio_typing', 'pdbpp', ], tests_require=['pytest'], python_requires=">=3.7", From abaa2f5da0a47637d429abfcec9f8d8357fda0cc Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 30 Jul 2020 10:24:08 -0400 Subject: [PATCH 09/61] Drop uneeded `parent_chan_cs()` cancel call --- tractor/_actor.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tractor/_actor.py b/tractor/_actor.py index 5cfbb7b..a02285f 100644 --- a/tractor/_actor.py +++ b/tractor/_actor.py @@ -842,8 +842,6 @@ class Actor: # for n in root.child_nurseries: # n.cancel_scope.cancel() - self._parent_chan_cs.cancel() - async def _cancel_task(self, cid, chan): """Cancel a local task by call-id / channel. From 68773d51fd10c7e56499308f51d1ea4f3343d26a Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 30 Jul 2020 10:41:58 -0400 Subject: [PATCH 10/61] Always expose the debug module --- tractor/_actor.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tractor/_actor.py b/tractor/_actor.py index a02285f..3e244fa 100644 --- a/tractor/_actor.py +++ b/tractor/_actor.py @@ -206,8 +206,11 @@ class Actor: # will be passed to children self._parent_main_data = _mp_fixup_main._mp_figure_out_main() + # always include debugging tools module + rpc_module_paths.append('tractor._debug') + mods = {} - for name in rpc_module_paths or (): + for name in rpc_module_paths: mod = importlib.import_module(name) mods[name] = _get_mod_abspath(mod) From f9ef3fc5de0a5fb7d9d5e6f88aa93205069adcd7 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 30 Jul 2020 10:42:22 -0400 Subject: [PATCH 11/61] Cleanups and more comments --- tractor/_debug.py | 63 +++++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 32 deletions(-) diff --git a/tractor/_debug.py b/tractor/_debug.py index 5d3626b..e5da9a5 100644 --- a/tractor/_debug.py +++ b/tractor/_debug.py @@ -3,7 +3,7 @@ Multi-core debugging for da peeps! """ import sys from functools import partial -from typing import Awaitable, Tuple +from typing import Awaitable, Tuple, Optional, Callable from async_generator import aclosing import tractor @@ -27,27 +27,8 @@ log = get_logger(__name__) __all__ = ['breakpoint', 'post_mortem'] -# TODO: is there some way to determine this programatically? -_pdb_exit_patterns = tuple( - str.encode(patt + "\n") for patt in ('c', 'cont', 'continue', 'q', 'quit') -) - - -_pdb_release_hook = None - - -class PdbwTeardown(pdbpp.Pdb): - """Add teardown hooks to the regular ``pdbpp.Pdb``. - """ - # TODO: figure out how to dissallow recursive .set_trace() entry - # since that'll cause deadlock for us. - def set_continue(self): - super().set_continue() - self.config.teardown(self) - - def set_quit(self): - super().set_quit() - self.config.teardown(self) +# placeholder for function to set a ``trio.Event`` +_pdb_release_hook: Optional[Callable] = None class TractorConfig(pdbpp.DefaultConfig): @@ -59,13 +40,32 @@ class TractorConfig(pdbpp.DefaultConfig): _pdb_release_hook(_pdb) -# override the pdbpp config with our coolio one -pdbpp.Pdb.DefaultConfig = TractorConfig +class PdbwTeardown(pdbpp.Pdb): + """Add teardown hooks to the regular ``pdbpp.Pdb``. + """ + # override the pdbpp config with our coolio one + DefaultConfig = TractorConfig + + # TODO: figure out how to dissallow recursive .set_trace() entry + # since that'll cause deadlock for us. + def set_continue(self): + super().set_continue() + self.config.teardown(self) + + def set_quit(self): + super().set_quit() + self.config.teardown(self) # TODO: will be needed whenever we get to true remote debugging. # XXX see https://github.com/goodboy/tractor/issues/130 +# # TODO: is there some way to determine this programatically? +# _pdb_exit_patterns = tuple( +# str.encode(patt + "\n") for patt in ( +# 'c', 'cont', 'continue', 'q', 'quit') +# ) + # def subactoruid2proc( # actor: 'Actor', # noqa # uid: Tuple[str, str] @@ -170,17 +170,16 @@ def _breakpoint(debug_func) -> Awaitable[None]: _pdb_release_hook = unlock async def _bp(): - # this must be awaited by caller - await actor._root_nursery.start( - wait_for_parent_stdin_hijack - ) + # this **must** be awaited by the caller and is done using the + # root nursery so that the debugger can continue to run without + # being restricted by the scope of a new task nursery. + await actor._root_nursery.start(wait_for_parent_stdin_hijack) - # block here one frame up where ``breakpoint()`` - # was awaited and begin handling stdin + # block here one (at the appropriate frame *up* where + # ``breakpoint()`` was awaited and begin handling stdio debug_func(actor) - # return wait_for_parent_stdin_hijack() - return _bp() + return _bp() # user code **must** await this! def _set_trace(actor): From ebb21b9ba3a20fac2c3a7f9ce9825da413098a16 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sat, 1 Aug 2020 13:39:05 -0400 Subject: [PATCH 12/61] Support re-entrant breakpoints Keep an actor local (bool) flag which determines if there is already a running debugger instance for the current process. If another task tries to enter in this case, simply ignore it since allowing entry may result in a deadlock where the new task will be sync waiting on the parent stdio lock (a case that will never arrive due to the current debugger's active use of it). In the future we may want to allow FIFO queueing of local tasks where instead of ignoring re-entrant breakpoints we allow tasks to async wait for debugger release, though not sure the implications of that since you'd likely want to support switching the debugger to the new task and that could cause deadlocks where tasks are inter-dependent. It may be more sane to just error on multiple breakpoint requests within an actor. --- tractor/_debug.py | 75 +++++++++++++++++++++++++++-------------------- 1 file changed, 44 insertions(+), 31 deletions(-) diff --git a/tractor/_debug.py b/tractor/_debug.py index e5da9a5..b3a630b 100644 --- a/tractor/_debug.py +++ b/tractor/_debug.py @@ -36,8 +36,8 @@ class TractorConfig(pdbpp.DefaultConfig): """ sticky_by_default = True - def teardown(self, _pdb): - _pdb_release_hook(_pdb) + def teardown(self): + _pdb_release_hook() class PdbwTeardown(pdbpp.Pdb): @@ -50,11 +50,11 @@ class PdbwTeardown(pdbpp.Pdb): # since that'll cause deadlock for us. def set_continue(self): super().set_continue() - self.config.teardown(self) + self.config.teardown() def set_quit(self): super().set_quit() - self.config.teardown(self) + self.config.teardown() # TODO: will be needed whenever we get to true remote debugging. @@ -140,36 +140,48 @@ def _breakpoint(debug_func) -> Awaitable[None]: # TODO: need a more robust check for the "root" actor if actor._parent_chan: - async with tractor._portal.open_portal( - actor._parent_chan, - start_msg_loop=False, - shield=True, - ) as portal: - # with trio.fail_after(1): - agen = await portal.run( - 'tractor._debug', - '_hijack_stdin_relay_to_child', - subactor_uid=actor.uid, - ) - async with aclosing(agen): - async for val in agen: - assert val == 'Locked' - task_status.started() - with trio.CancelScope(shield=True): - await do_unlock.wait() - - # trigger cancellation of remote stream - break + try: + async with tractor._portal.open_portal( + actor._parent_chan, + start_msg_loop=False, + shield=True, + ) as portal: + # with trio.fail_after(1): + agen = await portal.run( + 'tractor._debug', + '_hijack_stdin_relay_to_child', + subactor_uid=actor.uid, + ) + async with aclosing(agen): + async for val in agen: + assert val == 'Locked' + task_status.started() + with trio.CancelScope(shield=True): + await do_unlock.wait() + # trigger cancellation of remote stream + break + finally: + log.debug(f"Exiting debugger for actor {actor}") + actor.statespace['_in_debug'] = False log.debug(f"Child {actor} released parent stdio lock") - def unlock(_pdb): - do_unlock.set() - - global _pdb_release_hook - _pdb_release_hook = unlock - async def _bp(): + """Async breakpoint which schedules a parent stdio lock, and once complete + enters the ``pdbpp`` debugging console. + """ + in_debug = actor.statespace.setdefault('_in_debug', False) + + if in_debug: + log.warning(f"Actor {actor} already has a debug lock, skipping...") + return + + # assign unlock callback for debugger teardown hooks + global _pdb_release_hook + _pdb_release_hook = do_unlock.set + + actor.statespace['_in_debug'] = True + # this **must** be awaited by the caller and is done using the # root nursery so that the debugger can continue to run without # being restricted by the scope of a new task nursery. @@ -179,7 +191,8 @@ def _breakpoint(debug_func) -> Awaitable[None]: # ``breakpoint()`` was awaited and begin handling stdio debug_func(actor) - return _bp() # user code **must** await this! + # user code **must** await this! + return _bp() def _set_trace(actor): From fd5fb9241a15ee66333c5898922e12c6d0855f23 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 2 Aug 2020 22:30:03 -0400 Subject: [PATCH 13/61] Sparsen some lines --- tractor/__init__.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tractor/__init__.py b/tractor/__init__.py index 5450cbc..e53dfa4 100644 --- a/tractor/__init__.py +++ b/tractor/__init__.py @@ -54,9 +54,14 @@ async def _main( """Async entry point for ``tractor``. """ logger = log.get_logger('tractor') + main = partial(async_fn, *args) + arbiter_addr = (host, port) = arbiter_addr or ( - _default_arbiter_host, _default_arbiter_port) + _default_arbiter_host, + _default_arbiter_port + ) + loglevel = kwargs.get('loglevel', log.get_loglevel()) if loglevel is not None: log._default_loglevel = loglevel @@ -99,7 +104,9 @@ def run( *args, name: Optional[str] = None, arbiter_addr: Tuple[str, int] = ( - _default_arbiter_host, _default_arbiter_port), + _default_arbiter_host, + _default_arbiter_port, + ), # either the `multiprocessing` start method: # https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods # OR `trio_run_in_process` (the new default). From bd157e05efecde891e4e61fdf488991dfe766997 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sat, 8 Aug 2020 23:00:56 -0400 Subject: [PATCH 14/61] Port to service nursery --- tractor/_debug.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tractor/_debug.py b/tractor/_debug.py index b3a630b..baa293e 100644 --- a/tractor/_debug.py +++ b/tractor/_debug.py @@ -185,7 +185,7 @@ def _breakpoint(debug_func) -> Awaitable[None]: # this **must** be awaited by the caller and is done using the # root nursery so that the debugger can continue to run without # being restricted by the scope of a new task nursery. - await actor._root_nursery.start(wait_for_parent_stdin_hijack) + await actor._service_n.start(wait_for_parent_stdin_hijack) # block here one (at the appropriate frame *up* where # ``breakpoint()`` was awaited and begin handling stdio From 291ecec070e87248efe32f86ce4a025439485ecd Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 6 Aug 2020 23:47:43 -0400 Subject: [PATCH 15/61] Maybe not sticky by default --- tractor/_debug.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tractor/_debug.py b/tractor/_debug.py index baa293e..7e92a03 100644 --- a/tractor/_debug.py +++ b/tractor/_debug.py @@ -34,7 +34,7 @@ _pdb_release_hook: Optional[Callable] = None class TractorConfig(pdbpp.DefaultConfig): """Custom ``pdbpp`` goodness. """ - sticky_by_default = True + # sticky_by_default = True def teardown(self): _pdb_release_hook() From 150179bfe4de749dda0a6bb18f80cead2c8217cc Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sat, 12 Sep 2020 11:47:14 -0400 Subject: [PATCH 16/61] Support entering post mortem on crashes in root actor --- tractor/_debug.py | 129 ++++++++++++++++++++++++++++++---------------- 1 file changed, 85 insertions(+), 44 deletions(-) diff --git a/tractor/_debug.py b/tractor/_debug.py index 7e92a03..c31f16f 100644 --- a/tractor/_debug.py +++ b/tractor/_debug.py @@ -1,8 +1,10 @@ """ Multi-core debugging for da peeps! """ +import bdb import sys from functools import partial +from contextlib import asynccontextmanager, AsyncExitStack from typing import Awaitable, Tuple, Optional, Callable from async_generator import aclosing @@ -10,6 +12,7 @@ import tractor import trio from .log import get_logger +from . import _state try: # wtf: only exported when installed in dev mode? @@ -92,37 +95,40 @@ class PdbwTeardown(pdbpp.Pdb): # if bmsg in _pdb_exit_patterns: # log.info("Closing stdin hijack") # break +@asynccontextmanager +async def _acquire_debug_lock(): + """Acquire a actor local FIFO lock meant to mutex entry to a local + debugger entry point to avoid tty clobbering by multiple processes. + """ + try: + actor = tractor.current_actor() + debug_lock = actor.statespace.setdefault( + '_debug_lock', trio.StrictFIFOLock() + ) + await debug_lock.acquire() + log.error("TTY lock acquired") + yield + finally: + if debug_lock.locked(): + debug_lock.release() + log.error("TTY lock released") async def _hijack_stdin_relay_to_child( subactor_uid: Tuple[str, str] ) -> None: - actor = tractor.current_actor() - debug_lock = actor.statespace.setdefault( - '_debug_lock', trio.StrictFIFOLock() - ) - - log.debug(f"Actor {subactor_uid} is waiting on stdin hijack lock") - await debug_lock.acquire() - log.warning(f"Actor {subactor_uid} acquired stdin hijack lock") - # TODO: when we get to true remote debugging # this will deliver stdin data - try: + log.debug(f"Actor {subactor_uid} is waiting on stdin hijack lock") + async with _acquire_debug_lock(): + log.warning(f"Actor {subactor_uid} acquired stdin hijack lock") # indicate to child that we've locked stdio yield 'Locked' # wait for cancellation of stream by child await trio.sleep_forever() - # TODO: for remote debugging schedule hijacking in root scope - # (see above) - # actor._root_nursery.start_soon(hijack_stdin) - - finally: - if debug_lock.locked(): - debug_lock.release() - log.debug(f"Actor {subactor_uid} released stdin hijack lock") + log.debug(f"Actor {subactor_uid} released stdin hijack lock") # XXX: We only make this sync in case someone wants to @@ -137,34 +143,31 @@ def _breakpoint(debug_func) -> Awaitable[None]: async def wait_for_parent_stdin_hijack( task_status=trio.TASK_STATUS_IGNORED ): - - # TODO: need a more robust check for the "root" actor - if actor._parent_chan: - try: - async with tractor._portal.open_portal( - actor._parent_chan, - start_msg_loop=False, - shield=True, - ) as portal: - # with trio.fail_after(1): + try: + async with tractor._portal.open_portal( + actor._parent_chan, + start_msg_loop=False, + shield=True, + ) as portal: + with trio.fail_after(1): agen = await portal.run( 'tractor._debug', '_hijack_stdin_relay_to_child', subactor_uid=actor.uid, ) - async with aclosing(agen): - async for val in agen: - assert val == 'Locked' - task_status.started() - with trio.CancelScope(shield=True): - await do_unlock.wait() + async with aclosing(agen): + async for val in agen: + assert val == 'Locked' + task_status.started() + with trio.CancelScope(shield=True): + await do_unlock.wait() - # trigger cancellation of remote stream - break - finally: - log.debug(f"Exiting debugger for actor {actor}") - actor.statespace['_in_debug'] = False - log.debug(f"Child {actor} released parent stdio lock") + # trigger cancellation of remote stream + break + finally: + log.debug(f"Exiting debugger for actor {actor}") + actor.statespace['_in_debug'] = False + log.debug(f"Child {actor} released parent stdio lock") async def _bp(): """Async breakpoint which schedules a parent stdio lock, and once complete @@ -182,10 +185,27 @@ def _breakpoint(debug_func) -> Awaitable[None]: actor.statespace['_in_debug'] = True - # this **must** be awaited by the caller and is done using the - # root nursery so that the debugger can continue to run without - # being restricted by the scope of a new task nursery. - await actor._service_n.start(wait_for_parent_stdin_hijack) + # TODO: need a more robust check for the "root" actor + if actor._parent_chan: + # this **must** be awaited by the caller and is done using the + # root nursery so that the debugger can continue to run without + # being restricted by the scope of a new task nursery. + await actor._service_n.start(wait_for_parent_stdin_hijack) + + # block here one (at the appropriate frame *up* where + # ``breakpoint()`` was awaited and begin handling stdio + # debug_func(actor) + else: + # we also wait in the root-parent for any child that + # may have the tty locked prior + async def _lock( + task_status=trio.TASK_STATUS_IGNORED + ): + async with _acquire_debug_lock(): + task_status.started() + await do_unlock.wait() + + await actor._service_n.start(_lock) # block here one (at the appropriate frame *up* where # ``breakpoint()`` was awaited and begin handling stdio @@ -218,3 +238,24 @@ post_mortem = partial( _breakpoint, _post_mortem, ) + + +async def _maybe_enter_pm(err): + if ( + _state.debug_mode() + and not isinstance(err, bdb.BdbQuit) + + # XXX: if the error is the likely result of runtime-wide + # cancellation, we don't want to enter the debugger since + # there's races between when the parent actor has killed all + # comms and when the child tries to contact said parent to + # acquire the tty lock. + # Really we just want to mostly avoid catching KBIs here so there + # might be a simpler check we can do? + and trio.MultiError.filter( + lambda exc: exc if not isinstance(exc, trio.Cancelled) else None, + err, + ) + ): + log.warning("Actor crashed, entering debug mode") + await post_mortem() From 8b6e9f5530be07645f4a2c6a5a21b0dd94877321 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sat, 12 Sep 2020 11:48:20 -0400 Subject: [PATCH 17/61] Port to new debug api, set `_is_root` state flag on startup --- tractor/_actor.py | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/tractor/_actor.py b/tractor/_actor.py index 3e244fa..5d2a35d 100644 --- a/tractor/_actor.py +++ b/tractor/_actor.py @@ -7,7 +7,6 @@ from itertools import chain import importlib import importlib.util import inspect -import bdb import uuid import typing from typing import Dict, List, Tuple, Any, Optional @@ -27,6 +26,7 @@ from ._exceptions import ( unpack_error, ModuleNotExposed ) +from ._debug import _maybe_enter_pm from ._discovery import get_arbiter from ._portal import Portal from . import _state @@ -127,13 +127,8 @@ async def _invoke( await chan.send({'return': await coro, 'cid': cid}) except (Exception, trio.MultiError) as err: # NOTE: don't enter debug mode recursively after quitting pdb - if _state.debug_mode() and not isinstance(err, bdb.BdbQuit): - # Allow for pdb control in parent - from ._debug import post_mortem - log.exception("Actor crashed, entering debug mode:") - await post_mortem() - else: - log.exception("Actor crashed:") + log.exception("Actor crashed:") + await _maybe_enter_pm(err) # always ship errors back to caller err_msg = pack_error(err) @@ -201,6 +196,7 @@ class Actor: """ self.name = name self.uid = (name, uid or str(uuid.uuid4())) + self._is_cancelled: bool = False # retreive and store parent `__main__` data which # will be passed to children @@ -251,7 +247,7 @@ class Actor: self._parent_chan: Optional[Channel] = None self._forkserver_info: Optional[ Tuple[Any, Any, Any, Any, Any]] = None - self._actoruid2nursery: Dict[str, 'ActorNursery'] = {} + self._actoruid2nursery: Dict[str, 'ActorNursery'] = {} # noqa async def wait_for_peer( self, uid: Tuple[str, str] @@ -564,7 +560,7 @@ class Actor: f"Exiting msg loop for {chan} from {chan.uid} " f"with last msg:\n{msg}") - async def _chan_to_parent( + async def _from_parent( self, parent_addr: Optional[Tuple[str, int]], ) -> Tuple[Channel, Optional[Tuple[str, int]]]: @@ -593,6 +589,10 @@ class Actor: parent_data.pop('bind_host'), parent_data.pop('bind_port'), ) + rvs = parent_data.pop('_runtime_vars') + rvs['_is_root'] = False + _state._runtime_vars.update(rvs) + for attr, value in parent_data.items(): setattr(self, attr, value) @@ -630,13 +630,13 @@ class Actor: # establish primary connection with immediate parent self._parent_chan = None if parent_addr is not None: - self._parent_chan, accept_addr_from_rent = await self._chan_to_parent( + self._parent_chan, accept_addr_rent = await self._from_parent( parent_addr) # either it's passed in because we're not a child # or because we're running in mp mode - if accept_addr_from_rent is not None: - accept_addr = accept_addr_from_rent + if accept_addr_rent is not None: + accept_addr = accept_addr_rent # load exposed/allowed RPC modules # XXX: do this **after** establishing a channel to the parent @@ -711,7 +711,7 @@ class Actor: # Blocks here as expected until the root nursery is # killed (i.e. this actor is cancelled or signalled by the parent) - except (trio.MultiError, Exception) as err: + except Exception as err: if not registered_with_arbiter: # TODO: I guess we could try to connect back # to the parent through a channel and engage a debugger @@ -822,6 +822,8 @@ class Actor: spawning new rpc tasks - return control the parent channel message loop """ + self._is_cancelled = True + # cancel all ongoing rpc tasks with trio.CancelScope(shield=True): # kill all ongoing tasks @@ -1039,7 +1041,12 @@ async def _start_actor( parent_addr=None ) ) - result = await main() + try: + result = await main() + except (Exception, trio.MultiError) as err: + log.exception("Actor crashed:") + await _maybe_enter_pm(err) + raise # XXX: the actor is cancelled when this context is complete # given that there are no more active peer channels connected From 09daba4c9c25bf666e4c526321cf05fc43ddd172 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sat, 12 Sep 2020 11:48:57 -0400 Subject: [PATCH 18/61] Explicitly handle `debug_mode` flag correctly --- tractor/__init__.py | 39 ++++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/tractor/__init__.py b/tractor/__init__.py index e53dfa4..b65811a 100644 --- a/tractor/__init__.py +++ b/tractor/__init__.py @@ -47,14 +47,25 @@ _default_arbiter_port = 1616 async def _main( async_fn: typing.Callable[..., typing.Awaitable], args: Tuple, - kwargs: typing.Dict[str, typing.Any], arbiter_addr: Tuple[str, int], name: Optional[str] = None, + start_method: Optional[str] = None, + debug_mode: bool = False, + **kwargs: typing.Dict[str, typing.Any], ) -> typing.Any: """Async entry point for ``tractor``. """ logger = log.get_logger('tractor') + if start_method is not None: + _spawn.try_set_start_method(start_method) + + if debug_mode: + _state._runtime_vars['_debug_mode'] = True + # expose internal debug module to every actor allowing + # for use of ``await tractor.breakpoint()`` + kwargs.setdefault('rpc_module_paths', []).append('tractor._debug') + main = partial(async_fn, *args) arbiter_addr = (host, port) = arbiter_addr or ( @@ -109,7 +120,7 @@ def run( ), # either the `multiprocessing` start method: # https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods - # OR `trio_run_in_process` (the new default). + # OR `trio` (the new default). start_method: Optional[str] = None, debug_mode: bool = False, **kwargs, @@ -121,17 +132,23 @@ def run( # mark top most level process as root actor _state._runtime_vars['_is_root'] = True - if start_method is not None: - _spawn.try_set_start_method(start_method) + return trio.run( + partial( + # our entry + _main, - if debug_mode: - _state._runtime_vars['_debug_mode'] = True + # user entry point + async_fn, + args, - # expose internal debug module to every actor allowing - # for use of ``await tractor.breakpoint()`` - kwargs.setdefault('rpc_module_paths', []).append('tractor._debug') - - return trio.run(_main, async_fn, args, kwargs, arbiter_addr, name) + # global kwargs + arbiter_addr=arbiter_addr, + name=name, + start_method=start_method, + debug_mode=debug_mode, + **kwargs, + ) + ) def run_daemon( From 9e1d9a8ce11d865dd0ed9ee5ad8263882f8e527b Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 24 Sep 2020 09:59:18 -0400 Subject: [PATCH 19/61] Add an internal context stack This aids with tearing down resources **after** the crash handling and debugger have completed. Leaving this internal for now but should eventually get a public convenience function like `tractor.context_stack()`. --- tractor/_actor.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tractor/_actor.py b/tractor/_actor.py index 5d2a35d..e5b8ef0 100644 --- a/tractor/_actor.py +++ b/tractor/_actor.py @@ -13,6 +13,7 @@ from typing import Dict, List, Tuple, Any, Optional from types import ModuleType import sys import os +from contextlib import ExitStack import trio # type: ignore from trio_typing import TaskStatus @@ -150,7 +151,7 @@ async def _invoke( # If we're cancelled before the task returns then the # cancel scope will not have been inserted yet log.warn( - f"Task {func} was likely cancelled before it was started") + f"Task {func} likely errored or cancelled before it started") if not actor._rpc_tasks: log.info("All RPC tasks have completed") @@ -175,6 +176,7 @@ class Actor: _root_n: Optional[trio.Nursery] = None _service_n: Optional[trio.Nursery] = None _server_n: Optional[trio.Nursery] = None + _lifetime_stack: ExitStack = ExitStack() # Information about `__main__` from parent _parent_main_data: Dict[str, str] @@ -426,7 +428,6 @@ class Actor: async def _process_messages( self, chan: Channel, - treat_as_gen: bool = False, shield: bool = False, task_status: TaskStatus[trio.CancelScope] = trio.TASK_STATUS_IGNORED, ) -> None: @@ -743,6 +744,11 @@ class Actor: finally: log.info("Root nursery complete") + # tear down all lifetime contexts + # api idea: ``tractor.open_context()`` + log.warn("Closing all actor lifetime contexts") + self._lifetime_stack.close() + # Unregister actor from the arbiter if registered_with_arbiter and ( self._arb_addr is not None From f1b242f913b9bf3d84fbf6b8e9d7d154bb3ef9d5 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 28 Sep 2020 08:54:21 -0400 Subject: [PATCH 20/61] Block SIGINT handling while in the debugger This seems to prevent a certain class of bugs to do with the root actor cancelling local tasks and getting into deadlock while children are trying to acquire the tty lock. I'm not sure it's the best idea yet since you're pretty much guaranteed to get "stuck" if a child activates the debugger after the root has been cancelled (at least "stuck" in terms of SIGINT being ignored). That kinda race condition seems to still exist somehow: a child can "beat" the root to activating the tty lock and the parent is stuck waiting on the child to terminate via its nursery. --- tractor/_debug.py | 133 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 98 insertions(+), 35 deletions(-) diff --git a/tractor/_debug.py b/tractor/_debug.py index c31f16f..4d43025 100644 --- a/tractor/_debug.py +++ b/tractor/_debug.py @@ -4,8 +4,9 @@ Multi-core debugging for da peeps! import bdb import sys from functools import partial -from contextlib import asynccontextmanager, AsyncExitStack +from contextlib import asynccontextmanager, contextmanager from typing import Awaitable, Tuple, Optional, Callable +import signal from async_generator import aclosing import tractor @@ -23,25 +24,27 @@ except ImportError: assert pdb.xpm, "pdbpp is not installed?" pdbpp = pdb - log = get_logger(__name__) __all__ = ['breakpoint', 'post_mortem'] -# placeholder for function to set a ``trio.Event`` +# placeholder for function to set a ``trio.Event`` on debugger exit _pdb_release_hook: Optional[Callable] = None +# actor-wide flag +_in_debug = False + +# lock in root actor preventing multi-access to local tty +_debug_lock = trio.StrictFIFOLock() + class TractorConfig(pdbpp.DefaultConfig): """Custom ``pdbpp`` goodness. """ # sticky_by_default = True - def teardown(self): - _pdb_release_hook() - class PdbwTeardown(pdbpp.Pdb): """Add teardown hooks to the regular ``pdbpp.Pdb``. @@ -52,12 +55,20 @@ class PdbwTeardown(pdbpp.Pdb): # TODO: figure out how to dissallow recursive .set_trace() entry # since that'll cause deadlock for us. def set_continue(self): - super().set_continue() - self.config.teardown() + global _in_debug + try: + super().set_continue() + finally: + _in_debug = False + _pdb_release_hook() def set_quit(self): - super().set_quit() - self.config.teardown() + global _in_debug + try: + super().set_quit() + finally: + _in_debug = False + _pdb_release_hook() # TODO: will be needed whenever we get to true remote debugging. @@ -95,40 +106,75 @@ class PdbwTeardown(pdbpp.Pdb): # if bmsg in _pdb_exit_patterns: # log.info("Closing stdin hijack") # break + + @asynccontextmanager async def _acquire_debug_lock(): """Acquire a actor local FIFO lock meant to mutex entry to a local debugger entry point to avoid tty clobbering by multiple processes. """ try: - actor = tractor.current_actor() - debug_lock = actor.statespace.setdefault( - '_debug_lock', trio.StrictFIFOLock() - ) - await debug_lock.acquire() + log.error("TTY BEING ACQUIRED") + await _debug_lock.acquire() log.error("TTY lock acquired") yield finally: - if debug_lock.locked(): - debug_lock.release() + if _debug_lock.locked(): + _debug_lock.release() log.error("TTY lock released") +def handler(signum, frame): + """Block SIGINT while in debug to avoid deadlocks with cancellation. + """ + print( + "tractor ignores SIGINT while in debug mode\n" + "If you have a special need for it please open an issue.\n" + ) + + +# don't allow those stdlib mofos to mess with sigint handler +pdbpp.pdb.Pdb.sigint_handler = handler + + +@contextmanager +def _disable_sigint(): + try: + # disable sigint handling while in debug + prior_handler = signal.signal(signal.SIGINT, handler) + yield + finally: + # restore SIGINT handling + signal.signal(signal.SIGINT, prior_handler) + + async def _hijack_stdin_relay_to_child( subactor_uid: Tuple[str, str] ) -> None: + actor = tractor.current_actor() + nursery = actor._actoruid2nursery[subactor_uid] + print(f'NURSERY: {nursery}') + print(f'nursery is cancelled {nursery.cancelled}') + if actor._is_cancelled or nursery.cancelled: + raise RuntimeError( + "Can not engage debugger actor is already cancelled") + + await trio.sleep(0) + # TODO: when we get to true remote debugging # this will deliver stdin data - log.debug(f"Actor {subactor_uid} is waiting on stdin hijack lock") + log.warning(f"Actor {subactor_uid} is WAITING on stdin hijack lock") async with _acquire_debug_lock(): - log.warning(f"Actor {subactor_uid} acquired stdin hijack lock") - # indicate to child that we've locked stdio - yield 'Locked' + log.warning(f"Actor {subactor_uid} ACQUIRED stdin hijack lock") - # wait for cancellation of stream by child - await trio.sleep_forever() + with _disable_sigint(): + # indicate to child that we've locked stdio + yield 'Locked' - log.debug(f"Actor {subactor_uid} released stdin hijack lock") + # wait for cancellation of stream by child + await trio.sleep_forever() + + log.debug(f"Actor {subactor_uid} RELEASED stdin hijack lock") # XXX: We only make this sync in case someone wants to @@ -137,6 +183,7 @@ def _breakpoint(debug_func) -> Awaitable[None]: """``tractor`` breakpoint entry for engaging pdb machinery in subactors. """ + global _in_debug actor = tractor.current_actor() do_unlock = trio.Event() @@ -147,7 +194,7 @@ def _breakpoint(debug_func) -> Awaitable[None]: async with tractor._portal.open_portal( actor._parent_chan, start_msg_loop=False, - shield=True, + # shield=True, ) as portal: with trio.fail_after(1): agen = await portal.run( @@ -156,34 +203,48 @@ def _breakpoint(debug_func) -> Awaitable[None]: subactor_uid=actor.uid, ) async with aclosing(agen): + + # block until first yield above async for val in agen: + assert val == 'Locked' task_status.started() - with trio.CancelScope(shield=True): - await do_unlock.wait() - # trigger cancellation of remote stream - break + # with trio.CancelScope(shield=True): + await do_unlock.wait() + + # trigger cancellation of remote stream + break finally: log.debug(f"Exiting debugger for actor {actor}") - actor.statespace['_in_debug'] = False + global _in_debug + _in_debug = False + # actor.statespace['_in_debug'] = False log.debug(f"Child {actor} released parent stdio lock") async def _bp(): """Async breakpoint which schedules a parent stdio lock, and once complete enters the ``pdbpp`` debugging console. """ - in_debug = actor.statespace.setdefault('_in_debug', False) + global _in_debug + # in_debug = actor.statespace.setdefault('_in_debug', False) - if in_debug: - log.warning(f"Actor {actor} already has a debug lock, skipping...") - return + if _in_debug: + # if **this** actor is already in debug mode block here + # waiting for the control to be released - this allows + # support for recursive entries to `tractor.breakpoint()` + log.warning( + f"Actor {actor.uid} already has a debug lock, waiting...") + await do_unlock.wait() + await trio.sleep(0.1) + # return # assign unlock callback for debugger teardown hooks global _pdb_release_hook _pdb_release_hook = do_unlock.set - actor.statespace['_in_debug'] = True + # actor.statespace['_in_debug'] = True + _in_debug = True # TODO: need a more robust check for the "root" actor if actor._parent_chan: @@ -231,6 +292,7 @@ breakpoint = partial( def _post_mortem(actor): log.error(f"\nAttaching to pdb in crashed actor: {actor.uid}\n") + # custom Pdb post-mortem entry pdbpp.xpm(Pdb=PdbwTeardown) @@ -250,6 +312,7 @@ async def _maybe_enter_pm(err): # there's races between when the parent actor has killed all # comms and when the child tries to contact said parent to # acquire the tty lock. + # Really we just want to mostly avoid catching KBIs here so there # might be a simpler check we can do? and trio.MultiError.filter( From 363498b8826c20da956eccab4a3dc62d062dbbab Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 28 Sep 2020 09:24:36 -0400 Subject: [PATCH 21/61] Disable SIGINT handling in child processes There seems to be no good reason not too since our cancellation machinery/protocol should do this work when the root receives the signal. This also (hopefully) helps with some debugging race condition stuff. --- tractor/_entry.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tractor/_entry.py b/tractor/_entry.py index 71dfdec..9099fc0 100644 --- a/tractor/_entry.py +++ b/tractor/_entry.py @@ -3,6 +3,7 @@ Process entry points. """ from functools import partial from typing import Tuple, Any +import signal import trio # type: ignore @@ -57,6 +58,10 @@ def _trio_main( ) -> None: """Entry point for a `trio_run_in_process` subactor. """ + # Disable sigint handling in children; + # we don't need it thanks to our cancellation machinery. + signal.signal(signal.SIGINT, signal.SIG_IGN) + # TODO: make a global func to set this or is it too hacky? # os.environ['PYTHONBREAKPOINT'] = 'tractor._debug.breakpoint' From 25e93925b06ab3c0332c605c1a6ad3b220668bd1 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 28 Sep 2020 13:02:33 -0400 Subject: [PATCH 22/61] Add a cancel scope around child debugger requests This is needed in order to avoid the deadlock condition where a child actor is waiting on the root actor's tty lock but it's parent (possibly the root) is waiting on it to terminate after sending a cancel request. The solution is simple: create a cancel scope around the request in the child and always cancel it when a cancel request from the parent arrives. --- tractor/_actor.py | 14 ++++++-- tractor/_debug.py | 82 ++++++++++++++++++++++------------------------- 2 files changed, 50 insertions(+), 46 deletions(-) diff --git a/tractor/_actor.py b/tractor/_actor.py index e5b8ef0..045ebb2 100644 --- a/tractor/_actor.py +++ b/tractor/_actor.py @@ -27,7 +27,7 @@ from ._exceptions import ( unpack_error, ModuleNotExposed ) -from ._debug import _maybe_enter_pm +from . import _debug from ._discovery import get_arbiter from ._portal import Portal from . import _state @@ -129,7 +129,7 @@ async def _invoke( except (Exception, trio.MultiError) as err: # NOTE: don't enter debug mode recursively after quitting pdb log.exception("Actor crashed:") - await _maybe_enter_pm(err) + await _debug._maybe_enter_pm(err) # always ship errors back to caller err_msg = pack_error(err) @@ -832,6 +832,14 @@ class Actor: # cancel all ongoing rpc tasks with trio.CancelScope(shield=True): + + # kill any debugger request task to avoid deadlock + # with the root actor in this tree + dbcs = _debug._debugger_request_cs + if dbcs is not None: + log.debug("Cancelling active debugger request") + dbcs.cancel() + # kill all ongoing tasks await self.cancel_rpc_tasks() @@ -1051,7 +1059,7 @@ async def _start_actor( result = await main() except (Exception, trio.MultiError) as err: log.exception("Actor crashed:") - await _maybe_enter_pm(err) + await _debug._maybe_enter_pm(err) raise # XXX: the actor is cancelled when this context is complete diff --git a/tractor/_debug.py b/tractor/_debug.py index 4d43025..99814ea 100644 --- a/tractor/_debug.py +++ b/tractor/_debug.py @@ -11,6 +11,7 @@ import signal from async_generator import aclosing import tractor import trio +# from trio.testing import wait_all_tasks_blocked from .log import get_logger from . import _state @@ -39,6 +40,11 @@ _in_debug = False # lock in root actor preventing multi-access to local tty _debug_lock = trio.StrictFIFOLock() +# XXX: set by the current task waiting on the root tty lock +# and must be cancelled if this actor is cancelled via message +# otherwise deadlocks with the parent actor may ensure +_debugger_request_cs: trio.CancelScope = None + class TractorConfig(pdbpp.DefaultConfig): """Custom ``pdbpp`` goodness. @@ -119,8 +125,7 @@ async def _acquire_debug_lock(): log.error("TTY lock acquired") yield finally: - if _debug_lock.locked(): - _debug_lock.release() + _debug_lock.release() log.error("TTY lock released") @@ -151,16 +156,6 @@ def _disable_sigint(): async def _hijack_stdin_relay_to_child( subactor_uid: Tuple[str, str] ) -> None: - actor = tractor.current_actor() - nursery = actor._actoruid2nursery[subactor_uid] - print(f'NURSERY: {nursery}') - print(f'nursery is cancelled {nursery.cancelled}') - if actor._is_cancelled or nursery.cancelled: - raise RuntimeError( - "Can not engage debugger actor is already cancelled") - - await trio.sleep(0) - # TODO: when we get to true remote debugging # this will deliver stdin data log.warning(f"Actor {subactor_uid} is WAITING on stdin hijack lock") @@ -172,6 +167,7 @@ async def _hijack_stdin_relay_to_child( yield 'Locked' # wait for cancellation of stream by child + # indicating debugger is dis-engaged await trio.sleep_forever() log.debug(f"Actor {subactor_uid} RELEASED stdin hijack lock") @@ -190,37 +186,40 @@ def _breakpoint(debug_func) -> Awaitable[None]: async def wait_for_parent_stdin_hijack( task_status=trio.TASK_STATUS_IGNORED ): - try: - async with tractor._portal.open_portal( - actor._parent_chan, - start_msg_loop=False, - # shield=True, - ) as portal: - with trio.fail_after(1): - agen = await portal.run( - 'tractor._debug', - '_hijack_stdin_relay_to_child', - subactor_uid=actor.uid, - ) - async with aclosing(agen): + global _debugger_request_cs + with trio.CancelScope() as cs: + _debugger_request_cs = cs + try: + async with tractor._portal.open_portal( + actor._parent_chan, + start_msg_loop=False, + # shield=True, + ) as portal: + with trio.fail_after(1): + agen = await portal.run( + 'tractor._debug', + '_hijack_stdin_relay_to_child', + subactor_uid=actor.uid, + ) + async with aclosing(agen): - # block until first yield above - async for val in agen: + # block until first yield above + async for val in agen: - assert val == 'Locked' - task_status.started() + assert val == 'Locked' + task_status.started() - # with trio.CancelScope(shield=True): - await do_unlock.wait() + # with trio.CancelScope(shield=True): + await do_unlock.wait() - # trigger cancellation of remote stream - break - finally: - log.debug(f"Exiting debugger for actor {actor}") - global _in_debug - _in_debug = False - # actor.statespace['_in_debug'] = False - log.debug(f"Child {actor} released parent stdio lock") + # trigger cancellation of remote stream + break + finally: + log.debug(f"Exiting debugger for actor {actor}") + global _in_debug + _in_debug = False + # actor.statespace['_in_debug'] = False + log.debug(f"Child {actor} released parent stdio lock") async def _bp(): """Async breakpoint which schedules a parent stdio lock, and once complete @@ -237,7 +236,6 @@ def _breakpoint(debug_func) -> Awaitable[None]: f"Actor {actor.uid} already has a debug lock, waiting...") await do_unlock.wait() await trio.sleep(0.1) - # return # assign unlock callback for debugger teardown hooks global _pdb_release_hook @@ -253,9 +251,6 @@ def _breakpoint(debug_func) -> Awaitable[None]: # being restricted by the scope of a new task nursery. await actor._service_n.start(wait_for_parent_stdin_hijack) - # block here one (at the appropriate frame *up* where - # ``breakpoint()`` was awaited and begin handling stdio - # debug_func(actor) else: # we also wait in the root-parent for any child that # may have the tty locked prior @@ -270,6 +265,7 @@ def _breakpoint(debug_func) -> Awaitable[None]: # block here one (at the appropriate frame *up* where # ``breakpoint()`` was awaited and begin handling stdio + log.debug("Entering the synchronous world of the debugger") debug_func(actor) # user code **must** await this! From 5dd2d35fc5ca34f803d9720806b7b7a87473434f Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 28 Sep 2020 13:11:22 -0400 Subject: [PATCH 23/61] Huh, maybe we don't need to block SIGINT Seems like the request task cancel scope is actually solving all the deadlock issues and masking SIGINT isn't changing much behaviour at all. I think let's keep it unmasked for now in case it does turn out useful in cancelling from unrecoverable states while in debug. --- tractor/_debug.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/tractor/_debug.py b/tractor/_debug.py index 99814ea..b9ed5a2 100644 --- a/tractor/_debug.py +++ b/tractor/_debug.py @@ -142,15 +142,15 @@ def handler(signum, frame): pdbpp.pdb.Pdb.sigint_handler = handler -@contextmanager -def _disable_sigint(): - try: - # disable sigint handling while in debug - prior_handler = signal.signal(signal.SIGINT, handler) - yield - finally: - # restore SIGINT handling - signal.signal(signal.SIGINT, prior_handler) +# @contextmanager +# def _disable_sigint(): +# try: +# # disable sigint handling while in debug +# prior_handler = signal.signal(signal.SIGINT, handler) +# yield +# finally: +# # restore SIGINT handling +# signal.signal(signal.SIGINT, prior_handler) async def _hijack_stdin_relay_to_child( @@ -162,13 +162,14 @@ async def _hijack_stdin_relay_to_child( async with _acquire_debug_lock(): log.warning(f"Actor {subactor_uid} ACQUIRED stdin hijack lock") - with _disable_sigint(): - # indicate to child that we've locked stdio - yield 'Locked' + # with _disable_sigint(): - # wait for cancellation of stream by child - # indicating debugger is dis-engaged - await trio.sleep_forever() + # indicate to child that we've locked stdio + yield 'Locked' + + # wait for cancellation of stream by child + # indicating debugger is dis-engaged + await trio.sleep_forever() log.debug(f"Actor {subactor_uid} RELEASED stdin hijack lock") @@ -218,7 +219,6 @@ def _breakpoint(debug_func) -> Awaitable[None]: log.debug(f"Exiting debugger for actor {actor}") global _in_debug _in_debug = False - # actor.statespace['_in_debug'] = False log.debug(f"Child {actor} released parent stdio lock") async def _bp(): @@ -226,7 +226,6 @@ def _breakpoint(debug_func) -> Awaitable[None]: enters the ``pdbpp`` debugging console. """ global _in_debug - # in_debug = actor.statespace.setdefault('_in_debug', False) if _in_debug: # if **this** actor is already in debug mode block here @@ -241,7 +240,8 @@ def _breakpoint(debug_func) -> Awaitable[None]: global _pdb_release_hook _pdb_release_hook = do_unlock.set - # actor.statespace['_in_debug'] = True + # mark local actor as "in debug mode" to avoid recurrent + # entries/requests to the root process _in_debug = True # TODO: need a more robust check for the "root" actor From d7a472c7f2d85139ec3169fcf76278f2c52dffb4 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 28 Sep 2020 13:13:53 -0400 Subject: [PATCH 24/61] Update our debugging example to wait on results --- debugging/mp_debug.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/debugging/mp_debug.py b/debugging/mp_debug.py index a279715..fff92d3 100644 --- a/debugging/mp_debug.py +++ b/debugging/mp_debug.py @@ -9,7 +9,7 @@ async def bubble(): await tractor.breakpoint() -async def bail(): +async def name_error(): getattr(doggy) @@ -19,13 +19,21 @@ async def main(): async with tractor.open_nursery() as n: portal1 = await n.run_in_actor('bubble', bubble) - portal = await n.run_in_actor('bail', bail) - # await portal.result() - # await portal1.result() + portal = await n.run_in_actor('name_error', name_error) + await portal1.result() + await portal.result() # The ``async with`` will unblock here since the 'some_linguist' # actor has completed its main task ``cellar_door``. +# TODO: +# - recurrent entry from single actor +# - recurrent entry to breakpoint() from single actor *after* and an +# error +# - root error alongside child errors +# - recurrent root errors + + if __name__ == '__main__': - tractor.run(main, loglevel='error', debug_mode=True) + tractor.run(main, loglevel='info', debug_mode=True) From fc2cb610b90302de1b453df1369976131fb9af59 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 28 Sep 2020 13:49:45 -0400 Subject: [PATCH 25/61] Make "hard kill" just a `Process.terminate()` It's not like any of this code is really being used anyway since we aren't indefinitely blocking for cancelled subactors to terminate (yet). Drop the `do_hard_kill()` bit for now and just rely on the underlying process api. Oh, and mark the nursery as cancelled asap. --- tractor/_trionics.py | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/tractor/_trionics.py b/tractor/_trionics.py index d51a59b..d78e110 100644 --- a/tractor/_trionics.py +++ b/tractor/_trionics.py @@ -137,19 +137,14 @@ class ActorNursery: If ``hard_killl`` is set to ``True`` then kill the processes directly without any far end graceful ``trio`` cancellation. """ - def do_hard_kill(proc): - log.warning(f"Hard killing subactors {self._children}") - proc.terminate() - # XXX: below doesn't seem to work? - # send KeyBoardInterrupt (trio abort signal) to sub-actors - # os.kill(proc.pid, signal.SIGINT) + self.cancelled = True - log.debug("Cancelling nursery") + log.critical("Cancelling nursery") with trio.move_on_after(3) as cs: async with trio.open_nursery() as nursery: for subactor, proc, portal in self._children.values(): if hard_kill: - do_hard_kill(proc) + proc.terminate() else: if portal is None: # actor hasn't fully spawned yet event = self._actor._peer_connected[subactor.uid] @@ -169,7 +164,7 @@ class ActorNursery: if chan: portal = Portal(chan) else: # there's no other choice left - do_hard_kill(proc) + proc.terminate() # spawn cancel tasks for each sub-actor assert portal @@ -178,13 +173,13 @@ class ActorNursery: # if we cancelled the cancel (we hung cancelling remote actors) # then hard kill all sub-processes if cs.cancelled_caught: - log.error(f"Failed to gracefully cancel {self}, hard killing!") - async with trio.open_nursery(): - for subactor, proc, portal in self._children.values(): - nursery.start_soon(do_hard_kill, proc) + log.error( + f"Failed to cancel {self}\nHard killing process tree!") + for subactor, proc, portal in self._children.values(): + log.warning(f"Hard killing process {proc}") + proc.terminate() # mark ourselves as having (tried to have) cancelled all subactors - self.cancelled = True self._join_procs.set() From 29ed065dc44dbe3f7b887be2ece77adeff24d080 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 28 Sep 2020 13:56:42 -0400 Subject: [PATCH 26/61] Ack our inability to hard kill sub-procs --- tractor/_portal.py | 3 ++- tractor/_spawn.py | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/tractor/_portal.py b/tractor/_portal.py index e749ec6..af3c1b5 100644 --- a/tractor/_portal.py +++ b/tractor/_portal.py @@ -278,7 +278,8 @@ class Portal: try: # send cancel cmd - might not get response # XXX: sure would be nice to make this work with a proper shield - # with trio.CancelScope(shield=True): + # with trio.CancelScope() as cancel_scope: + # with trio.CancelScope(shield=True) as cancel_scope: with trio.move_on_after(0.5) as cancel_scope: cancel_scope.shield = True await self.run('self', 'cancel') diff --git a/tractor/_spawn.py b/tractor/_spawn.py index 6d19614..2a2f602 100644 --- a/tractor/_spawn.py +++ b/tractor/_spawn.py @@ -188,8 +188,17 @@ async def spawn_subactor( # the outer scope since no actor zombies are # ever allowed. This ``__aexit__()`` also shields # internally. - async with proc: - log.debug(f"Terminating {proc}") + log.debug(f"Attempting to kill {proc}") + + # NOTE: this timeout effectively does nothing right now since + # we are shielding the ``.wait()`` inside ``new_proc()`` which + # will pretty much never release until the process exits. + with trio.move_on_after(3) as cs: + async with proc: + log.debug(f"Terminating {proc}") + if cs.cancelled_caught: + log.critical(f"HARD KILLING {proc}") + proc.kill() async def new_proc( @@ -354,6 +363,8 @@ async def new_proc( await proc_waiter(proc) proc.join() + # This is again common logic for all backends: + log.debug(f"Joined {proc}") # pop child entry to indicate we are no longer managing this subactor subactor, proc, portal = actor_nursery._children.pop(subactor.uid) From 0a2a94fee038075a3496a750f02fcb09221a5848 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sat, 3 Oct 2020 19:35:18 -0400 Subject: [PATCH 27/61] Add initial root actor debugger tests --- {debugging => examples/debugging}/mp_debug.py | 0 examples/debugging/root_actor_breakpoint.py | 15 +++ examples/debugging/root_actor_error.py | 9 ++ tests/test_debugger.py | 106 ++++++++++++++++++ tests/test_docs_examples.py | 15 ++- 5 files changed, 141 insertions(+), 4 deletions(-) rename {debugging => examples/debugging}/mp_debug.py (100%) create mode 100644 examples/debugging/root_actor_breakpoint.py create mode 100644 examples/debugging/root_actor_error.py create mode 100644 tests/test_debugger.py diff --git a/debugging/mp_debug.py b/examples/debugging/mp_debug.py similarity index 100% rename from debugging/mp_debug.py rename to examples/debugging/mp_debug.py diff --git a/examples/debugging/root_actor_breakpoint.py b/examples/debugging/root_actor_breakpoint.py new file mode 100644 index 0000000..bd4dcb1 --- /dev/null +++ b/examples/debugging/root_actor_breakpoint.py @@ -0,0 +1,15 @@ +import trio +import tractor + + +async def main(): + + await trio.sleep(0.1) + + await tractor.breakpoint() + + await trio.sleep(0.1) + + +if __name__ == '__main__': + tractor.run(main, debug_mode=True) diff --git a/examples/debugging/root_actor_error.py b/examples/debugging/root_actor_error.py new file mode 100644 index 0000000..7486699 --- /dev/null +++ b/examples/debugging/root_actor_error.py @@ -0,0 +1,9 @@ +import tractor + + +async def main(): + assert 0 + + +if __name__ == '__main__': + tractor.run(main, debug_mode=True) diff --git a/tests/test_debugger.py b/tests/test_debugger.py new file mode 100644 index 0000000..9d746c6 --- /dev/null +++ b/tests/test_debugger.py @@ -0,0 +1,106 @@ +""" +That native debug better work! +""" +from os import path + +import pytest +import pexpect + +from .test_docs_examples import repodir + + +def examples_dir(): + """Return the abspath to the examples directory. + """ + return path.join(repodir(), 'examples', 'debugging/') + + +def mk_cmd(ex_name: str) -> str: + """Generate a command suitable to pass to ``pexpect.spawn()``. + """ + return ' '.join( + ['python', + path.join(examples_dir(), f'{ex_name}.py')] + ) + + +def spawn( + cmd: str, + testdir, +) -> pexpect.spawn: + return testdir.spawn( + cmd=mk_cmd(cmd), + expect_timeout=3, + ) + + +@pytest.mark.parametrize( + 'user_in_out', + [ + ('c', 'AssertionError'), + ('q', 'AssertionError'), + ], + ids=lambda item: item[1], +) +def test_root_actor_error(testdir, user_in_out): + """Demonstrate crash handler entering pdbpp from basic error in root actor. + """ + user_input, expect_err_str = user_in_out + + child = spawn('root_actor_error', testdir) + + # scan for the pdbpp prompt + child.expect("\(Pdb\+\+\)") + + # make sure expected logging and error arrives + assert 'TTY lock acquired' in str(child.before) + assert 'AssertionError' in str(child.before) + + # send user command + child.sendline(user_input) + child.expect('\r\n') + child.expect('TTY lock released') + + # process should exit + child.expect(pexpect.EOF) + assert expect_err_str in str(child.before) + + +@pytest.mark.parametrize( + 'user_in_out', + [ + ('c', None), + ('q', 'bdb.BdbQuit'), + ], + ids=lambda item: f'{item[0]} -> {item[1]}', +) +def test_root_actor_bp(testdir, user_in_out): + """Demonstrate breakpoint from in root actor. + """ + user_input, expect_err_str = user_in_out + child = spawn('root_actor_breakpoint', testdir) + + # scan for the pdbpp prompt + child.expect("\(Pdb\+\+\)") + + assert 'Error' not in str(child.before) + + # send user command + child.sendline(user_input) + child.expect('\r\n') + + # process should exit + child.expect(pexpect.EOF) + + if expect_err_str is None: + assert 'Error' not in str(child.before) + else: + assert expect_err_str in str(child.before) + + +def test_subactor_error(testdir): + ... + + +def test_subactor_breakpoint(testdir): + ... diff --git a/tests/test_docs_examples.py b/tests/test_docs_examples.py index c6d5df8..67b34ee 100644 --- a/tests/test_docs_examples.py +++ b/tests/test_docs_examples.py @@ -85,15 +85,22 @@ def run_example_in_subproc(loglevel, testdir, arb_addr): @pytest.mark.parametrize( 'example_script', - [f for f in os.listdir(examples_dir()) if '__' not in f], + [ + f for f in os.listdir(examples_dir()) + if ( + ('__' not in f) and + ('debugging' not in f) + ) + ], ) def test_example(run_example_in_subproc, example_script): """Load and run scripts from this repo's ``examples/`` dir as a user would copy and pasing them into their editor. - On windows a little more "finessing" is done to make ``multiprocessing`` play nice: - we copy the ``__main__.py`` into the test directory and invoke the script as a module - with ``python -m test_example``. + On windows a little more "finessing" is done to make + ``multiprocessing`` play nice: we copy the ``__main__.py`` into the + test directory and invoke the script as a module with ``python -m + test_example``. """ ex_file = os.path.join(examples_dir(), example_script) with open(ex_file, 'r') as ex: From 9067bb2a41b0722ef33c2cfef9a9e3ceb5c7bcd8 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sat, 3 Oct 2020 19:36:04 -0400 Subject: [PATCH 28/61] Shorten arbiter contact timeout --- tractor/_actor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tractor/_actor.py b/tractor/_actor.py index 045ebb2..6517ed3 100644 --- a/tractor/_actor.py +++ b/tractor/_actor.py @@ -754,7 +754,7 @@ class Actor: self._arb_addr is not None ): failed = False - with trio.move_on_after(5) as cs: + with trio.move_on_after(0.5) as cs: cs.shield = True try: async with get_arbiter(*self._arb_addr) as arb_portal: From 73a32f7d9c48255f77a42f389b7c0dbfd3f88464 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 4 Oct 2020 09:54:36 -0400 Subject: [PATCH 29/61] Add initial subactor debug tests --- examples/debugging/subactor_breakpoint.py | 25 ++++++ examples/debugging/subactor_error.py | 16 ++++ tests/test_debugger.py | 100 +++++++++++++++++++--- 3 files changed, 128 insertions(+), 13 deletions(-) create mode 100644 examples/debugging/subactor_breakpoint.py create mode 100644 examples/debugging/subactor_error.py diff --git a/examples/debugging/subactor_breakpoint.py b/examples/debugging/subactor_breakpoint.py new file mode 100644 index 0000000..35db6cf --- /dev/null +++ b/examples/debugging/subactor_breakpoint.py @@ -0,0 +1,25 @@ +import trio +import tractor + + +async def breakpoint_forever(): + """Indefinitely re-enter debugger in child actor. + """ + while True: + await trio.sleep(0.1) + await tractor.breakpoint() + + +async def main(): + + async with tractor.open_nursery() as n: + + portal = await n.run_in_actor( + 'breakpoint_forever', + breakpoint_forever, + ) + await portal.result() + + +if __name__ == '__main__': + tractor.run(main, debug_mode=True) diff --git a/examples/debugging/subactor_error.py b/examples/debugging/subactor_error.py new file mode 100644 index 0000000..67ec1b6 --- /dev/null +++ b/examples/debugging/subactor_error.py @@ -0,0 +1,16 @@ +import tractor + + +async def name_error(): + getattr(doggypants) + + +async def main(): + async with tractor.open_nursery() as n: + + portal = await n.run_in_actor('name_error', name_error) + await portal.result() + + +if __name__ == '__main__': + tractor.run(main, debug_mode=True) diff --git a/tests/test_debugger.py b/tests/test_debugger.py index 9d746c6..a26f9ce 100644 --- a/tests/test_debugger.py +++ b/tests/test_debugger.py @@ -24,14 +24,19 @@ def mk_cmd(ex_name: str) -> str: ) +@pytest.fixture def spawn( - cmd: str, testdir, + arb_addr, ) -> pexpect.spawn: - return testdir.spawn( - cmd=mk_cmd(cmd), - expect_timeout=3, - ) + + def _spawn(cmd): + return testdir.spawn( + cmd=mk_cmd(cmd), + expect_timeout=3, + ) + + return _spawn @pytest.mark.parametrize( @@ -42,12 +47,12 @@ def spawn( ], ids=lambda item: item[1], ) -def test_root_actor_error(testdir, user_in_out): +def test_root_actor_error(spawn, user_in_out): """Demonstrate crash handler entering pdbpp from basic error in root actor. """ user_input, expect_err_str = user_in_out - child = spawn('root_actor_error', testdir) + child = spawn('root_actor_error') # scan for the pdbpp prompt child.expect("\(Pdb\+\+\)") @@ -74,11 +79,11 @@ def test_root_actor_error(testdir, user_in_out): ], ids=lambda item: f'{item[0]} -> {item[1]}', ) -def test_root_actor_bp(testdir, user_in_out): +def test_root_actor_bp(spawn, user_in_out): """Demonstrate breakpoint from in root actor. """ user_input, expect_err_str = user_in_out - child = spawn('root_actor_breakpoint', testdir) + child = spawn('root_actor_breakpoint') # scan for the pdbpp prompt child.expect("\(Pdb\+\+\)") @@ -98,9 +103,78 @@ def test_root_actor_bp(testdir, user_in_out): assert expect_err_str in str(child.before) -def test_subactor_error(testdir): - ... +def test_subactor_error(spawn): + child = spawn('subactor_error') + + # scan for the pdbpp prompt + child.expect("\(Pdb\+\+\)") + + before = str(child.before.decode()) + assert "Attaching to pdb in crashed actor: ('name_error'" in before + + # send user command + # (in this case it's the same for 'continue' vs. 'quit') + child.sendline('continue') + child.expect('\r\n') + + # the debugger should enter a second time in the nursery + # creating actor + + child.expect("\(Pdb\+\+\)") + + before = str(child.before.decode()) + + # root actor gets debugger engaged + assert "Attaching to pdb in crashed actor: ('arbiter'" in before + + # error is a remote error propagated from the subactor + assert "RemoteActorError: ('name_error'" in before + + child.sendline('c') + child.expect('\r\n') + + # process should exit + child.expect(pexpect.EOF) -def test_subactor_breakpoint(testdir): - ... +def test_subactor_breakpoint(spawn): + child = spawn('subactor_breakpoint') + + # scan for the pdbpp prompt + child.expect("\(Pdb\+\+\)") + + before = str(child.before.decode()) + assert "Attaching pdb to actor: ('breakpoint_forever'" in before + + # do some "next" commands to demonstrate recurrent breakpoint + # entries + for _ in range(10): + child.sendline('next') + child.expect("\(Pdb\+\+\)") + + # now run some "continues" to show re-entries + for _ in range(5): + child.sendline('continue') + child.expect("\(Pdb\+\+\)") + before = str(child.before.decode()) + assert "Attaching pdb to actor: ('breakpoint_forever'" in before + + # finally quit the loop + child.sendline('q') + + # child process should exit but parent will capture pdb.BdbQuit + child.expect("\(Pdb\+\+\)") + + before = str(child.before.decode()) + assert "RemoteActorError: ('breakpoint_forever'" in before + assert 'bdb.BdbQuit' in before + + # quit the parent + child.sendline('c') + + # process should exit + child.expect(pexpect.EOF) + + before = str(child.before.decode()) + assert "RemoteActorError: ('breakpoint_forever'" in before + assert 'bdb.BdbQuit' in before From a2151cdd4d4091e60db49b7fef612677141f5102 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 4 Oct 2020 09:55:34 -0400 Subject: [PATCH 30/61] Allow re-entrant breakpoints during pdb stepping --- tractor/_debug.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tractor/_debug.py b/tractor/_debug.py index b9ed5a2..a0a5fc6 100644 --- a/tractor/_debug.py +++ b/tractor/_debug.py @@ -180,7 +180,6 @@ def _breakpoint(debug_func) -> Awaitable[None]: """``tractor`` breakpoint entry for engaging pdb machinery in subactors. """ - global _in_debug actor = tractor.current_actor() do_unlock = trio.Event() @@ -196,7 +195,7 @@ def _breakpoint(debug_func) -> Awaitable[None]: start_msg_loop=False, # shield=True, ) as portal: - with trio.fail_after(1): + with trio.fail_after(.5): agen = await portal.run( 'tractor._debug', '_hijack_stdin_relay_to_child', @@ -225,9 +224,15 @@ def _breakpoint(debug_func) -> Awaitable[None]: """Async breakpoint which schedules a parent stdio lock, and once complete enters the ``pdbpp`` debugging console. """ - global _in_debug + task_name = trio.lowlevel.current_task() + + global _in_debug + if _in_debug : + if _in_debug == task_name: + # this task already has the lock and is + # likely recurrently entering a breakpoint + return - if _in_debug: # if **this** actor is already in debug mode block here # waiting for the control to be released - this allows # support for recursive entries to `tractor.breakpoint()` @@ -242,7 +247,7 @@ def _breakpoint(debug_func) -> Awaitable[None]: # mark local actor as "in debug mode" to avoid recurrent # entries/requests to the root process - _in_debug = True + _in_debug = task_name # TODO: need a more robust check for the "root" actor if actor._parent_chan: From e387e8b3221741ac7caab2983601bd421a8593a0 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 4 Oct 2020 17:31:02 -0400 Subject: [PATCH 31/61] Add a multi-subactor test with nesting --- examples/debugging/mp_debug.py | 39 ----------- examples/debugging/multi_subactors.py | 51 ++++++++++++++ tests/test_debugger.py | 95 ++++++++++++++++++++++++--- 3 files changed, 137 insertions(+), 48 deletions(-) delete mode 100644 examples/debugging/mp_debug.py create mode 100644 examples/debugging/multi_subactors.py diff --git a/examples/debugging/mp_debug.py b/examples/debugging/mp_debug.py deleted file mode 100644 index fff92d3..0000000 --- a/examples/debugging/mp_debug.py +++ /dev/null @@ -1,39 +0,0 @@ -import tractor -import trio - - -async def bubble(): - print('IN BUBBLE') - while True: - await trio.sleep(.1) - await tractor.breakpoint() - - -async def name_error(): - getattr(doggy) - - -async def main(): - """The main ``tractor`` routine. - """ - async with tractor.open_nursery() as n: - - portal1 = await n.run_in_actor('bubble', bubble) - portal = await n.run_in_actor('name_error', name_error) - await portal1.result() - await portal.result() - - # The ``async with`` will unblock here since the 'some_linguist' - # actor has completed its main task ``cellar_door``. - - -# TODO: -# - recurrent entry from single actor -# - recurrent entry to breakpoint() from single actor *after* and an -# error -# - root error alongside child errors -# - recurrent root errors - - -if __name__ == '__main__': - tractor.run(main, loglevel='info', debug_mode=True) diff --git a/examples/debugging/multi_subactors.py b/examples/debugging/multi_subactors.py new file mode 100644 index 0000000..97de906 --- /dev/null +++ b/examples/debugging/multi_subactors.py @@ -0,0 +1,51 @@ +import tractor +import trio + + +async def breakpoint_forever(): + "Indefinitely re-enter debugger in child actor." + while True: + await trio.sleep(0.1) + await tractor.breakpoint() + + +async def name_error(): + "Raise a ``NameError``" + getattr(doggypants) + + +async def spawn_error(): + """"A nested nursery that triggers another ``NameError``. + """ + async with tractor.open_nursery() as n: + portal = await n.run_in_actor('name_error_1', name_error) + return await portal.result() + + +async def main(): + """The main ``tractor`` routine. + + The process tree should look as approximately as follows: + + -python examples/debugging/multi_subactors.py + |-python -m tractor._child --uid ('name_error', 'a7caf490 ...) + |-python -m tractor._child --uid ('bp_forever', '1f787a7e ...) + `-python -m tractor._child --uid ('spawn_error', '52ee14a5 ...) + `-python -m tractor._child --uid ('name_error', '3391222c ...) + """ + async with tractor.open_nursery() as n: + + # spawn both actors + portal1 = await n.run_in_actor('bp_forever', breakpoint_forever) + portal = await n.run_in_actor('name_error', name_error) + portal2 = await n.run_in_actor('spawn_error', spawn_error) + + # attempt to collect results (which raises error in parent) + # still has some issues where the parent seems to get stuck + # await portal.result() + # await portal1.result() + # await portal2.result() + + +if __name__ == '__main__': + tractor.run(main, debug_mode=True) diff --git a/tests/test_debugger.py b/tests/test_debugger.py index a26f9ce..de76247 100644 --- a/tests/test_debugger.py +++ b/tests/test_debugger.py @@ -9,6 +9,17 @@ import pexpect from .test_docs_examples import repodir +# TODO: +# - recurrent entry from single actor +# - recurrent entry to breakpoint() from single actor *after* and an +# error +# - root error before child errors +# - root error after child errors +# - root error before child breakpoint +# - root error after child breakpoint +# - recurrent root errors + + def examples_dir(): """Return the abspath to the examples directory. """ @@ -45,7 +56,7 @@ def spawn( ('c', 'AssertionError'), ('q', 'AssertionError'), ], - ids=lambda item: item[1], + ids=lambda item: f'{item[0]} -> {item[1]}', ) def test_root_actor_error(spawn, user_in_out): """Demonstrate crash handler entering pdbpp from basic error in root actor. @@ -55,7 +66,7 @@ def test_root_actor_error(spawn, user_in_out): child = spawn('root_actor_error') # scan for the pdbpp prompt - child.expect("\(Pdb\+\+\)") + child.expect(r"\(Pdb\+\+\)") # make sure expected logging and error arrives assert 'TTY lock acquired' in str(child.before) @@ -86,7 +97,7 @@ def test_root_actor_bp(spawn, user_in_out): child = spawn('root_actor_breakpoint') # scan for the pdbpp prompt - child.expect("\(Pdb\+\+\)") + child.expect(r"\(Pdb\+\+\)") assert 'Error' not in str(child.before) @@ -104,10 +115,12 @@ def test_root_actor_bp(spawn, user_in_out): def test_subactor_error(spawn): + "Single subactor raising an error" + child = spawn('subactor_error') # scan for the pdbpp prompt - child.expect("\(Pdb\+\+\)") + child.expect(r"\(Pdb\+\+\)") before = str(child.before.decode()) assert "Attaching to pdb in crashed actor: ('name_error'" in before @@ -120,7 +133,7 @@ def test_subactor_error(spawn): # the debugger should enter a second time in the nursery # creating actor - child.expect("\(Pdb\+\+\)") + child.expect(r"\(Pdb\+\+\)") before = str(child.before.decode()) @@ -138,10 +151,12 @@ def test_subactor_error(spawn): def test_subactor_breakpoint(spawn): + "Single subactor with an infinite breakpoint loop" + child = spawn('subactor_breakpoint') # scan for the pdbpp prompt - child.expect("\(Pdb\+\+\)") + child.expect(r"\(Pdb\+\+\)") before = str(child.before.decode()) assert "Attaching pdb to actor: ('breakpoint_forever'" in before @@ -150,12 +165,12 @@ def test_subactor_breakpoint(spawn): # entries for _ in range(10): child.sendline('next') - child.expect("\(Pdb\+\+\)") + child.expect(r"\(Pdb\+\+\)") # now run some "continues" to show re-entries for _ in range(5): child.sendline('continue') - child.expect("\(Pdb\+\+\)") + child.expect(r"\(Pdb\+\+\)") before = str(child.before.decode()) assert "Attaching pdb to actor: ('breakpoint_forever'" in before @@ -163,7 +178,7 @@ def test_subactor_breakpoint(spawn): child.sendline('q') # child process should exit but parent will capture pdb.BdbQuit - child.expect("\(Pdb\+\+\)") + child.expect(r"\(Pdb\+\+\)") before = str(child.before.decode()) assert "RemoteActorError: ('breakpoint_forever'" in before @@ -178,3 +193,65 @@ def test_subactor_breakpoint(spawn): before = str(child.before.decode()) assert "RemoteActorError: ('breakpoint_forever'" in before assert 'bdb.BdbQuit' in before + + +def test_multi_subactors(spawn): + """Multiple subactors, both erroring and breakpointing as well as + a nested subactor erroring. + """ + child = spawn(r'multi_subactors') + + # scan for the pdbpp prompt + child.expect(r"\(Pdb\+\+\)") + + before = str(child.before.decode()) + assert "Attaching pdb to actor: ('bp_forever'" in before + + # do some "next" commands to demonstrate recurrent breakpoint + # entries + for _ in range(10): + child.sendline('next') + child.expect(r"\(Pdb\+\+\)") + + # continue to next error + child.sendline('c') + + # first name_error failure + child.expect(r"\(Pdb\+\+\)") + before = str(child.before.decode()) + assert "NameError" in before + + # continue again + child.sendline('c') + + # 2nd name_error failure + child.expect(r"\(Pdb\+\+\)") + before = str(child.before.decode()) + assert "NameError" in before + + # breakpoint loop should re-engage + child.sendline('c') + child.expect(r"\(Pdb\+\+\)") + before = str(child.before.decode()) + assert "Attaching pdb to actor: ('bp_forever'" in before + + # now run some "continues" to show re-entries + for _ in range(5): + child.sendline('c') + child.expect(r"\(Pdb\+\+\)") + + # quit the loop and expect parent to attach + child.sendline('q') + child.expect(r"\(Pdb\+\+\)") + before = str(child.before.decode()) + assert "Attaching to pdb in crashed actor: ('arbiter'" in before + assert "RemoteActorError: ('bp_forever'" in before + assert 'bdb.BdbQuit' in before + + # process should exit + child.sendline('c') + child.expect(pexpect.EOF) + + before = str(child.before.decode()) + assert "RemoteActorError: ('bp_forever'" in before + assert 'bdb.BdbQuit' in before From 83a45119e9e6d103c57dedc6af225b9a9968dbca Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 4 Oct 2020 17:58:41 -0400 Subject: [PATCH 32/61] Add "root mailbox" contact info passing Every subactor in the tree now receives the socket (or whatever the mailbox type ends up being) during startup and can call the new `tractor._discovery.get_root()` function to get a portal to the current root actor in their tree. The main reason for adding this atm is to support nested child actors gaining access to the root's tty lock for debugging. Also, when a channel disconnects from a message loop, might as well kill all its rpc tasks. --- tractor/_actor.py | 31 +++++++++++++++++++++++++------ tractor/_debug.py | 18 ++++++++++-------- tractor/_discovery.py | 12 +++++++++++- tractor/_state.py | 1 + 4 files changed, 47 insertions(+), 15 deletions(-) diff --git a/tractor/_actor.py b/tractor/_actor.py index 6517ed3..b79106f 100644 --- a/tractor/_actor.py +++ b/tractor/_actor.py @@ -539,7 +539,9 @@ class Actor: f"Waiting on next msg for {chan} from {chan.uid}") else: # channel disconnect - log.debug(f"{chan} from {chan.uid} disconnected") + log.debug( + f"{chan} from {chan.uid} disconnected, cancelling all rpc tasks") + self.cancel_rpc_tasks(chan) except trio.ClosedResourceError: log.error(f"{chan} form {chan.uid} broke") @@ -676,6 +678,9 @@ class Actor: accept_port=port ) ) + accept_addr = self.accept_addr + if _state._runtime_vars['_is_root']: + _state._runtime_vars['_root_mailbox'] = accept_addr # Register with the arbiter if we're told its addr log.debug(f"Registering {self} for role `{self.name}`") @@ -686,7 +691,7 @@ class Actor: 'self', 'register_actor', uid=self.uid, - sockaddr=self.accept_addr, + sockaddr=accept_addr, ) registered_with_arbiter = True @@ -895,15 +900,23 @@ class Actor: f"Sucessfully cancelled task:\ncid: {cid}\nfunc: {func}\n" f"peer: {chan.uid}\n") - async def cancel_rpc_tasks(self) -> None: + async def cancel_rpc_tasks( + self, + only_chan: Optional[Channel] = None, + ) -> None: """Cancel all existing RPC responder tasks using the cancel scope registered for each. """ tasks = self._rpc_tasks log.info(f"Cancelling all {len(tasks)} rpc tasks:\n{tasks} ") for (chan, cid) in tasks.copy(): + if only_chan is not None: + if only_chan != chan: + continue + # TODO: this should really done in a nursery batch await self._cancel_task(cid, chan) + log.info( f"Waiting for remaining rpc tasks to complete {tasks}") await self._ongoing_rpc_tasks.wait() @@ -1058,9 +1071,15 @@ async def _start_actor( try: result = await main() except (Exception, trio.MultiError) as err: - log.exception("Actor crashed:") - await _debug._maybe_enter_pm(err) - raise + try: + log.exception("Actor crashed:") + await _debug._maybe_enter_pm(err) + + raise + + finally: + actor._service_n.cancel_scope.cancel() + await actor.cancel() # XXX: the actor is cancelled when this context is complete # given that there are no more active peer channels connected diff --git a/tractor/_debug.py b/tractor/_debug.py index a0a5fc6..2e1c107 100644 --- a/tractor/_debug.py +++ b/tractor/_debug.py @@ -15,6 +15,7 @@ import trio from .log import get_logger from . import _state +from ._discovery import get_root try: # wtf: only exported when installed in dev mode? @@ -190,11 +191,7 @@ def _breakpoint(debug_func) -> Awaitable[None]: with trio.CancelScope() as cs: _debugger_request_cs = cs try: - async with tractor._portal.open_portal( - actor._parent_chan, - start_msg_loop=False, - # shield=True, - ) as portal: + async with get_root() as portal: with trio.fail_after(.5): agen = await portal.run( 'tractor._debug', @@ -262,9 +259,14 @@ def _breakpoint(debug_func) -> Awaitable[None]: async def _lock( task_status=trio.TASK_STATUS_IGNORED ): - async with _acquire_debug_lock(): - task_status.started() - await do_unlock.wait() + try: + async with _acquire_debug_lock(): + task_status.started() + await do_unlock.wait() + finally: + global _in_debug + _in_debug = False + log.debug(f"{actor} released tty lock") await actor._service_n.start(_lock) diff --git a/tractor/_discovery.py b/tractor/_discovery.py index 0407896..13da85d 100644 --- a/tractor/_discovery.py +++ b/tractor/_discovery.py @@ -11,7 +11,7 @@ from ._portal import ( open_portal, LocalPortal, ) -from ._state import current_actor +from ._state import current_actor, _runtime_vars @asynccontextmanager @@ -37,6 +37,16 @@ async def get_arbiter( yield arb_portal +@asynccontextmanager +async def get_root( +**kwargs, +) -> typing.AsyncGenerator[Union[Portal, LocalPortal], None]: + host, port = _runtime_vars['_root_mailbox'] + async with _connect_chan(host, port) as chan: + async with open_portal(chan, **kwargs) as portal: + yield portal + + @asynccontextmanager async def find_actor( name: str, diff --git a/tractor/_state.py b/tractor/_state.py index 2a18a4c..2728b78 100644 --- a/tractor/_state.py +++ b/tractor/_state.py @@ -11,6 +11,7 @@ _current_actor: Optional['Actor'] = None # type: ignore _runtime_vars = { '_debug_mode': False, '_is_root': False, + '_root_mailbox': (None, None) } From 31c1a32d58c6c2252bc788fb26815dbbb7ae1da1 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 4 Oct 2020 19:39:00 -0400 Subject: [PATCH 33/61] Add re-entrant root breakpoint test; demonstrates a bug.. --- .../root_actor_breakpoint_forever.py | 11 +++++++++ tests/test_debugger.py | 23 ++++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 examples/debugging/root_actor_breakpoint_forever.py diff --git a/examples/debugging/root_actor_breakpoint_forever.py b/examples/debugging/root_actor_breakpoint_forever.py new file mode 100644 index 0000000..0332ab6 --- /dev/null +++ b/examples/debugging/root_actor_breakpoint_forever.py @@ -0,0 +1,11 @@ +import tractor + + +async def main(): + + while True: + await tractor.breakpoint() + + +if __name__ == '__main__': + tractor.run(main, debug_mode=True) diff --git a/tests/test_debugger.py b/tests/test_debugger.py index de76247..0770a14 100644 --- a/tests/test_debugger.py +++ b/tests/test_debugger.py @@ -114,6 +114,28 @@ def test_root_actor_bp(spawn, user_in_out): assert expect_err_str in str(child.before) +def test_root_actor_bp_forever(spawn): + "Re-enter a breakpoint from the root actor-task." + child = spawn('root_actor_breakpoint_forever') + + # do some "next" commands to demonstrate recurrent breakpoint + # entries + for _ in range(10): + child.sendline('next') + child.expect(r"\(Pdb\+\+\)") + + # do one continue which should trigger a new task to lock the tty + child.sendline('continue') + child.expect(r"\(Pdb\+\+\)") + + # XXX: this previously caused a bug! + child.sendline('n') + child.expect(r"\(Pdb\+\+\)") + + child.sendline('n') + child.expect(r"\(Pdb\+\+\)") + + def test_subactor_error(spawn): "Single subactor raising an error" @@ -128,7 +150,6 @@ def test_subactor_error(spawn): # send user command # (in this case it's the same for 'continue' vs. 'quit') child.sendline('continue') - child.expect('\r\n') # the debugger should enter a second time in the nursery # creating actor From d43d367153f731275de02f7336cb1276c5fbfe11 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 5 Oct 2020 10:07:06 -0400 Subject: [PATCH 34/61] Facepalm: tty locking from root doesn't require an extra task --- tractor/_actor.py | 1 - tractor/_debug.py | 80 ++++++++++++++++++++------------------------ tractor/_trionics.py | 2 +- 3 files changed, 38 insertions(+), 45 deletions(-) diff --git a/tractor/_actor.py b/tractor/_actor.py index b79106f..2813f8d 100644 --- a/tractor/_actor.py +++ b/tractor/_actor.py @@ -1078,7 +1078,6 @@ async def _start_actor( raise finally: - actor._service_n.cancel_scope.cancel() await actor.cancel() # XXX: the actor is cancelled when this context is complete diff --git a/tractor/_debug.py b/tractor/_debug.py index 2e1c107..a00594a 100644 --- a/tractor/_debug.py +++ b/tractor/_debug.py @@ -4,9 +4,9 @@ Multi-core debugging for da peeps! import bdb import sys from functools import partial -from contextlib import asynccontextmanager, contextmanager +from contextlib import asynccontextmanager from typing import Awaitable, Tuple, Optional, Callable -import signal +# import signal from async_generator import aclosing import tractor @@ -16,6 +16,7 @@ import trio from .log import get_logger from . import _state from ._discovery import get_root +from ._state import is_root_process try: # wtf: only exported when installed in dev mode? @@ -35,7 +36,7 @@ __all__ = ['breakpoint', 'post_mortem'] # placeholder for function to set a ``trio.Event`` on debugger exit _pdb_release_hook: Optional[Callable] = None -# actor-wide flag +# actor-wide variable pointing to current task name using debugger _in_debug = False # lock in root actor preventing multi-access to local tty @@ -120,14 +121,15 @@ async def _acquire_debug_lock(): """Acquire a actor local FIFO lock meant to mutex entry to a local debugger entry point to avoid tty clobbering by multiple processes. """ + task_name = trio.lowlevel.current_task() try: - log.error("TTY BEING ACQUIRED") + log.error(f"TTY BEING ACQUIRED by {task_name}") await _debug_lock.acquire() - log.error("TTY lock acquired") + log.error(f"TTY lock acquired by {task_name}") yield finally: _debug_lock.release() - log.error("TTY lock released") + log.error(f"TTY lock released by {task_name}") def handler(signum, frame): @@ -221,54 +223,46 @@ def _breakpoint(debug_func) -> Awaitable[None]: """Async breakpoint which schedules a parent stdio lock, and once complete enters the ``pdbpp`` debugging console. """ - task_name = trio.lowlevel.current_task() + task_name = trio.lowlevel.current_task().name global _in_debug - if _in_debug : - if _in_debug == task_name: - # this task already has the lock and is - # likely recurrently entering a breakpoint - return - - # if **this** actor is already in debug mode block here - # waiting for the control to be released - this allows - # support for recursive entries to `tractor.breakpoint()` - log.warning( - f"Actor {actor.uid} already has a debug lock, waiting...") - await do_unlock.wait() - await trio.sleep(0.1) - - # assign unlock callback for debugger teardown hooks - global _pdb_release_hook - _pdb_release_hook = do_unlock.set - - # mark local actor as "in debug mode" to avoid recurrent - # entries/requests to the root process - _in_debug = task_name # TODO: need a more robust check for the "root" actor - if actor._parent_chan: + if actor._parent_chan and not is_root_process(): + if _in_debug: + if _in_debug == task_name: + # this task already has the lock and is + # likely recurrently entering a breakpoint + return + + # if **this** actor is already in debug mode block here + # waiting for the control to be released - this allows + # support for recursive entries to `tractor.breakpoint()` + log.warning( + f"Actor {actor.uid} already has a debug lock, waiting...") + await do_unlock.wait() + await trio.sleep(0.1) + + # assign unlock callback for debugger teardown hooks + global _pdb_release_hook + _pdb_release_hook = do_unlock.set + + # mark local actor as "in debug mode" to avoid recurrent + # entries/requests to the root process + _in_debug = task_name + # this **must** be awaited by the caller and is done using the # root nursery so that the debugger can continue to run without # being restricted by the scope of a new task nursery. await actor._service_n.start(wait_for_parent_stdin_hijack) - else: + elif is_root_process(): # we also wait in the root-parent for any child that # may have the tty locked prior - async def _lock( - task_status=trio.TASK_STATUS_IGNORED - ): - try: - async with _acquire_debug_lock(): - task_status.started() - await do_unlock.wait() - finally: - global _in_debug - _in_debug = False - log.debug(f"{actor} released tty lock") - - await actor._service_n.start(_lock) + if _debug_lock.locked(): # root process already has it; ignore + return + await _debug_lock.acquire() + _pdb_release_hook = _debug_lock.release # block here one (at the appropriate frame *up* where # ``breakpoint()`` was awaited and begin handling stdio diff --git a/tractor/_trionics.py b/tractor/_trionics.py index d78e110..2d57ac6 100644 --- a/tractor/_trionics.py +++ b/tractor/_trionics.py @@ -139,7 +139,7 @@ class ActorNursery: """ self.cancelled = True - log.critical("Cancelling nursery") + log.warning(f"Cancelling nursery in {self._actor.uid}") with trio.move_on_after(3) as cs: async with trio.open_nursery() as nursery: for subactor, proc, portal in self._children.values(): From 371025947ab3ffa827ca247dfeecf62756e528d7 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 5 Oct 2020 11:38:39 -0400 Subject: [PATCH 35/61] Add a multi-subactor test where the root errors --- .../debugging/multi_subactor_root_errors.py | 43 +++++++++++++++++ examples/debugging/multi_subactors.py | 16 +++---- tests/test_debugger.py | 46 ++++++++++++++++--- 3 files changed, 89 insertions(+), 16 deletions(-) create mode 100644 examples/debugging/multi_subactor_root_errors.py diff --git a/examples/debugging/multi_subactor_root_errors.py b/examples/debugging/multi_subactor_root_errors.py new file mode 100644 index 0000000..f7a4372 --- /dev/null +++ b/examples/debugging/multi_subactor_root_errors.py @@ -0,0 +1,43 @@ +import tractor + + +async def name_error(): + "Raise a ``NameError``" + getattr(doggypants) + + +async def spawn_error(): + """"A nested nursery that triggers another ``NameError``. + """ + async with tractor.open_nursery() as n: + portal = await n.run_in_actor('name_error_1', name_error) + return await portal.result() + + +async def main(): + """The main ``tractor`` routine. + + The process tree should look as approximately as follows: + + -python examples/debugging/multi_subactors.py + |-python -m tractor._child --uid ('name_error', 'a7caf490 ...) + `-python -m tractor._child --uid ('spawn_error', '52ee14a5 ...) + `-python -m tractor._child --uid ('name_error', '3391222c ...) + """ + async with tractor.open_nursery() as n: + + # spawn both actors + portal = await n.run_in_actor('name_error', name_error) + portal1 = await n.run_in_actor('spawn_error', spawn_error) + + # trigger a root actor error + assert 0 + + # attempt to collect results (which raises error in parent) + # still has some issues where the parent seems to get stuck + await portal.result() + await portal1.result() + + +if __name__ == '__main__': + tractor.run(main, debug_mode=True) diff --git a/examples/debugging/multi_subactors.py b/examples/debugging/multi_subactors.py index 97de906..16ff22d 100644 --- a/examples/debugging/multi_subactors.py +++ b/examples/debugging/multi_subactors.py @@ -35,16 +35,12 @@ async def main(): """ async with tractor.open_nursery() as n: - # spawn both actors - portal1 = await n.run_in_actor('bp_forever', breakpoint_forever) - portal = await n.run_in_actor('name_error', name_error) - portal2 = await n.run_in_actor('spawn_error', spawn_error) - - # attempt to collect results (which raises error in parent) - # still has some issues where the parent seems to get stuck - # await portal.result() - # await portal1.result() - # await portal2.result() + # Spawn both actors, don't bother with collecting results + # (would result in a different debugger outcome due to parent's + # cancellation). + await n.run_in_actor('bp_forever', breakpoint_forever) + await n.run_in_actor('name_error', name_error) + await n.run_in_actor('spawn_error', spawn_error) if __name__ == '__main__': diff --git a/tests/test_debugger.py b/tests/test_debugger.py index 0770a14..8c87791 100644 --- a/tests/test_debugger.py +++ b/tests/test_debugger.py @@ -1,5 +1,8 @@ """ That native debug better work! + +All these tests can be understood (somewhat) by running the equivalent +`examples/debugging/` scripts manually. """ from os import path @@ -10,9 +13,8 @@ from .test_docs_examples import repodir # TODO: -# - recurrent entry from single actor # - recurrent entry to breakpoint() from single actor *after* and an -# error +# error in another task? # - root error before child errors # - root error after child errors # - root error before child breakpoint @@ -68,14 +70,14 @@ def test_root_actor_error(spawn, user_in_out): # scan for the pdbpp prompt child.expect(r"\(Pdb\+\+\)") + before = str(child.before.decode()) + # make sure expected logging and error arrives - assert 'TTY lock acquired' in str(child.before) - assert 'AssertionError' in str(child.before) + assert "Attaching to pdb in crashed actor: ('arbiter'" in before + assert 'AssertionError' in before # send user command child.sendline(user_input) - child.expect('\r\n') - child.expect('TTY lock released') # process should exit child.expect(pexpect.EOF) @@ -276,3 +278,35 @@ def test_multi_subactors(spawn): before = str(child.before.decode()) assert "RemoteActorError: ('bp_forever'" in before assert 'bdb.BdbQuit' in before + + +def test_multi_subactors_root_errors(spawn): + """Multiple subactors, both erroring and breakpointing as well as + a nested subactor erroring. + """ + child = spawn('multi_subactor_root_errors') + + # scan for the pdbpp prompt + child.expect(r"\(Pdb\+\+\)") + + # at most one subactor should attach before the root is cancelled + before = str(child.before.decode()) + assert "NameError: name 'doggypants' is not defined" in before + + # continue again + child.sendline('c') + child.expect(r"\(Pdb\+\+\)") + + # should now get attached in root with assert error + before = str(child.before.decode()) + # should have come just after priot prompt + assert "Cancelling nursery in ('spawn_error'," in before + assert "Attaching to pdb in crashed actor: ('arbiter'" in before + assert "AssertionError" in before + + # continue again + child.sendline('c') + child.expect(pexpect.EOF) + + before = str(child.before.decode()) + assert "AssertionError" in before From 2b53c74b1c174723e8c1efe58196ae53d3c66709 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 5 Oct 2020 11:42:13 -0400 Subject: [PATCH 36/61] Change to relative conftest.py imports --- tests/test_cancellation.py | 2 +- tests/test_discovery.py | 2 +- tests/test_local.py | 2 +- tests/test_multi_program.py | 2 +- tests/test_spawning.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_cancellation.py b/tests/test_cancellation.py index e6d460c..dbb0502 100644 --- a/tests/test_cancellation.py +++ b/tests/test_cancellation.py @@ -11,7 +11,7 @@ import pytest import trio import tractor -from conftest import tractor_test, no_windows +from .conftest import tractor_test, no_windows async def assert_err(delay=0): diff --git a/tests/test_discovery.py b/tests/test_discovery.py index 52a1dc8..7f071e2 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -11,7 +11,7 @@ import pytest import tractor import trio -from conftest import tractor_test +from .conftest import tractor_test @tractor_test diff --git a/tests/test_local.py b/tests/test_local.py index 0a594d0..c6a3569 100644 --- a/tests/test_local.py +++ b/tests/test_local.py @@ -7,7 +7,7 @@ import pytest import trio import tractor -from conftest import tractor_test +from .conftest import tractor_test @pytest.mark.trio diff --git a/tests/test_multi_program.py b/tests/test_multi_program.py index 12ca3ef..2d08f2f 100644 --- a/tests/test_multi_program.py +++ b/tests/test_multi_program.py @@ -6,7 +6,7 @@ import time import pytest import tractor -from conftest import ( +from .conftest import ( tractor_test, sig_prog, _INT_SIGNAL, diff --git a/tests/test_spawning.py b/tests/test_spawning.py index 220af3e..0800836 100644 --- a/tests/test_spawning.py +++ b/tests/test_spawning.py @@ -5,7 +5,7 @@ import pytest import trio import tractor -from conftest import tractor_test +from .conftest import tractor_test statespace = {'doggy': 10, 'kitty': 4} From abf8bb2813cb09364a293542506d17c55166acfc Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 6 Oct 2020 09:21:53 -0400 Subject: [PATCH 37/61] Add a deep nested error propagation test --- ...ed_subactors_error_up_through_nurseries.py | 58 +++++++++++++++++++ .../debugging/multi_subactor_root_errors.py | 4 +- tests/test_debugger.py | 21 +++++++ 3 files changed, 81 insertions(+), 2 deletions(-) create mode 100644 examples/debugging/multi_nested_subactors_error_up_through_nurseries.py diff --git a/examples/debugging/multi_nested_subactors_error_up_through_nurseries.py b/examples/debugging/multi_nested_subactors_error_up_through_nurseries.py new file mode 100644 index 0000000..488fffa --- /dev/null +++ b/examples/debugging/multi_nested_subactors_error_up_through_nurseries.py @@ -0,0 +1,58 @@ +import tractor + + +async def name_error(): + "Raise a ``NameError``" + getattr(doggypants) + + +async def breakpoint_forever(): + "Indefinitely re-enter debugger in child actor." + while True: + await tractor.breakpoint() + + +async def spawn_until(depth=0): + """"A nested nursery that triggers another ``NameError``. + """ + async with tractor.open_nursery() as n: + if depth < 1: + # await n.run_in_actor('breakpoint_forever', breakpoint_forever) + await n.run_in_actor('name_error', name_error) + else: + depth -= 1 + await n.run_in_actor(f'spawn_until_{depth}', spawn_until, depth=depth) + + +async def main(): + """The main ``tractor`` routine. + + The process tree should look as approximately as follows when the debugger + first engages: + + python examples/debugging/multi_nested_subactors_bp_forever.py + ├─ python -m tractor._child --uid ('spawner1', '7eab8462 ...) + │ └─ python -m tractor._child --uid ('spawn_until_3', 'afcba7a8 ...) + │ └─ python -m tractor._child --uid ('spawn_until_2', 'd2433d13 ...) + │ └─ python -m tractor._child --uid ('spawn_until_1', '1df589de ...) + │ └─ python -m tractor._child --uid ('spawn_until_0', '3720602b ...) + │ + └─ python -m tractor._child --uid ('spawner0', '1d42012b ...) + └─ python -m tractor._child --uid ('spawn_until_2', '2877e155 ...) + └─ python -m tractor._child --uid ('spawn_until_1', '0502d786 ...) + └─ python -m tractor._child --uid ('spawn_until_0', 'de918e6d ...) + + """ + async with tractor.open_nursery() as n: + + # spawn both actors + portal = await n.run_in_actor('spawner0', spawn_until, depth=3) + portal1 = await n.run_in_actor('spawner1', spawn_until, depth=4) + + # gah still an issue here. + # await portal.result() + # await portal1.result() + + +if __name__ == '__main__': + tractor.run(main, debug_mode=True) diff --git a/examples/debugging/multi_subactor_root_errors.py b/examples/debugging/multi_subactor_root_errors.py index f7a4372..05f0fa7 100644 --- a/examples/debugging/multi_subactor_root_errors.py +++ b/examples/debugging/multi_subactor_root_errors.py @@ -19,8 +19,8 @@ async def main(): The process tree should look as approximately as follows: - -python examples/debugging/multi_subactors.py - |-python -m tractor._child --uid ('name_error', 'a7caf490 ...) + python examples/debugging/multi_subactors.py + ├─ python -m tractor._child --uid ('name_error', 'a7caf490 ...) `-python -m tractor._child --uid ('spawn_error', '52ee14a5 ...) `-python -m tractor._child --uid ('name_error', '3391222c ...) """ diff --git a/tests/test_debugger.py b/tests/test_debugger.py index 8c87791..2540c2e 100644 --- a/tests/test_debugger.py +++ b/tests/test_debugger.py @@ -310,3 +310,24 @@ def test_multi_subactors_root_errors(spawn): before = str(child.before.decode()) assert "AssertionError" in before + + +def test_multi_nested_subactors_error_through_nurseries(spawn): + """Verify deeply nested actors that error trigger debugger entries + at each level up the tree. + """ + + # TODO: inside this script there's still a bug where if the parent + # errors before a 2 levels lower actor has released the lock, the + # parent tries to cancel it but it's stuck in the debugger? + + child = spawn('multi_nested_subactors_error_up_through_nurseries') + + for _ in range(12): + child.expect(r"\(Pdb\+\+\)") + child.sendline('c') + + child.expect(pexpect.EOF) + + before = str(child.before.decode()) + assert "NameError" in before From 07112089d045a99ca3f58196cd5b4a4bea8c5b44 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 7 Oct 2020 05:53:26 -0400 Subject: [PATCH 38/61] Add mention subactor uid during locking --- tractor/_debug.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tractor/_debug.py b/tractor/_debug.py index a00594a..04c9637 100644 --- a/tractor/_debug.py +++ b/tractor/_debug.py @@ -11,7 +11,7 @@ from typing import Awaitable, Tuple, Optional, Callable from async_generator import aclosing import tractor import trio -# from trio.testing import wait_all_tasks_blocked +from trio.testing import wait_all_tasks_blocked from .log import get_logger from . import _state @@ -117,19 +117,19 @@ class PdbwTeardown(pdbpp.Pdb): @asynccontextmanager -async def _acquire_debug_lock(): +async def _acquire_debug_lock(uid: Tuple[str, str]) -> None: """Acquire a actor local FIFO lock meant to mutex entry to a local debugger entry point to avoid tty clobbering by multiple processes. """ task_name = trio.lowlevel.current_task() try: - log.error(f"TTY BEING ACQUIRED by {task_name}") + log.error(f"TTY BEING ACQUIRED by {task_name}:{uid}") await _debug_lock.acquire() - log.error(f"TTY lock acquired by {task_name}") + log.error(f"TTY lock acquired by {task_name}:{uid}") yield finally: _debug_lock.release() - log.error(f"TTY lock released by {task_name}") + log.error(f"TTY lock released by {task_name}:{uid}") def handler(signum, frame): @@ -162,7 +162,7 @@ async def _hijack_stdin_relay_to_child( # TODO: when we get to true remote debugging # this will deliver stdin data log.warning(f"Actor {subactor_uid} is WAITING on stdin hijack lock") - async with _acquire_debug_lock(): + async with _acquire_debug_lock(subactor_uid): log.warning(f"Actor {subactor_uid} ACQUIRED stdin hijack lock") # with _disable_sigint(): @@ -266,9 +266,10 @@ def _breakpoint(debug_func) -> Awaitable[None]: # block here one (at the appropriate frame *up* where # ``breakpoint()`` was awaited and begin handling stdio - log.debug("Entering the synchronous world of the debugger") + log.debug("Entering the synchronous world of pdb") debug_func(actor) + # user code **must** await this! return _bp() @@ -288,7 +289,7 @@ breakpoint = partial( def _post_mortem(actor): - log.error(f"\nAttaching to pdb in crashed actor: {actor.uid}\n") + log.critical(f"\nAttaching to pdb in crashed actor: {actor.uid}\n") # custom Pdb post-mortem entry pdbpp.xpm(Pdb=PdbwTeardown) From acb4cb0b2be7b293b6aee501809380411b886f9c Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 6 Oct 2020 22:39:20 -0400 Subject: [PATCH 39/61] Add test showing issue with child in tty lock when cancelled --- ...root_cancelled_but_child_is_in_tty_lock.py | 48 +++++++++++++++++++ tests/test_debugger.py | 13 +++++ 2 files changed, 61 insertions(+) create mode 100644 examples/debugging/root_cancelled_but_child_is_in_tty_lock.py diff --git a/examples/debugging/root_cancelled_but_child_is_in_tty_lock.py b/examples/debugging/root_cancelled_but_child_is_in_tty_lock.py new file mode 100644 index 0000000..04ec767 --- /dev/null +++ b/examples/debugging/root_cancelled_but_child_is_in_tty_lock.py @@ -0,0 +1,48 @@ +import tractor + + +async def name_error(): + "Raise a ``NameError``" + getattr(doggypants) + + +async def spawn_until(depth=0): + """"A nested nursery that triggers another ``NameError``. + """ + async with tractor.open_nursery() as n: + if depth < 1: + # await n.run_in_actor('breakpoint_forever', breakpoint_forever) + await n.run_in_actor('name_error', name_error) + else: + depth -= 1 + await n.run_in_actor(f'spawn_until_{depth}', spawn_until, depth=depth) + + +async def main(): + """The main ``tractor`` routine. + + The process tree should look as approximately as follows when the debugger + first engages: + + python examples/debugging/multi_nested_subactors_bp_forever.py + ├─ python -m tractor._child --uid ('spawner1', '7eab8462 ...) + │ └─ python -m tractor._child --uid ('spawn_until_0', '3720602b ...) + │ └─ python -m tractor._child --uid ('name_error', '505bf71d ...) + │ + └─ python -m tractor._child --uid ('spawner0', '1d42012b ...) + └─ python -m tractor._child --uid ('name_error', '6c2733b8 ...) + + """ + async with tractor.open_nursery() as n: + + # spawn both actors + portal = await n.run_in_actor('spawner0', spawn_until, depth=0) + portal1 = await n.run_in_actor('spawner1', spawn_until, depth=1) + + # nursery cancellation should be triggered due to propagated error + await portal.result() + await portal1.result() + + +if __name__ == '__main__': + tractor.run(main, debug_mode=True, loglevel='warning') diff --git a/tests/test_debugger.py b/tests/test_debugger.py index 2540c2e..76113c9 100644 --- a/tests/test_debugger.py +++ b/tests/test_debugger.py @@ -331,3 +331,16 @@ def test_multi_nested_subactors_error_through_nurseries(spawn): before = str(child.before.decode()) assert "NameError" in before + + +def test_root_nursery_cancels_before_child_releases_tty_lock(spawn): + """Exemplifies a bug where the root sends a cancel message before a nested child + which has the tty lock (and is in pdb) doesn't cancel after exiting the debugger. + """ + child = spawn('root_cancelled_but_child_is_in_tty_lock') + + for _ in range(5): + child.expect(r"\(Pdb\+\+\)") + child.sendline('c') + + # child.expect(pexpect.EOF) From 0e344eead877c30bca4adad37db9bdb729694392 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 12 Oct 2020 08:56:49 -0400 Subject: [PATCH 40/61] Add a "cancel arrives during a sync sleep in child" test This appears to demonstrate the same bug found in #156. It looks like cancelling a subactor with a child, while that child is running sync code, can result in the child never getting cancelled due to some strange condition where the internal nurseries aren't being torn down as expected when a `trio.Cancelled` is raised. --- tests/test_cancellation.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tests/test_cancellation.py b/tests/test_cancellation.py index dbb0502..80232bf 100644 --- a/tests/test_cancellation.py +++ b/tests/test_cancellation.py @@ -413,3 +413,34 @@ def test_cancel_via_SIGINT_other_task( with pytest.raises(KeyboardInterrupt): tractor.run(main) + + + +async def spin_for(period=3): + "Sync sleep." + time.sleep(period) + + +async def spawn(): + async with tractor.open_nursery() as tn: + portal = await tn.run_in_actor('sleeper', spin_for) + + +def test_cancel_while_childs_child_in_sync_sleep( + loglevel, + start_method, + spawn_backend, +): + """Verify that a child cancelled while executing sync code is torn + down even when that cancellation is triggered by the parent + 2 nurseries "up". + """ + async def main(): + with trio.fail_after(2): + async with tractor.open_nursery() as tn: + portal = await tn.run_in_actor('spawn', spawn) + await trio.sleep(1) + assert 0 + + with pytest.raises(AssertionError): + tractor.run(main) From 79c38b04e712259dee1137f0fd2aa93384b3dc80 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 12 Oct 2020 23:28:36 -0400 Subject: [PATCH 41/61] Report `trio.Cancelled` when exhausting portals.. For reliable remote cancellation we need to "report" `trio.Cancelled`s (just like any other error) when exhausting a portal such that the caller can make decisions about cancelling the respective actor if need be. Resolves #156 --- tractor/_spawn.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tractor/_spawn.py b/tractor/_spawn.py index 2a2f602..2065967 100644 --- a/tractor/_spawn.py +++ b/tractor/_spawn.py @@ -112,6 +112,11 @@ async def exhaust_portal( except (Exception, trio.MultiError) as err: # we reraise in the parent task via a ``trio.MultiError`` return err + except trio.Cancelled as err: + # lol, of course we need this too ;P + # TODO: merge with above? + log.warning(f"Cancelled result waiter for {portal.actor.uid}") + return err else: log.debug(f"Returning final result: {final}") return final From a88a6ba7a3143d89d90748adad07377d206f3136 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 13 Oct 2020 00:36:34 -0400 Subject: [PATCH 42/61] Add pattern matching to test --- tests/test_debugger.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/tests/test_debugger.py b/tests/test_debugger.py index 76113c9..071febf 100644 --- a/tests/test_debugger.py +++ b/tests/test_debugger.py @@ -334,13 +334,30 @@ def test_multi_nested_subactors_error_through_nurseries(spawn): def test_root_nursery_cancels_before_child_releases_tty_lock(spawn): - """Exemplifies a bug where the root sends a cancel message before a nested child - which has the tty lock (and is in pdb) doesn't cancel after exiting the debugger. + """Test that when the root sends a cancel message before a nested + child has unblocked (which can happen when it has the tty lock and + is engaged in pdb) it is indeed cancelled after exiting the debugger. """ child = spawn('root_cancelled_but_child_is_in_tty_lock') - for _ in range(5): + child.expect(r"\(Pdb\+\+\)") + + before = str(child.before.decode()) + assert "NameError: name 'doggypants' is not defined" in before + assert "tractor._exceptions.RemoteActorError: ('name_error'" not in before + child.sendline('c') + + for _ in range(4): child.expect(r"\(Pdb\+\+\)") + + before = str(child.before.decode()) + assert "NameError: name 'doggypants' is not defined" in before + child.sendline('c') - # child.expect(pexpect.EOF) + + child.expect(pexpect.EOF) + before = str(child.before.decode()) + assert "tractor._exceptions.RemoteActorError: ('spawner0'" in before + assert "tractor._exceptions.RemoteActorError: ('name_error'" in before + assert "NameError: name 'doggypants' is not defined" in before From c41e5c8313ac73c78544152c9cd764a676cc3e2c Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 13 Oct 2020 00:45:29 -0400 Subject: [PATCH 43/61] Fix missing await --- tractor/_actor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tractor/_actor.py b/tractor/_actor.py index 2813f8d..74c2049 100644 --- a/tractor/_actor.py +++ b/tractor/_actor.py @@ -541,7 +541,7 @@ class Actor: # channel disconnect log.debug( f"{chan} from {chan.uid} disconnected, cancelling all rpc tasks") - self.cancel_rpc_tasks(chan) + await self.cancel_rpc_tasks(chan) except trio.ClosedResourceError: log.error(f"{chan} form {chan.uid} broke") From 1710b642a5e89ef97ba4aa4989f1bc1cebb24ab5 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 13 Oct 2020 10:50:21 -0400 Subject: [PATCH 44/61] Make tests a package (for relative imports) --- tests/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/__init__.py diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 From c375a2d028e727b41d45cd65ebb2af836e541682 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 13 Oct 2020 11:03:55 -0400 Subject: [PATCH 45/61] mypy fixes --- tractor/__init__.py | 5 ++++- tractor/_actor.py | 2 +- tractor/_debug.py | 12 ++++++------ tractor/_state.py | 6 +++--- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/tractor/__init__.py b/tractor/__init__.py index b65811a..d3aedc4 100644 --- a/tractor/__init__.py +++ b/tractor/__init__.py @@ -17,6 +17,7 @@ from ._discovery import get_arbiter, find_actor, wait_for_actor from ._actor import Actor, _start_actor, Arbiter from ._trionics import open_nursery from ._state import current_actor +from . import _state from ._exceptions import RemoteActorError, ModuleNotExposed from ._debug import breakpoint, post_mortem from . import msg @@ -24,6 +25,8 @@ from . import _spawn __all__ = [ + 'breakpoint', + 'post_mortem', 'current_actor', 'find_actor', 'get_arbiter', @@ -51,7 +54,7 @@ async def _main( name: Optional[str] = None, start_method: Optional[str] = None, debug_mode: bool = False, - **kwargs: typing.Dict[str, typing.Any], + **kwargs, ) -> typing.Any: """Async entry point for ``tractor``. """ diff --git a/tractor/_actor.py b/tractor/_actor.py index 74c2049..80a7dd5 100644 --- a/tractor/_actor.py +++ b/tractor/_actor.py @@ -249,7 +249,7 @@ class Actor: self._parent_chan: Optional[Channel] = None self._forkserver_info: Optional[ Tuple[Any, Any, Any, Any, Any]] = None - self._actoruid2nursery: Dict[str, 'ActorNursery'] = {} # noqa + self._actoruid2nursery: Dict[str, 'ActorNursery'] = {} # type: ignore async def wait_for_peer( self, uid: Tuple[str, str] diff --git a/tractor/_debug.py b/tractor/_debug.py index 04c9637..29c0430 100644 --- a/tractor/_debug.py +++ b/tractor/_debug.py @@ -5,7 +5,7 @@ import bdb import sys from functools import partial from contextlib import asynccontextmanager -from typing import Awaitable, Tuple, Optional, Callable +from typing import Awaitable, Tuple, Optional, Callable, AsyncIterator # import signal from async_generator import aclosing @@ -22,9 +22,9 @@ try: # wtf: only exported when installed in dev mode? import pdbpp except ImportError: - # pdbpp is installed in regular mode... + # pdbpp is installed in regular mode...it monkey patches stuff import pdb - assert pdb.xpm, "pdbpp is not installed?" + assert pdb.xpm, "pdbpp is not installed?" # type: ignore pdbpp = pdb log = get_logger(__name__) @@ -45,7 +45,7 @@ _debug_lock = trio.StrictFIFOLock() # XXX: set by the current task waiting on the root tty lock # and must be cancelled if this actor is cancelled via message # otherwise deadlocks with the parent actor may ensure -_debugger_request_cs: trio.CancelScope = None +_debugger_request_cs: Optional[trio.CancelScope] = None class TractorConfig(pdbpp.DefaultConfig): @@ -117,7 +117,7 @@ class PdbwTeardown(pdbpp.Pdb): @asynccontextmanager -async def _acquire_debug_lock(uid: Tuple[str, str]) -> None: +async def _acquire_debug_lock(uid: Tuple[str, str]) -> AsyncIterator[None]: """Acquire a actor local FIFO lock meant to mutex entry to a local debugger entry point to avoid tty clobbering by multiple processes. """ @@ -158,7 +158,7 @@ pdbpp.pdb.Pdb.sigint_handler = handler async def _hijack_stdin_relay_to_child( subactor_uid: Tuple[str, str] -) -> None: +) -> AsyncIterator[str]: # TODO: when we get to true remote debugging # this will deliver stdin data log.warning(f"Actor {subactor_uid} is WAITING on stdin hijack lock") diff --git a/tractor/_state.py b/tractor/_state.py index 2728b78..03ddf13 100644 --- a/tractor/_state.py +++ b/tractor/_state.py @@ -1,14 +1,14 @@ """ Per process state """ -from typing import Optional +from typing import Optional, Dict, Any from collections import Mapping import multiprocessing as mp import trio _current_actor: Optional['Actor'] = None # type: ignore -_runtime_vars = { +_runtime_vars: Dict[str, Any] = { '_debug_mode': False, '_is_root': False, '_root_mailbox': (None, None) @@ -54,7 +54,7 @@ def debug_mode() -> bool: """Bool determining if "debug mode" is on which enables remote subactor pdb entry on crashes. """ - return _runtime_vars['_debug_mode'] + return bool(_runtime_vars['_debug_mode']) def is_root_process() -> bool: From 0ce6d2b55c3654ee4a40da71958cf657809c6f80 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 13 Oct 2020 11:04:16 -0400 Subject: [PATCH 46/61] Add `pexpect` dep for debugger tests --- requirements-test.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-test.txt b/requirements-test.txt index e9dae32..2e2d1b5 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -3,3 +3,4 @@ pytest-trio pdbpp mypy trio_typing +pexpect From 573b8fef73fb515695093354fe0d470ab5182814 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 13 Oct 2020 11:48:52 -0400 Subject: [PATCH 47/61] Add better actor cancellation tracking Add `Actor._cancel_called` and `._cancel_complete` making it possible to determine whether the actor has started the cancellation sequence and whether that sequence has fully completed. This allows for blocking in internal machinery tasks as necessary. Also, always trigger the end of ongoing rpc tasks even if the last task errors; there's no guarantee the trio cancellation semantics will guarantee us a nice internal "state" without this. --- tractor/_actor.py | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/tractor/_actor.py b/tractor/_actor.py index 80a7dd5..f4b9795 100644 --- a/tractor/_actor.py +++ b/tractor/_actor.py @@ -152,10 +152,10 @@ async def _invoke( # cancel scope will not have been inserted yet log.warn( f"Task {func} likely errored or cancelled before it started") - - if not actor._rpc_tasks: - log.info("All RPC tasks have completed") - actor._ongoing_rpc_tasks.set() + finally: + if not actor._rpc_tasks: + log.info("All RPC tasks have completed") + actor._ongoing_rpc_tasks.set() def _get_mod_abspath(module): @@ -198,7 +198,9 @@ class Actor: """ self.name = name self.uid = (name, uid or str(uuid.uuid4())) - self._is_cancelled: bool = False + + self._cancel_complete = trio.Event() + self._cancel_called: bool = False # retreive and store parent `__main__` data which # will be passed to children @@ -531,7 +533,10 @@ class Actor: else: # self.cancel() was called so kill this msg loop # and break out into ``_async_main()`` - log.warning(f"{self.uid} was remotely cancelled") + log.warning( + f"{self.uid} was remotely cancelled; " + "waiting on cancellation completion..") + await self._cancel_complete.wait() loop_cs.cancel() break @@ -540,8 +545,9 @@ class Actor: else: # channel disconnect log.debug( - f"{chan} from {chan.uid} disconnected, cancelling all rpc tasks") - await self.cancel_rpc_tasks(chan) + f"{chan} for {chan.uid} disconnected, cancelling tasks" + ) + self.cancel_rpc_tasks(chan) except trio.ClosedResourceError: log.error(f"{chan} form {chan.uid} broke") @@ -833,7 +839,8 @@ class Actor: spawning new rpc tasks - return control the parent channel message loop """ - self._is_cancelled = True + log.warning(f"{self.uid} is trying to cancel") + self._cancel_called = True # cancel all ongoing rpc tasks with trio.CancelScope(shield=True): @@ -848,14 +855,16 @@ class Actor: # kill all ongoing tasks await self.cancel_rpc_tasks() + # cancel all rpc tasks permanently + if self._service_n: + self._service_n.cancel_scope.cancel() + # stop channel server self.cancel_server() await self._server_down.wait() - # rekt all channel loops - if self._service_n: - self._service_n.cancel_scope.cancel() - + log.warning(f"{self.uid} was sucessfullly cancelled") + self._cancel_complete.set() return True # XXX: hard kill logic if needed? @@ -895,6 +904,9 @@ class Actor: scope.cancel() # wait for _invoke to mark the task complete + log.debug( + f"Waiting on task to cancel:\ncid: {cid}\nfunc: {func}\n" + f"peer: {chan.uid}\n") await is_complete.wait() log.debug( f"Sucessfully cancelled task:\ncid: {cid}\nfunc: {func}\n" From 08ff98963180e4c3aa8e7093a37b12b9d5916a16 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 13 Oct 2020 11:59:18 -0400 Subject: [PATCH 48/61] Add some comments --- tractor/_trionics.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tractor/_trionics.py b/tractor/_trionics.py index 2d57ac6..aecf16e 100644 --- a/tractor/_trionics.py +++ b/tractor/_trionics.py @@ -230,7 +230,7 @@ async def open_nursery() -> typing.AsyncGenerator[ActorNursery, None]: # after we yield upwards yield anursery log.debug( - f"Waiting on subactors {anursery._children}" + f"Waiting on subactors {anursery._children} " "to complete" ) except BaseException as err: @@ -274,6 +274,7 @@ async def open_nursery() -> typing.AsyncGenerator[ActorNursery, None]: # ria_nursery scope end + # XXX: do we need a `trio.Cancelled` catch here as well? except (Exception, trio.MultiError) as err: # If actor-local error was raised while waiting on # ".run_in_actor()" actors then we also want to cancel all @@ -295,6 +296,8 @@ async def open_nursery() -> typing.AsyncGenerator[ActorNursery, None]: if anursery._children: with trio.CancelScope(shield=True): await anursery.cancel() + + # use `MultiError` as needed if len(errors) > 1: raise trio.MultiError(tuple(errors.values())) else: From 24ef9193343f746083a455347c3bb551d84b79ce Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 13 Oct 2020 14:16:20 -0400 Subject: [PATCH 49/61] Skip sync sleep test on mp backend --- tests/test_cancellation.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_cancellation.py b/tests/test_cancellation.py index 80232bf..aace021 100644 --- a/tests/test_cancellation.py +++ b/tests/test_cancellation.py @@ -435,6 +435,9 @@ def test_cancel_while_childs_child_in_sync_sleep( down even when that cancellation is triggered by the parent 2 nurseries "up". """ + if start_method == 'forkserver': + pytest.skip("Forksever sux hard at resuming from sync sleep...") + async def main(): with trio.fail_after(2): async with tractor.open_nursery() as tn: From ba52de79e1a3af42ebe0107542982efd27e043df Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 13 Oct 2020 14:20:19 -0400 Subject: [PATCH 50/61] Skip quad ex on local mp tests as well --- tests/test_streaming.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_streaming.py b/tests/test_streaming.py index 17ab9ea..98855e0 100644 --- a/tests/test_streaming.py +++ b/tests/test_streaming.py @@ -203,9 +203,8 @@ async def cancel_after(wait): @pytest.fixture(scope='module') def time_quad_ex(arb_addr, ci_env, spawn_backend): - if ci_env and spawn_backend == 'mp' and (platform.system() != 'Windows'): - """no idea, but the travis and github actions, mp *nix runs are - flaking out here often + if spawn_backend == 'mp' and (platform.system() != 'Windows'): + """no idea but the mp *nix runs are flaking out here often... """ pytest.skip("Test is too flaky on mp in CI") From e3c26943baad06b2142d9017ad427833ac68addf Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 13 Oct 2020 14:20:44 -0400 Subject: [PATCH 51/61] Support debug mode only on the trio backend --- tractor/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tractor/__init__.py b/tractor/__init__.py index d3aedc4..5ed843f 100644 --- a/tractor/__init__.py +++ b/tractor/__init__.py @@ -63,11 +63,13 @@ async def _main( if start_method is not None: _spawn.try_set_start_method(start_method) - if debug_mode: + if debug_mode and _spawn._spawn_method == 'trio': _state._runtime_vars['_debug_mode'] = True # expose internal debug module to every actor allowing # for use of ``await tractor.breakpoint()`` kwargs.setdefault('rpc_module_paths', []).append('tractor._debug') + elif debug_mode: + raise RuntimeError("Debug mode is only supported for the `trio` backend!") main = partial(async_fn, *args) From 666966097ab26d02648f27d676dddba92dbac711 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 13 Oct 2020 14:42:02 -0400 Subject: [PATCH 52/61] Revert "Change to relative conftest.py imports" This reverts commit 2b53c74b1c174723e8c1efe58196ae53d3c66709. --- tests/test_cancellation.py | 2 +- tests/test_discovery.py | 2 +- tests/test_local.py | 2 +- tests/test_multi_program.py | 2 +- tests/test_spawning.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_cancellation.py b/tests/test_cancellation.py index aace021..45452b5 100644 --- a/tests/test_cancellation.py +++ b/tests/test_cancellation.py @@ -11,7 +11,7 @@ import pytest import trio import tractor -from .conftest import tractor_test, no_windows +from conftest import tractor_test, no_windows async def assert_err(delay=0): diff --git a/tests/test_discovery.py b/tests/test_discovery.py index 7f071e2..52a1dc8 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -11,7 +11,7 @@ import pytest import tractor import trio -from .conftest import tractor_test +from conftest import tractor_test @tractor_test diff --git a/tests/test_local.py b/tests/test_local.py index c6a3569..0a594d0 100644 --- a/tests/test_local.py +++ b/tests/test_local.py @@ -7,7 +7,7 @@ import pytest import trio import tractor -from .conftest import tractor_test +from conftest import tractor_test @pytest.mark.trio diff --git a/tests/test_multi_program.py b/tests/test_multi_program.py index 2d08f2f..12ca3ef 100644 --- a/tests/test_multi_program.py +++ b/tests/test_multi_program.py @@ -6,7 +6,7 @@ import time import pytest import tractor -from .conftest import ( +from conftest import ( tractor_test, sig_prog, _INT_SIGNAL, diff --git a/tests/test_spawning.py b/tests/test_spawning.py index 0800836..220af3e 100644 --- a/tests/test_spawning.py +++ b/tests/test_spawning.py @@ -5,7 +5,7 @@ import pytest import trio import tractor -from .conftest import tractor_test +from conftest import tractor_test statespace = {'doggy': 10, 'kitty': 4} From a49deb46f142875e6fcaede43004b7ea83f690b5 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 13 Oct 2020 14:42:16 -0400 Subject: [PATCH 53/61] Revert "Make tests a package (for relative imports)" This reverts commit 1710b642a5e89ef97ba4aa4989f1bc1cebb24ab5. --- tests/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tests/__init__.py diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index e69de29..0000000 From a934eb063c11e3b6e55a605ee0db62b381039415 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 13 Oct 2020 14:49:31 -0400 Subject: [PATCH 54/61] Factor `repodir()` helper into conftest.py --- tests/conftest.py | 10 ++++++++++ tests/test_debugger.py | 2 +- tests/test_docs_examples.py | 10 +--------- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 6a9f938..347cc7d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -39,6 +39,16 @@ no_windows = pytest.mark.skipif( ) +def repodir(): + """Return the abspath to the repo directory. + """ + dirname = os.path.dirname + dirpath = os.path.abspath( + dirname(dirname(os.path.realpath(__file__))) + ) + return dirpath + + def pytest_addoption(parser): parser.addoption( "--ll", action="store", dest='loglevel', diff --git a/tests/test_debugger.py b/tests/test_debugger.py index 071febf..fe357e8 100644 --- a/tests/test_debugger.py +++ b/tests/test_debugger.py @@ -9,7 +9,7 @@ from os import path import pytest import pexpect -from .test_docs_examples import repodir +from conftest import repodir # TODO: diff --git a/tests/test_docs_examples.py b/tests/test_docs_examples.py index 67b34ee..73e6a69 100644 --- a/tests/test_docs_examples.py +++ b/tests/test_docs_examples.py @@ -11,15 +11,7 @@ import shutil import pytest - -def repodir(): - """Return the abspath to the repo directory. - """ - dirname = os.path.dirname - dirpath = os.path.abspath( - dirname(dirname(os.path.realpath(__file__))) - ) - return dirpath +from conftest import repodir def examples_dir(): From fd59f4ad1692511299b8bc066b4bf728a862756e Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 13 Oct 2020 14:56:26 -0400 Subject: [PATCH 55/61] On windows .spawn dne? --- tests/test_debugger.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_debugger.py b/tests/test_debugger.py index fe357e8..227e8f5 100644 --- a/tests/test_debugger.py +++ b/tests/test_debugger.py @@ -41,7 +41,7 @@ def mk_cmd(ex_name: str) -> str: def spawn( testdir, arb_addr, -) -> pexpect.spawn: +) -> 'pexpect.spawn': def _spawn(cmd): return testdir.spawn( @@ -355,7 +355,6 @@ def test_root_nursery_cancels_before_child_releases_tty_lock(spawn): child.sendline('c') - child.expect(pexpect.EOF) before = str(child.before.decode()) assert "tractor._exceptions.RemoteActorError: ('spawner0'" in before From 15edcc622d9c9d6a64939d452462e92c798ec15a Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 13 Oct 2020 15:13:24 -0400 Subject: [PATCH 56/61] Skip it on windows too --- tests/test_streaming.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_streaming.py b/tests/test_streaming.py index 98855e0..919b278 100644 --- a/tests/test_streaming.py +++ b/tests/test_streaming.py @@ -203,7 +203,7 @@ async def cancel_after(wait): @pytest.fixture(scope='module') def time_quad_ex(arb_addr, ci_env, spawn_backend): - if spawn_backend == 'mp' and (platform.system() != 'Windows'): + if spawn_backend == 'mp': """no idea but the mp *nix runs are flaking out here often... """ pytest.skip("Test is too flaky on mp in CI") From 1b6ee2ecf670007121e7ae79995d4aec26c1cae1 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 13 Oct 2020 15:26:14 -0400 Subject: [PATCH 57/61] Skip sync sleep test on windows --- tests/test_cancellation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_cancellation.py b/tests/test_cancellation.py index 45452b5..9e3c9f9 100644 --- a/tests/test_cancellation.py +++ b/tests/test_cancellation.py @@ -426,6 +426,7 @@ async def spawn(): portal = await tn.run_in_actor('sleeper', spin_for) +@no_windows def test_cancel_while_childs_child_in_sync_sleep( loglevel, start_method, From 0177268f13bdd49391b2dde9f329133fd4c2a3de Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 13 Oct 2020 16:30:19 -0400 Subject: [PATCH 58/61] Report on skipped tests --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 689e364..3e271ff 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,4 +38,4 @@ jobs: - name: Install dependencies run: pip install -U . -r requirements-test.txt -r requirements-docs.txt --upgrade-strategy eager - name: Run tests - run: pytest tests/ --spawn-backend=${{ matrix.spawn_backend }} \ No newline at end of file + run: pytest tests/ --spawn-backend=${{ matrix.spawn_backend }} -rs From 1c25f25ab0cc9f85512c7db1e5546fc5900ab785 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 13 Oct 2020 21:27:17 -0400 Subject: [PATCH 59/61] Drop travisCI; it's slower and has worse windows support. --- .travis.yml | 68 ----------------------------------------------------- 1 file changed, 68 deletions(-) delete mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 484f64d..0000000 --- a/.travis.yml +++ /dev/null @@ -1,68 +0,0 @@ -language: python -dist: xenial -sudo: required - -matrix: - include: - - name: "Windows, Python Latest: multiprocessing" - os: windows - language: sh - python: 3.x # only works on linux - env: SPAWN_BACKEND="mp" - before_install: - - choco install python3 --params "/InstallDir:C:\\Python" - - export PATH="/c/Python:/c/Python/Scripts:$PATH" - - python -m pip install --upgrade pip wheel - - - name: "Windows, Python Latest: trio" - os: windows - language: sh - python: 3.x # only works on linux - env: SPAWN_BACKEND="trio" - before_install: - - choco install python3 --params "/InstallDir:C:\\Python" - - export PATH="/c/Python:/c/Python/Scripts:$PATH" - - python -m pip install --upgrade pip wheel - - - name: "Windows, Python 3.7: multiprocessing" - os: windows - python: 3.7 # only works on linux - env: SPAWN_BACKEND="mp" - language: sh - before_install: - - choco install python3 --version 3.7.4 --params "/InstallDir:C:\\Python" - - export PATH="/c/Python:/c/Python/Scripts:$PATH" - - python -m pip install --upgrade pip wheel - - - name: "Windows, Python 3.7: trio" - os: windows - python: 3.7 # only works on linux - env: SPAWN_BACKEND="trio" - language: sh - before_install: - - choco install python3 --version 3.7.4 --params "/InstallDir:C:\\Python" - - export PATH="/c/Python:/c/Python/Scripts:$PATH" - - python -m pip install --upgrade pip wheel - - - name: "Python 3.7: multiprocessing" - python: 3.7 # this works for Linux but is ignored on macOS or Windows - env: SPAWN_BACKEND="mp" - - name: "Python 3.7: trio" - python: 3.7 # this works for Linux but is ignored on macOS or Windows - env: SPAWN_BACKEND="trio" - - - name: "Python 3.8: multiprocessing" - python: 3.8 # this works for Linux but is ignored on macOS or Windows - env: SPAWN_BACKEND="mp" - - name: "Python 3.8: trio" - python: 3.8 # this works for Linux but is ignored on macOS or Windows - env: SPAWN_BACKEND="trio" - -install: - - cd $TRAVIS_BUILD_DIR - - pip install -U pip - - pip install -U . -r requirements-test.txt -r requirements-docs.txt --upgrade-strategy eager - -script: - - mypy tractor/ --ignore-missing-imports - - pytest tests/ --spawn-backend=${SPAWN_BACKEND} From bba47e4c7a2a63a5bcc8db55c48ed85f06c76b65 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 13 Oct 2020 21:33:22 -0400 Subject: [PATCH 60/61] Add gh actions badge --- README.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 168f2da..70f9df3 100644 --- a/README.rst +++ b/README.rst @@ -2,7 +2,8 @@ tractor ======= A `structured concurrent`_, async-native "`actor model`_" built on trio_ and multiprocessing_. -|travis| |docs| +|gh_actions| +|docs| .. _actor model: https://en.wikipedia.org/wiki/Actor_model .. _trio: https://github.com/python-trio/trio @@ -54,8 +55,8 @@ say hi, please feel free to ping me on the `trio gitter channel`_! .. _trio gitter channel: https://gitter.im/python-trio/general -.. |travis| image:: https://img.shields.io/travis/goodboy/tractor/master.svg - :target: https://travis-ci.org/goodboy/tractor +.. |gh_actions| image:: https://img.shields.io/endpoint.svg?url=https%3A%2F%2Factions-badge.atrox.dev%2Fgoodboy%2Ftractor%2Fbadge&style=popout-square + :target: https://actions-badge.atrox.dev/goodboy/tractor/goto .. |docs| image:: https://readthedocs.org/projects/tractor/badge/?version=latest :target: https://tractor.readthedocs.io/en/latest/?badge=latest :alt: Documentation Status From 61a8df358c74c8f06fd375345134248274949346 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 14 Oct 2020 09:06:40 -0400 Subject: [PATCH 61/61] Comments tweak --- tests/test_debugger.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/test_debugger.py b/tests/test_debugger.py index 227e8f5..5ea4f71 100644 --- a/tests/test_debugger.py +++ b/tests/test_debugger.py @@ -3,6 +3,8 @@ That native debug better work! All these tests can be understood (somewhat) by running the equivalent `examples/debugging/` scripts manually. + +TODO: None of these tests have been run successfully on windows yet. """ from os import path @@ -12,7 +14,7 @@ import pexpect from conftest import repodir -# TODO: +# TODO: The next great debugger audit could be done by you! # - recurrent entry to breakpoint() from single actor *after* and an # error in another task? # - root error before child errors @@ -314,12 +316,14 @@ def test_multi_subactors_root_errors(spawn): def test_multi_nested_subactors_error_through_nurseries(spawn): """Verify deeply nested actors that error trigger debugger entries - at each level up the tree. + at each actor nurserly (level) all the way up the tree. """ - # TODO: inside this script there's still a bug where if the parent - # errors before a 2 levels lower actor has released the lock, the - # parent tries to cancel it but it's stuck in the debugger? + # NOTE: previously, inside this script was a a bug where if the + # parent errors before a 2-levels-lower actor has released the lock, + # the parent tries to cancel it but it's stuck in the debugger? + # A test (below) has now been added to explicitly verify this is + # fixed. child = spawn('multi_nested_subactors_error_up_through_nurseries')