From c074aea0309189d23d846365c281be816ee09b35 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 20 Jan 2020 11:10:51 -0500 Subject: [PATCH] Support TRIP for process launching This took a ton of tinkering and a rework of the actor nursery tear down logic. The main changes include: - each subprocess is now spawned from inside a trio task from one of two containing nurseries created in the body of `tractor.open_nursery()`: one for `run_in_actor()` processes and one for `start_actor()` "daemons". This is to address the need for `trio_run_in_process.open_in_process()` opening a nursery which must be closed from the same task that opened it. Using this same approach for `multiprocessing` seems to work well. The nurseries are waited in order (`run_in_actor()` actors then daemon actors) during tear down which allows for avoiding the recursive re-entry of `ActorNursery.wait()` handled prior. - pull out all the nested functions / closures that were in `ActorNursery.wait()` and move into the `_spawn` module such that the process shutdown logic takes place in each containing task's code path. This allows for vastly simplifying `.wait()` to just contain an event trigger which initiates process waiting / result collection. Likely `.wait()` should just be removed since it can no longer be used to synchronously wait on the actor nursery. - drop `ActorNursery.__aenter__()` / `.__aexit__()` and move this "supervisor" tear down logic into the closing block of `open_nursery()`. This not only makes the code more comprehensible, it also makes our nursery implementation look more like the one in `trio`. Resolves #93 --- tractor/_spawn.py | 267 +++++++++++++++++++++++++-------- tractor/_trionics.py | 340 ++++++++++++------------------------------- 2 files changed, 301 insertions(+), 306 deletions(-) diff --git a/tractor/_spawn.py b/tractor/_spawn.py index 026e257..5e99723 100644 --- a/tractor/_spawn.py +++ b/tractor/_spawn.py @@ -3,12 +3,15 @@ Process spawning. 
Mostly just wrapping around ``multiprocessing``. """ +import inspect import multiprocessing as mp - -# from . import log +import platform +from typing import Any, List, Dict import trio import trio_run_in_process +from trio_typing import TaskStatus +from async_generator import aclosing try: from multiprocessing import semaphore_tracker # type: ignore @@ -23,12 +26,24 @@ from typing import Tuple from . import _forkserver_override from ._state import current_actor -from ._actor import Actor +from .log import get_logger +from ._portal import Portal +from ._actor import Actor, ActorFailure +log = get_logger('tractor') + _ctx: mp.context.BaseContext = mp.get_context("spawn") # type: ignore +if platform.system() == 'Windows': + async def proc_waiter(proc: mp.Process) -> None: + await trio.hazmat.WaitForSingleObject(proc.sentinel) +else: + async def proc_waiter(proc: mp.Process) -> None: + await trio.hazmat.wait_readable(proc.sentinel) + + def try_set_start_method(name: str) -> mp.context.BaseContext: """Attempt to set the start method for ``multiprocess.Process`` spawning. @@ -60,73 +75,203 @@ def is_main_process() -> bool: return mp.current_process().name == 'MainProcess' +async def exhaust_portal( + portal: Portal, + actor: Actor +) -> Any: + """Pull final result from portal (assuming it has one). + + If the main task is an async generator do our best to consume + what's left of it. 
+ """ + try: + log.debug(f"Waiting on final result from {actor.uid}") + final = res = await portal.result() + # if it's an async-gen then alert that we're cancelling it + if inspect.isasyncgen(res): + final = [] + log.warning( + f"Blindly consuming asyncgen for {actor.uid}") + with trio.fail_after(1): + async with aclosing(res) as agen: + async for item in agen: + log.debug(f"Consuming item {item}") + final.append(item) + except (Exception, trio.MultiError) as err: + # we reraise in the parent task via a ``trio.MultiError`` + return err + else: + return final + + +async def cancel_on_completion( + portal: Portal, + actor: Actor, + errors: List[Exception], + task_status=trio.TASK_STATUS_IGNORED, +) -> None: + """Cancel actor gracefully once it's "main" portal's + result arrives. + + Should only be called for actors spawned with `run_in_actor()`. + """ + with trio.CancelScope() as cs: + task_status.started(cs) + # if this call errors we store the exception for later + # in ``errors`` which will be reraised inside + # a MultiError and we still send out a cancel request + result = await exhaust_portal(portal, actor) + if isinstance(result, Exception): + errors[actor.uid] = result + log.warning( + f"Cancelling {portal.channel.uid} after error {result}" + ) + else: + log.info(f"Cancelling {portal.channel.uid} gracefully") + + # cancel the process now that we have a final result + await portal.cancel_actor() + + # XXX: lol, this will never get run without a shield above.. 
+ # if cs.cancelled_caught: + # log.warning( + # "Result waiter was cancelled, process may have died") + + async def new_proc( name: str, - actor: Actor, + actor_nursery: 'ActorNursery', + subactor: Actor, + errors: Dict[str, Exception], # passed through to actor main bind_addr: Tuple[str, int], parent_addr: Tuple[str, int], - nursery: trio.Nursery = None, + begin_wait_phase: trio.Event, use_trip: bool = True, + task_status: TaskStatus[Portal] = trio.TASK_STATUS_IGNORED ) -> mp.Process: """Create a new ``multiprocessing.Process`` using the spawn method as configured using ``try_set_start_method()``. """ - if use_trip: # trio_run_in_process - mng = trio_run_in_process.open_in_process( - actor._trip_main, - bind_addr, - parent_addr, - nursery=nursery, - ) - # XXX playing with trip logging - # l = log.get_console_log(level='debug', name=None, _root_name='trio-run-in-process') - # import logging - # logger = logging.getLogger("trio-run-in-process") - # logger.setLevel('DEBUG') - proc = await mng.__aenter__() - proc.mng = mng - return proc - else: - # use multiprocessing - start_method = _ctx.get_start_method() - if start_method == 'forkserver': - # XXX do our hackery on the stdlib to avoid multiple - # forkservers (one at each subproc layer). 
- fs = forkserver._forkserver - curr_actor = current_actor() - if is_main_process() and not curr_actor._forkserver_info: - # if we're the "main" process start the forkserver only once - # and pass its ipc info to downstream children - # forkserver.set_forkserver_preload(rpc_module_paths) - forkserver.ensure_running() - fs_info = ( - fs._forkserver_address, - fs._forkserver_alive_fd, - getattr(fs, '_forkserver_pid', None), - getattr(resource_tracker._resource_tracker, '_pid', None), - resource_tracker._resource_tracker._fd, - ) - else: - assert curr_actor._forkserver_info - fs_info = ( - fs._forkserver_address, - fs._forkserver_alive_fd, - fs._forkserver_pid, - resource_tracker._resource_tracker._pid, - resource_tracker._resource_tracker._fd, - ) = curr_actor._forkserver_info - else: - fs_info = (None, None, None, None, None) + cancel_scope = None - return _ctx.Process( - target=actor._mp_main, - args=( + async with trio.open_nursery() as nursery: + if use_trip: + # trio_run_in_process + async with trio_run_in_process.open_in_process( + subactor._trip_main, bind_addr, - fs_info, - start_method, - parent_addr - ), - # daemon=True, - name=name, - ) + parent_addr, + ) as proc: + log.info(f"Started {proc}") + + # wait for actor to spawn and connect back to us + # channel should have handshake completed by the + # local actor by the time we get a ref to it + event, chan = await actor_nursery._actor.wait_for_peer( + subactor.uid) + portal = Portal(chan) + actor_nursery._children[subactor.uid] = ( + subactor, proc, portal) + task_status.started(portal) + + # wait for ActorNursery.wait() to be called + await actor_nursery._join_procs.wait() + + if portal in actor_nursery._cancel_after_result_on_exit: + cancel_scope = await nursery.start( + cancel_on_completion, portal, subactor, errors) + + # TRIP blocks here until process is complete + else: + # `multiprocessing` + start_method = _ctx.get_start_method() + if start_method == 'forkserver': + # XXX do our hackery on the 
stdlib to avoid multiple + # forkservers (one at each subproc layer). + fs = forkserver._forkserver + curr_actor = current_actor() + if is_main_process() and not curr_actor._forkserver_info: + # if we're the "main" process start the forkserver + # only once and pass its ipc info to downstream + # children + # forkserver.set_forkserver_preload(rpc_module_paths) + forkserver.ensure_running() + fs_info = ( + fs._forkserver_address, + fs._forkserver_alive_fd, + getattr(fs, '_forkserver_pid', None), + getattr( + resource_tracker._resource_tracker, '_pid', None), + resource_tracker._resource_tracker._fd, + ) + else: + assert curr_actor._forkserver_info + fs_info = ( + fs._forkserver_address, + fs._forkserver_alive_fd, + fs._forkserver_pid, + resource_tracker._resource_tracker._pid, + resource_tracker._resource_tracker._fd, + ) = curr_actor._forkserver_info + else: + fs_info = (None, None, None, None, None) + + proc = _ctx.Process( + target=subactor._mp_main, + args=( + bind_addr, + fs_info, + start_method, + parent_addr + ), + # daemon=True, + name=name, + ) + # `multiprocessing` only (since no async interface): + # register the process before start in case we get a cancel + # request before the actor has fully spawned - then we can wait + # for it to fully come up before sending a cancel request + actor_nursery._children[subactor.uid] = (subactor, proc, None) + + proc.start() + if not proc.is_alive(): + raise ActorFailure("Couldn't start sub-actor?") + + log.info(f"Started {proc}") + + # wait for actor to spawn and connect back to us + # channel should have handshake completed by the + # local actor by the time we get a ref to it + event, chan = await actor_nursery._actor.wait_for_peer( + subactor.uid) + portal = Portal(chan) + actor_nursery._children[subactor.uid] = (subactor, proc, portal) + + # unblock parent task + task_status.started(portal) + + # wait for ActorNursery.wait() to be called + # this is required to ensure synchronization + # with startup and 
registration of this actor in + # ActorNursery.run_in_actor() + await actor_nursery._join_procs.wait() + + if portal in actor_nursery._cancel_after_result_on_exit: + cancel_scope = await nursery.start( + cancel_on_completion, portal, subactor, errors) + + # TODO: timeout block here? + if proc.is_alive(): + await proc_waiter(proc) + proc.join() + + log.debug(f"Joined {proc}") + # pop child entry to indicate we are no longer managing this subactor + subactor, proc, portal = actor_nursery._children.pop(subactor.uid) + # cancel result waiter that may have been spawned in + # tandem if not done already + if cancel_scope: + log.warning( + f"Cancelling existing result waiter task for {subactor.uid}") + cancel_scope.cancel() diff --git a/tractor/_trionics.py b/tractor/_trionics.py index 9808935..49f7838 100644 --- a/tractor/_trionics.py +++ b/tractor/_trionics.py @@ -1,42 +1,37 @@ """ ``trio`` inspired apis and helpers """ -import inspect -import importlib -import platform import multiprocessing as mp from typing import Tuple, List, Dict, Optional, Any import typing import trio -from async_generator import asynccontextmanager, aclosing -import trio_run_in_process +from async_generator import asynccontextmanager from ._state import current_actor from .log import get_logger, get_loglevel -from ._actor import Actor, ActorFailure +from ._actor import Actor # , ActorFailure from ._portal import Portal from . import _spawn -if platform.system() == 'Windows': - async def proc_waiter(proc: mp.Process) -> None: - await trio.hazmat.WaitForSingleObject(proc.sentinel) -else: - async def proc_waiter(proc: mp.Process) -> None: - await trio.hazmat.wait_readable(proc.sentinel) - - log = get_logger('tractor') class ActorNursery: """Spawn scoped subprocess actors. 
""" - def __init__(self, actor: Actor, nursery: trio.Nursery) -> None: + def __init__( + self, + actor: Actor, + ria_nursery: trio.Nursery, + da_nursery: trio.Nursery, + errors: Dict[str, Exception], + ) -> None: # self.supervisor = supervisor # TODO self._actor: Actor = actor - self._nursery = nursery + self._ria_nursery = ria_nursery + self._da_nursery = da_nursery self._children: Dict[ Tuple[str, str], Tuple[Actor, mp.Process, Optional[Portal]] @@ -45,10 +40,8 @@ class ActorNursery: # cancelled when their "main" result arrives self._cancel_after_result_on_exit: set = set() self.cancelled: bool = False - # self._aexitstack = contextlib.AsyncExitStack() - - async def __aenter__(self): - return self + self._join_procs = trio.Event() + self.errors = errors async def start_actor( self, @@ -57,51 +50,34 @@ class ActorNursery: statespace: Optional[Dict[str, Any]] = None, rpc_module_paths: List[str] = None, loglevel: str = None, # set log level per subactor + nursery: trio.Nursery = None, ) -> Portal: loglevel = loglevel or self._actor.loglevel or get_loglevel() - mods = {} - for path in rpc_module_paths or (): - mod = importlib.import_module(path) - mods[path] = mod.__file__ - - actor = Actor( + subactor = Actor( name, # modules allowed to invoked funcs from - rpc_module_paths=mods, + rpc_module_paths=rpc_module_paths, statespace=statespace, # global proc state vars loglevel=loglevel, arbiter_addr=current_actor()._arb_addr, ) parent_addr = self._actor.accept_addr assert parent_addr - proc = await _spawn.new_proc( + + # start a task to spawn a process + # blocks until process has been started and a portal setup + nursery = nursery or self._da_nursery + return await nursery.start( + _spawn.new_proc, name, - actor, + self, + subactor, + self.errors, bind_addr, parent_addr, - self._nursery, + nursery, ) - # `multiprocessing` only (since no async interface): - # register the process before start in case we get a cancel - # request before the actor has fully spawned - then 
we can wait - # for it to fully come up before sending a cancel request - self._children[actor.uid] = (actor, proc, None) - - if not isinstance(proc, trio_run_in_process.process.Process): - proc.start() - if not proc.is_alive(): - raise ActorFailure("Couldn't start sub-actor?") - - log.info(f"Started {proc}") - # wait for actor to spawn and connect back to us - # channel should have handshake completed by the - # local actor by the time we get a ref to it - event, chan = await self._actor.wait_for_peer(actor.uid) - portal = Portal(chan) - self._children[actor.uid] = (actor, proc, portal) - - return portal async def run_in_actor( self, @@ -127,6 +103,8 @@ class ActorNursery: bind_addr=bind_addr, statespace=statespace, loglevel=loglevel, + # use the run_in_actor nursery + nursery=self._ria_nursery, ) # this marks the actor to be cancelled after its portal result # is retreived, see ``wait()`` below. @@ -140,153 +118,9 @@ class ActorNursery: async def wait(self) -> None: """Wait for all subactors to complete. - - This is probably the most complicated (and confusing, sorry) - function that does all the clever crap to deal with cancellation, - error propagation, and graceful subprocess tear down. """ - async def exhaust_portal(portal, actor): - """Pull final result from portal (assuming it has one). - - If the main task is an async generator do our best to consume - what's left of it. 
- """ - try: - log.debug(f"Waiting on final result from {actor.uid}") - final = res = await portal.result() - # if it's an async-gen then alert that we're cancelling it - if inspect.isasyncgen(res): - final = [] - log.warning( - f"Blindly consuming asyncgen for {actor.uid}") - with trio.fail_after(1): - async with aclosing(res) as agen: - async for item in agen: - log.debug(f"Consuming item {item}") - final.append(item) - except (Exception, trio.MultiError) as err: - # we reraise in the parent task via a ``trio.MultiError`` - return err - else: - return final - - async def cancel_on_completion( - portal: Portal, - actor: Actor, - task_status=trio.TASK_STATUS_IGNORED, - ) -> None: - """Cancel actor gracefully once it's "main" portal's - result arrives. - - Should only be called for actors spawned with `run_in_actor()`. - """ - with trio.CancelScope() as cs: - task_status.started(cs) - # if this call errors we store the exception for later - # in ``errors`` which will be reraised inside - # a MultiError and we still send out a cancel request - result = await exhaust_portal(portal, actor) - if isinstance(result, Exception): - errors.append(result) - log.warning( - f"Cancelling {portal.channel.uid} after error {result}" - ) - else: - log.info(f"Cancelling {portal.channel.uid} gracefully") - - # cancel the process now that we have a final result - await portal.cancel_actor() - - # XXX: lol, this will never get run without a shield above.. - # if cs.cancelled_caught: - # log.warning( - # "Result waiter was cancelled, process may have died") - - async def wait_for_proc( - proc: mp.Process, - actor: Actor, - portal: Portal, - cancel_scope: Optional[trio._core._run.CancelScope] = None, - ) -> None: - # please god don't hang - if not isinstance(proc, trio_run_in_process.process.Process): - # TODO: timeout block here? 
- if proc.is_alive(): - await proc_waiter(proc) - proc.join() - else: - # trio_run_in_process blocking wait - if errors: - multierror = trio.MultiError(errors) - # import pdb; pdb.set_trace() - # try: - # with trio.CancelScope(shield=True): - # await proc.mng.__aexit__( - # type(multierror), - # multierror, - # multierror.__traceback__, - # ) - # except BaseException as err: - # import pdb; pdb.set_trace() - # pass - # else: - await proc.mng.__aexit__(None, None, None) - # proc.nursery.cancel_scope.cancel() - - log.debug(f"Joined {proc}") - # indicate we are no longer managing this subactor - self._children.pop(actor.uid) - - # proc terminated, cancel result waiter that may have - # been spawned in tandem if not done already - if cancel_scope: # and not portal._cancelled: - log.warning( - f"Cancelling existing result waiter task for {actor.uid}") - cancel_scope.cancel() - log.debug(f"Waiting on all subactors to complete") - # since we pop each child subactor on termination, - # iterate a copy - children = self._children.copy() - errors: List[Exception] = [] - # wait on run_in_actor() tasks, unblocks when all complete - async with trio.open_nursery() as nursery: - # async with self._nursery as nursery: - for subactor, proc, portal in children.values(): - cs = None - # portal from ``run_in_actor()`` - if portal in self._cancel_after_result_on_exit: - cs = await nursery.start( - cancel_on_completion, portal, subactor) - # TODO: how do we handle remote host spawned actors? - nursery.start_soon( - wait_for_proc, proc, subactor, portal, cs) - - if errors: - multierror = trio.MultiError(errors) - if not self.cancelled: - # bubble up error(s) here and expect to be called again - # once the nursery has been cancelled externally (ex. - # from within __aexit__() if an error is caught around - # ``self.wait()`` then, ``self.cancel()`` is called - # immediately, in the default supervisor strat, after - # which in turn ``self.wait()`` is called again.) 
- raise trio.MultiError(errors) - - # wait on all `start_actor()` subactors to complete - # if errors were captured above and we have not been cancelled - # then these ``start_actor()`` spawned actors will block until - # cancelled externally - children = self._children.copy() - async with trio.open_nursery() as nursery: - for subactor, proc, portal in children.values(): - # TODO: how do we handle remote host spawned actors? - nursery.start_soon(wait_for_proc, proc, subactor, portal, cs) - - log.debug(f"All subactors for {self} have terminated") - if errors: - # always raise any error if we're also cancelled - raise trio.MultiError(errors) + self._join_procs.set() async def cancel(self, hard_kill: bool = False) -> None: """Cancel this nursery by instructing each subactor to cancel @@ -304,7 +138,7 @@ class ActorNursery: log.debug(f"Cancelling nursery") with trio.move_on_after(3) as cs: - async with trio.open_nursery() as n: + async with trio.open_nursery() as nursery: for subactor, proc, portal in self._children.values(): if hard_kill: do_hard_kill(proc) @@ -331,59 +165,20 @@ class ActorNursery: # spawn cancel tasks for each sub-actor assert portal - n.start_soon(portal.cancel_actor) + nursery.start_soon(portal.cancel_actor) # if we cancelled the cancel (we hung cancelling remote actors) # then hard kill all sub-processes if cs.cancelled_caught: log.error(f"Failed to gracefully cancel {self}, hard killing!") - async with trio.open_nursery() as n: + async with trio.open_nursery(): for subactor, proc, portal in self._children.values(): - n.start_soon(do_hard_kill, proc) + nursery.start_soon(do_hard_kill, proc) # mark ourselves as having (tried to have) cancelled all subactors self.cancelled = True await self.wait() - async def __aexit__(self, etype, value, tb): - """Wait on all subactor's main routines to complete. 
- """ - # XXX: this is effectively the (for now) lone - # cancellation/supervisor strategy (one-cancels-all) - # which exactly mimicks trio's behaviour - if etype is not None: - try: - # XXX: hypothetically an error could be raised and then - # a cancel signal shows up slightly after in which case - # the `else:` block here might not complete? - # For now, shield both. - with trio.CancelScope(shield=True): - if etype in (trio.Cancelled, KeyboardInterrupt): - log.warning( - f"Nursery for {current_actor().uid} was " - f"cancelled with {etype}") - else: - log.exception( - f"Nursery for {current_actor().uid} " - f"errored with {etype}, ") - await self.cancel() - except trio.MultiError as merr: - if value not in merr.exceptions: - raise trio.MultiError(merr.exceptions + [value]) - raise - else: - log.debug(f"Waiting on subactors {self._children} to complete") - try: - await self.wait() - except (Exception, trio.MultiError) as err: - log.warning(f"Nursery cancelling due to {err}") - if self._children: - with trio.CancelScope(shield=True): - await self.cancel() - raise - - log.debug(f"Nursery teardown complete") - @asynccontextmanager async def open_nursery() -> typing.AsyncGenerator[ActorNursery, None]: @@ -395,12 +190,67 @@ async def open_nursery() -> typing.AsyncGenerator[ActorNursery, None]: if not actor: raise RuntimeError("No actor instance has been defined yet?") - # XXX we need this nursery because TRIP is doing all its stuff with + # XXX we use these nurseries because TRIP is doing all its stuff with # an `@asynccontextmanager` which has an internal nursery *and* the # task that opens a nursery must also close it - so we need a path - # in TRIP to make this all kinda work as well. Note I'm basically - # giving up for now - it's probably equivalent amounts of work to - # make TRIP vs. `multiprocessing` work here. 
- async with trio.open_nursery() as nursery: - async with ActorNursery(actor, nursery) as anursery: - yield anursery + # in TRIP to make this all kinda work as well. + errors: Dict[str, Exception] = {} + async with trio.open_nursery() as da_nursery: + try: + async with trio.open_nursery() as ria_nursery: + anursery = ActorNursery( + actor, ria_nursery, da_nursery, errors + ) + try: + # spawning of actors happens in this scope after + # we yield to the caller. + yield anursery + log.debug( + f"Waiting on subactors {anursery._children}" + "to complete" + ) + # anursery.wait() + # except (trio.Cancelled, KeyboardInterrupt) as err: + except (BaseException, Exception) as err: + anursery._join_procs.set() + try: + # XXX: hypothetically an error could be raised and then + # a cancel signal shows up slightly after in which case + # the `else:` block here might not complete? + # For now, shield both. + with trio.CancelScope(shield=True): + if err in (trio.Cancelled, KeyboardInterrupt): + log.warning( + f"Nursery for {current_actor().uid} was " + f"cancelled with {err}") + else: + log.exception( + f"Nursery for {current_actor().uid} " + f"errored with {err}, ") + await anursery.cancel() + except trio.MultiError as merr: + if err not in merr.exceptions: + raise trio.MultiError(merr.exceptions + [err]) + else: + raise + + # last bit before first nursery block end + log.debug(f"Waiting on all subactors to complete") + anursery._join_procs.set() + # ria_nursery scope + except (Exception, trio.MultiError) as err: + log.warning(f"Nursery cancelling due to {err}") + if anursery._children: + with trio.CancelScope(shield=True): + await anursery.cancel() + raise + finally: + if errors: + if anursery._children: + with trio.CancelScope(shield=True): + await anursery.cancel() + if len(errors) > 1: + raise trio.MultiError(errors.values()) + else: + raise list(errors.values())[0] + log.debug(f"Nursery teardown complete")