
Merge pull request #245 from goodboy/immediate_remote_cancels

Immediate remote cancels
goodboy 2021-10-17 08:16:50 -04:00 committed by GitHub
commit 828754dbb5
11 changed files with 591 additions and 398 deletions

View File

@ -1,3 +1,8 @@
'''
Test that a nested nursery will avoid clobbering
the debugger latched by a broken child.
'''
import trio
import tractor
@ -35,6 +40,7 @@ async def main():
"""
async with tractor.open_nursery(
debug_mode=True,
# loglevel='cancel',
) as n:
# spawn both actors

View File

@ -0,0 +1,13 @@
Change the core message loop to handle task and actor-runtime cancel
requests immediately instead of scheduling them as is done for rpc-task
requests.
In order to obtain more reliable teardown mechanics for (complex) actor
trees, it's important that we specially treat cancel requests as having
higher priority. Previously, task cancel requests could themselves be
cancelled if an "actor-runtime" cancel request was received (which can
happen during messy multi-actor crashes that propagate). Instead, cancels
now block the msg loop until serviced and
a response is relayed back to the requester. This also allows for
improved debugger support since we have determinism guarantees about
which processes must wait before hard killing their children.
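
Below is a minimal sketch of the scheduling difference described above (not tractor's actual runtime code; the message shape and handler names are purely illustrative): regular rpc requests are spawned into a nursery while cancel requests are serviced inline under a shielded scope so they can't themselves be cancelled.

import trio


async def handle_rpc(msg: dict, results: list) -> None:
    # stand-in for a normal, scheduled rpc task
    await trio.sleep(0.1)
    results.append(('rpc_done', msg['cid']))


async def handle_cancel(msg: dict, results: list) -> None:
    # stand-in for ``Actor.cancel()`` / ``Actor._cancel_task()``
    results.append(('cancel_ack', msg['cid']))


async def msg_loop(rx: trio.MemoryReceiveChannel, results: list) -> None:
    async with trio.open_nursery() as nursery:
        async for msg in rx:
            if msg['func'] in ('cancel', '_cancel_task'):
                # cancel requests block the loop until serviced; the shield
                # ensures the request itself can't be cancelled mid-way
                with trio.CancelScope(shield=True):
                    await handle_cancel(msg, results)
                if msg['func'] == 'cancel':
                    # actor-runtime cancel: kill this msg loop entirely
                    break
            else:
                # normal rpc tasks are scheduled, not awaited inline
                nursery.start_soon(handle_rpc, msg, results)


async def main() -> None:
    results: list = []
    tx, rx = trio.open_memory_channel(10)
    await tx.send({'func': 'echo', 'cid': 1})
    await tx.send({'func': '_cancel_task', 'cid': 1})
    await tx.send({'func': 'cancel', 'cid': 2})
    await msg_loop(rx, results)
    print(results)


if __name__ == '__main__':
    trio.run(main)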

View File

@ -4,5 +4,5 @@ now and use the default `fragment set`_.
.. _towncrier docs: https://github.com/twisted/towncrier#quick-start
.. _pluggy release readme: https://github.com/twisted/towncrier#quick-start
.. _pluggy release readme: https://github.com/pytest-dev/pluggy/blob/main/changelog/README.rst
.. _fragment set: https://github.com/twisted/towncrier#news-fragments

View File

@ -1,5 +1,6 @@
"""
Cancellation and error propagation
"""
import os
import signal
@ -365,7 +366,8 @@ async def test_nested_multierrors(loglevel, start_method):
# to happen before an actor is spawned
if isinstance(subexc, trio.Cancelled):
continue
else:
elif isinstance(subexc, tractor.RemoteActorError):
# on windows it seems we can't exactly be sure wtf
# will happen..
assert subexc.type in (
@ -373,6 +375,17 @@ async def test_nested_multierrors(loglevel, start_method):
trio.Cancelled,
trio.MultiError
)
elif isinstance(subexc, trio.MultiError):
for subsub in subexc.exceptions:
if subsub in (tractor.RemoteActorError,):
subsub = subsub.type
assert type(subsub) in (
trio.Cancelled,
trio.MultiError,
)
else:
assert isinstance(subexc, tractor.RemoteActorError)
@ -381,13 +394,14 @@ async def test_nested_multierrors(loglevel, start_method):
# on windows sometimes spawning is just too slow and
# we get back the (sent) cancel signal instead
if platform.system() == 'Windows':
assert (subexc.type is trio.MultiError) or (
subexc.type is tractor.RemoteActorError)
if isinstance(subexc, tractor.RemoteActorError):
assert subexc.type in (trio.MultiError, tractor.RemoteActorError)
else:
assert isinstance(subexc, trio.MultiError)
else:
assert subexc.type is trio.MultiError
else:
assert (subexc.type is tractor.RemoteActorError) or (
subexc.type is trio.Cancelled)
assert subexc.type in (tractor.RemoteActorError, trio.Cancelled)
@no_windows
@ -448,6 +462,7 @@ def test_cancel_via_SIGINT_other_task(
with pytest.raises(KeyboardInterrupt):
trio.run(main)
async def spin_for(period=3):
"Sync sleep."
time.sleep(period)
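
The windows-specific assertions earlier in this file unpack one level of ``trio.MultiError`` nesting by hand; a small recursive helper along these lines (a sketch only, assuming the trio version in use still ships ``trio.MultiError`` with an ``.exceptions`` attribute) expresses the same intent:

from typing import List

import trio


def flatten_errors(exc: BaseException) -> List[BaseException]:
    # return every leaf exception under ``exc``, descending through
    # any nested ``trio.MultiError`` containers along the way
    if isinstance(exc, trio.MultiError):
        leaves: List[BaseException] = []
        for sub in exc.exceptions:
            leaves.extend(flatten_errors(sub))
        return leaves
    return [exc]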

View File

@ -236,7 +236,8 @@ def test_subactor_breakpoint(spawn):
def test_multi_subactors(spawn):
"""Multiple subactors, both erroring and breakpointing as well as
"""
Multiple subactors, both erroring and breakpointing as well as
a nested subactor erroring.
"""
child = spawn(r'multi_subactors')
@ -259,6 +260,7 @@ def test_multi_subactors(spawn):
# first name_error failure
child.expect(r"\(Pdb\+\+\)")
before = str(child.before.decode())
assert "Attaching to pdb in crashed actor: ('name_error'" in before
assert "NameError" in before
# continue again
@ -267,6 +269,7 @@ def test_multi_subactors(spawn):
# 2nd name_error failure
child.expect(r"\(Pdb\+\+\)")
before = str(child.before.decode())
assert "Attaching to pdb in crashed actor: ('name_error_1'" in before
assert "NameError" in before
# breakpoint loop should re-engage
@ -275,6 +278,19 @@ def test_multi_subactors(spawn):
before = str(child.before.decode())
assert "Attaching pdb to actor: ('breakpoint_forever'" in before
# wait for spawn error to show up
while 'breakpoint_forever' in before:
child.sendline('c')
child.expect(r"\(Pdb\+\+\)")
before = str(child.before.decode())
# 2nd depth nursery should trigger
# child.sendline('c')
# child.expect(r"\(Pdb\+\+\)")
# before = str(child.before.decode())
assert "Attaching to pdb in crashed actor: ('spawn_error'" in before
assert "RemoteActorError: ('name_error_1'" in before
# now run some "continues" to show re-entries
for _ in range(5):
child.sendline('c')
@ -284,16 +300,24 @@ def test_multi_subactors(spawn):
child.sendline('q')
child.expect(r"\(Pdb\+\+\)")
before = str(child.before.decode())
# debugger attaches to root
assert "Attaching to pdb in crashed actor: ('root'" in before
# expect a multierror with exceptions for each sub-actor
assert "RemoteActorError: ('breakpoint_forever'" in before
assert "RemoteActorError: ('name_error'" in before
assert "RemoteActorError: ('spawn_error'" in before
assert "RemoteActorError: ('name_error_1'" in before
assert 'bdb.BdbQuit' in before
# process should exit
child.sendline('c')
child.expect(pexpect.EOF)
# repeat of previous multierror for final output
before = str(child.before.decode())
assert "RemoteActorError: ('breakpoint_forever'" in before
assert "RemoteActorError: ('name_error'" in before
assert "RemoteActorError: ('spawn_error'" in before
assert "RemoteActorError: ('name_error_1'" in before
assert 'bdb.BdbQuit' in before
@ -387,16 +411,29 @@ def test_multi_subactors_root_errors(spawn):
before = str(child.before.decode())
assert "NameError: name 'doggypants' is not defined" in before
# continue again
# continue again to catch 2nd name error from
# actor 'name_error_1' (which is 2nd depth).
child.sendline('c')
child.expect(r"\(Pdb\+\+\)")
# should now get attached in root with assert error
before = str(child.before.decode())
assert "Attaching to pdb in crashed actor: ('name_error_1'" in before
assert "NameError" in before
# should have come just after prior prompt
child.sendline('c')
child.expect(r"\(Pdb\+\+\)")
before = str(child.before.decode())
assert "Attaching to pdb in crashed actor: ('spawn_error'" in before
# boxed error from previous step
assert "RemoteActorError: ('name_error_1'" in before
assert "NameError" in before
child.sendline('c')
child.expect(r"\(Pdb\+\+\)")
before = str(child.before.decode())
assert "Attaching to pdb in crashed actor: ('root'" in before
assert "AssertionError" in before
# boxed error from first level failure
assert "RemoteActorError: ('name_error'" in before
assert "NameError" in before
# warnings assert we probably don't need
# assert "Cancelling nursery in ('spawn_error'," in before
@ -406,6 +443,7 @@ def test_multi_subactors_root_errors(spawn):
child.expect(pexpect.EOF)
before = str(child.before.decode())
# error from root actor and root task that created top level nursery
assert "AssertionError" in before

View File

@ -180,6 +180,7 @@ def test_multi_actor_subs_arbiter_pub(
'streamer',
enable_modules=[__name__],
)
name = 'streamer'
even_portal = await n.run_in_actor(
subs,

View File

@ -49,6 +49,7 @@ async def _invoke(
chan: Channel,
func: typing.Callable,
kwargs: Dict[str, Any],
is_rpc: bool = True,
task_status: TaskStatus[
Union[trio.CancelScope, BaseException]
] = trio.TASK_STATUS_IGNORED,
@ -243,6 +244,7 @@ async def _invoke(
scope, func, is_complete = actor._rpc_tasks.pop((chan, cid))
is_complete.set()
except KeyError:
if is_rpc:
# If we're cancelled before the task returns then the
# cancel scope will not have been inserted yet
log.warning(
@ -503,8 +505,8 @@ class Actor:
log.runtime(f"Peers is {self._peers}")
if not self._peers: # no more channels connected
self._no_more_peers.set()
log.runtime("Signalling no more peer channels")
self._no_more_peers.set()
# # XXX: is this necessary (GC should do it?)
if chan.connected():
@ -671,16 +673,39 @@ class Actor:
f"{ns}.{funcname}({kwargs})")
if ns == 'self':
func = getattr(self, funcname)
if funcname == 'cancel':
# don't start entire actor runtime cancellation if this
# actor is in debug mode
pdb_complete = _debug._local_pdb_complete
if pdb_complete:
await pdb_complete.wait()
# we immediately start the runtime machinery shutdown
with trio.CancelScope(shield=True):
# self.cancel() was called so kill this msg loop
# and break out into ``_async_main()``
log.cancel(
f"Actor {self.uid} was remotely cancelled; "
"waiting on cancellation completion..")
await _invoke(self, cid, chan, func, kwargs, is_rpc=False)
# await self._cancel_complete.wait()
loop_cs.cancel()
break
if funcname == '_cancel_task':
# XXX: a special case is made here for
# remote calls since we don't want the
# remote actor to have to know which channel
# the task is associated with and we can't
# pass non-primitive types between actors.
# This means you can use:
# Portal.run('self', '_cancel_task', cid=did)
# without passing the `chan` arg.
# we immediately start the runtime machinery shutdown
with trio.CancelScope(shield=True):
# self.cancel() was called so kill this msg loop
# and break out into ``_async_main()``
kwargs['chan'] = chan
log.cancel(
f"Actor {self.uid} was remotely cancelled; "
"waiting on cancellation completion..")
await _invoke(self, cid, chan, func, kwargs, is_rpc=False)
continue
else:
# complain to client about restricted modules
try:
@ -699,14 +724,15 @@ class Actor:
partial(_invoke, self, cid, chan, func, kwargs),
name=funcname,
)
except RuntimeError:
except (RuntimeError, trio.MultiError):
# avoid reporting a benign race condition
# during actor runtime teardown.
nursery_cancelled_before_task = True
break
# never allow cancelling cancel requests (results in
# deadlock and other weird behaviour)
if func != self.cancel:
# if func != self.cancel:
if isinstance(cs, Exception):
log.warning(
f"Task for RPC func {func} failed with"
@ -719,20 +745,11 @@ class Actor:
# cancelled gracefully if requested
self._rpc_tasks[(chan, cid)] = (
cs, func, trio.Event())
else:
# self.cancel() was called so kill this msg loop
# and break out into ``_async_main()``
log.warning(
f"Actor {self.uid} was remotely cancelled; "
"waiting on cancellation completion..")
await self._cancel_complete.wait()
loop_cs.cancel()
break
log.runtime(
f"Waiting on next msg for {chan} from {chan.uid}")
else:
# channel disconnect
# end of async for, channel disconnect via ``trio.EndOfChannel``
log.runtime(
f"{chan} for {chan.uid} disconnected, cancelling tasks"
)
@ -947,6 +964,9 @@ class Actor:
# Blocks here as expected until the root nursery is
# killed (i.e. this actor is cancelled or signalled by the parent)
except Exception as err:
log.info("Closing all actor lifetime contexts")
_lifetime_stack.close()
if not registered_with_arbiter:
# TODO: I guess we could try to connect back
# to the parent through a channel and engage a debugger
@ -976,11 +996,21 @@ class Actor:
raise
finally:
log.runtime("Root nursery complete")
log.info("Runtime nursery complete")
# tear down all lifetime contexts if not in guest mode
# XXX: should this just be in the entrypoint?
log.cancel("Closing all actor lifetime contexts")
log.info("Closing all actor lifetime contexts")
# TODO: we can't actually do this bc the debugger
# uses the _service_n to spawn the lock task, BUT,
# in theory if we had the root nursery surround this finally
# block it might be actually possible to debug THIS
# machinery in the same way as user task code?
# if self.name == 'brokerd.ib':
# with trio.CancelScope(shield=True):
# await _debug.breakpoint()
_lifetime_stack.close()
# Unregister actor from the arbiter
@ -1065,7 +1095,7 @@ class Actor:
self._service_n.start_soon(self.cancel)
async def cancel(self) -> bool:
"""Cancel this actor.
"""Cancel this actor's runtime.
The "deterministic" teardown sequence in order is:
- cancel all ongoing rpc tasks by cancel scope
@ -1099,7 +1129,7 @@ class Actor:
if self._service_n:
self._service_n.cancel_scope.cancel()
log.cancel(f"{self.uid} was sucessfullly cancelled")
log.cancel(f"{self.uid} called `Actor.cancel()`")
self._cancel_complete.set()
return True
@ -1158,13 +1188,15 @@ class Actor:
registered for each.
"""
tasks = self._rpc_tasks
if tasks:
log.cancel(f"Cancelling all {len(tasks)} rpc tasks:\n{tasks} ")
for (chan, cid) in tasks.copy():
for (chan, cid), (scope, func, is_complete) in tasks.copy().items():
if only_chan is not None:
if only_chan != chan:
continue
# TODO: this should really be done in a nursery batch
if func != self._cancel_task:
await self._cancel_task(cid, chan)
log.cancel(

View File

@ -5,16 +5,23 @@ Multi-core debugging for da peeps!
import bdb
import sys
from functools import partial
from contextlib import asynccontextmanager
from typing import Tuple, Optional, Callable, AsyncIterator
from contextlib import asynccontextmanager as acm
from typing import (
Tuple,
Optional,
Callable,
AsyncIterator,
AsyncGenerator,
)
import tractor
import trio
from trio_typing import TaskStatus
from .log import get_logger
from . import _state
from ._discovery import get_root
from ._state import is_root_process
from ._state import is_root_process, debug_mode
from ._exceptions import is_multi_cancelled
try:
@ -122,7 +129,7 @@ class PdbwTeardown(pdbpp.Pdb):
# break
@asynccontextmanager
@acm
async def _acquire_debug_lock(
uid: Tuple[str, str]
@ -139,7 +146,7 @@ async def _acquire_debug_lock(
task_name = trio.lowlevel.current_task().name
log.pdb(
log.debug(
f"Attempting to acquire TTY lock, remote task: {task_name}:{uid}"
)
@ -187,7 +194,7 @@ async def _acquire_debug_lock(
if (
not stats.owner
):
log.pdb(f"No more tasks waiting on tty lock! says {uid}")
log.debug(f"No more tasks waiting on tty lock! says {uid}")
_no_remote_has_tty.set()
_no_remote_has_tty = None
@ -219,7 +226,8 @@ async def _hijack_stdin_for_child(
subactor_uid: Tuple[str, str]
) -> str:
'''Hijack the tty in the root process of an actor tree such that
'''
Hijack the tty in the root process of an actor tree such that
the pdbpp debugger console can be allocated to a sub-actor for repl
bossing.
@ -254,6 +262,8 @@ async def _hijack_stdin_for_child(
# assert await stream.receive() == 'pdb_unlock'
except (
# BaseException,
trio.MultiError,
trio.BrokenResourceError,
trio.Cancelled, # by local cancellation
trio.ClosedResourceError, # by self._rx_chan
@ -268,12 +278,74 @@ async def _hijack_stdin_for_child(
if isinstance(err, trio.Cancelled):
raise
log.debug(f"TTY lock released, remote task: {task_name}:{subactor_uid}")
finally:
log.debug(
"TTY lock released, remote task:"
f"{task_name}:{subactor_uid}")
return "pdb_unlock_complete"
async def wait_for_parent_stdin_hijack(
actor_uid: Tuple[str, str],
task_status: TaskStatus[trio.CancelScope] = trio.TASK_STATUS_IGNORED
):
'''
Connect to the root actor via a ctx and invoke a task which locks
a root-local TTY lock.
This function is used by any sub-actor to acquire mutex access to
pdb and the root's TTY for interactive debugging (see below inside
``_breakpoint()``). It can be used to ensure that an intermediate
nursery-owning actor does not clobber its children if they are in
debug (see below inside ``maybe_wait_for_debugger()``).
'''
global _debugger_request_cs
with trio.CancelScope(shield=True) as cs:
_debugger_request_cs = cs
try:
async with get_root() as portal:
# this syncs to child's ``Context.started()`` call.
async with portal.open_context(
tractor._debug._hijack_stdin_for_child,
subactor_uid=actor_uid,
) as (ctx, val):
log.pdb('locked context')
assert val == 'Locked'
async with ctx.open_stream() as stream:
# unblock local caller
task_status.started(cs)
try:
assert _local_pdb_complete
await _local_pdb_complete.wait()
finally:
# TODO: shielding currently can cause hangs...
with trio.CancelScope(shield=True):
await stream.send('pdb_unlock')
# sync with callee termination
assert await ctx.result() == "pdb_unlock_complete"
except tractor.ContextCancelled:
log.warning('Root actor cancelled debug lock')
finally:
log.debug(f"Exiting debugger for actor {actor_uid}")
global _local_task_in_debug
_local_task_in_debug = None
log.debug(f"Child {actor_uid} released parent stdio lock")
async def _breakpoint(
debug_func,
@ -300,56 +372,6 @@ async def _breakpoint(
await trio.lowlevel.checkpoint()
async def wait_for_parent_stdin_hijack(
task_status=trio.TASK_STATUS_IGNORED
):
global _debugger_request_cs
with trio.CancelScope(shield=True) as cs:
_debugger_request_cs = cs
try:
async with get_root() as portal:
log.pdb('got portal')
# this syncs to child's ``Context.started()`` call.
async with portal.open_context(
tractor._debug._hijack_stdin_for_child,
subactor_uid=actor.uid,
) as (ctx, val):
log.pdb('locked context')
assert val == 'Locked'
async with ctx.open_stream() as stream:
log.error('opened stream')
# unblock local caller
task_status.started()
try:
await _local_pdb_complete.wait()
finally:
# TODO: shielding currently can cause hangs...
with trio.CancelScope(shield=True):
await stream.send('pdb_unlock')
# sync with callee termination
assert await ctx.result() == "pdb_unlock_complete"
except tractor.ContextCancelled:
log.warning('Root actor cancelled debug lock')
finally:
log.debug(f"Exiting debugger for actor {actor}")
global _local_task_in_debug
_local_task_in_debug = None
log.debug(f"Child {actor} released parent stdio lock")
if not _local_pdb_complete or _local_pdb_complete.is_set():
_local_pdb_complete = trio.Event()
@ -386,7 +408,10 @@ async def _breakpoint(
# cancel on this task start? I *think* this works below?
# actor._service_n.cancel_scope.shield = shield
with trio.CancelScope(shield=True):
await actor._service_n.start(wait_for_parent_stdin_hijack)
await actor._service_n.start(
wait_for_parent_stdin_hijack,
actor.uid,
)
elif is_root_process():
@ -407,11 +432,10 @@ async def _breakpoint(
'Root actor attempting to shield-acquire active tty lock'
f' owned by {_global_actor_in_debug}')
with trio.CancelScope(shield=True):
# must shield here to avoid hitting a ``Cancelled`` and
# a child getting stuck bc we clobbered the tty
with trio.CancelScope(shield=True):
await _debug_lock.acquire()
else:
# may be cancelled
await _debug_lock.acquire()
@ -501,7 +525,7 @@ post_mortem = partial(
async def _maybe_enter_pm(err):
if (
_state.debug_mode()
debug_mode()
# NOTE: don't enter debug mode recursively after quitting pdb
# Iow, don't re-enter the repl if the `quit` command was issued
@ -524,3 +548,80 @@ async def _maybe_enter_pm(err):
else:
return False
@acm
async def acquire_debug_lock(
subactor_uid: Tuple[str, str],
) -> AsyncGenerator[None, tuple]:
'''
Grab root's debug lock on entry, release on exit.
'''
async with trio.open_nursery() as n:
cs = await n.start(
wait_for_parent_stdin_hijack,
subactor_uid,
)
yield None
cs.cancel()
async def maybe_wait_for_debugger(
poll_steps: int = 2,
poll_delay: float = 0.1,
) -> None:
if not debug_mode():
return
if (
is_root_process()
):
global _no_remote_has_tty, _global_actor_in_debug, _wait_all_tasks_lock
# If we error in the root but the debugger is
# engaged we don't want to prematurely kill (and
# thus clobber access to) the local tty since it
# will make the pdb repl unusable.
# Instead try to wait for pdb to be released before
# tearing down.
sub_in_debug = None
for _ in range(poll_steps):
if _global_actor_in_debug:
sub_in_debug = tuple(_global_actor_in_debug)
log.warning(
'Root polling for debug')
with trio.CancelScope(shield=True):
await trio.sleep(poll_delay)
# TODO: could this make things more deterministic? wait
# to see if a sub-actor task will be scheduled and grab
# the tty lock on the next tick?
# XXX: doesn't seem to work
# await trio.testing.wait_all_tasks_blocked(cushion=0)
debug_complete = _no_remote_has_tty
if (
(debug_complete and
not debug_complete.is_set())
):
log.warning(
'Root has errored but pdb is in use by '
f'child {sub_in_debug}\n'
'Waiting on tty lock to release..')
await debug_complete.wait()
await trio.sleep(poll_delay)
continue
else:
log.warning(
'Root acquired TTY LOCK'
)
return
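
A condensed usage sketch (not a verbatim excerpt from this commit) of how the two helpers added above are meant to be combined during child-process teardown, mirroring the ``_spawn.py`` changes further below; the ``reap_child()`` wrapper name is hypothetical:

from typing import Tuple

import trio

from tractor._debug import acquire_debug_lock, maybe_wait_for_debugger
from tractor._state import is_root_process


async def reap_child(
    proc: trio.Process,
    child_uid: Tuple[str, str],
) -> None:
    with trio.CancelScope(shield=True):
        if is_root_process():
            # never tear down the root while a child still holds the tty lock
            await maybe_wait_for_debugger()
        else:
            # hold the root's tty lock so a (possibly crashed) child in pdb
            # isn't clobbered while we soft-wait on the process
            async with acquire_debug_lock(child_uid):
                with trio.move_on_after(0.5):
                    await proc.wait()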

View File

@ -44,7 +44,7 @@ async def get_arbiter(
@asynccontextmanager
async def get_root(
**kwargs,
) -> typing.AsyncGenerator[Union[Portal, LocalPortal], None]:
) -> typing.AsyncGenerator[Portal, None]:
host, port = _runtime_vars['_root_mailbox']
assert host is not None

View File

@ -1,5 +1,6 @@
"""
Machinery for actor process spawning using multiple backends.
"""
import sys
import multiprocessing as mp
@ -8,7 +9,6 @@ from typing import Any, Dict, Optional
import trio
from trio_typing import TaskStatus
from async_generator import asynccontextmanager
try:
from multiprocessing import semaphore_tracker # type: ignore
@ -22,9 +22,15 @@ from multiprocessing import forkserver # type: ignore
from typing import Tuple
from . import _forkserver_override
from ._debug import (
maybe_wait_for_debugger,
acquire_debug_lock,
)
from ._state import (
current_actor,
is_main_process,
is_root_process,
debug_mode,
)
from .log import get_logger
@ -123,17 +129,15 @@ async def cancel_on_completion(
portal: Portal,
actor: Actor,
errors: Dict[Tuple[str, str], Exception],
task_status: TaskStatus[trio.CancelScope] = trio.TASK_STATUS_IGNORED,
) -> None:
"""Cancel actor gracefully once it's "main" portal's
"""
Cancel actor gracefully once its "main" portal's
result arrives.
Should only be called for actors spawned with `run_in_actor()`.
"""
with trio.CancelScope() as cs:
task_status.started(cs)
# if this call errors we store the exception for later
# in ``errors`` which will be reraised inside
# a MultiError and we still send out a cancel request
@ -155,12 +159,13 @@ async def cancel_on_completion(
async def do_hard_kill(
proc: trio.Process,
terminate_after: int = 3,
) -> None:
# NOTE: this timeout used to do nothing since we were shielding
# the ``.wait()`` inside ``new_proc()`` which will pretty much
# never release until the process exits, now it acts as
# a hard-kill time ultimatum.
with trio.move_on_after(3) as cs:
with trio.move_on_after(terminate_after) as cs:
# NOTE: This ``__aexit__()`` shields internally.
async with proc: # calls ``trio.Process.aclose()``
@ -174,11 +179,33 @@ async def do_hard_kill(
proc.kill()
@asynccontextmanager
async def spawn_subactor(
subactor: 'Actor',
async def new_proc(
name: str,
actor_nursery: 'ActorNursery', # type: ignore # noqa
subactor: Actor,
errors: Dict[Tuple[str, str], Exception],
# passed through to actor main
bind_addr: Tuple[str, int],
parent_addr: Tuple[str, int],
):
_runtime_vars: Dict[str, Any], # serialized and sent to _child
*,
task_status: TaskStatus[Portal] = trio.TASK_STATUS_IGNORED
) -> None:
"""
Create a new ``multiprocessing.Process`` using the
spawn method as configured using ``try_set_start_method()``.
"""
# mark the new actor with the global spawn method
subactor._spawn_method = _spawn_method
uid = subactor.uid
if _spawn_method == 'trio':
spawn_cmd = [
sys.executable,
"-m",
@ -202,53 +229,34 @@ async def spawn_subactor(
subactor.loglevel
]
proc = await trio.open_process(spawn_cmd)
cancelled_during_spawn: bool = False
try:
yield proc
proc = await trio.open_process(spawn_cmd)
finally:
log.runtime(f"Attempting to kill {proc}")
# XXX: do this **after** cancellation/teardown
# to avoid killing the process too early
# since trio does this internally on ``__aexit__()``
await do_hard_kill(proc)
async def new_proc(
name: str,
actor_nursery: 'ActorNursery', # type: ignore # noqa
subactor: Actor,
errors: Dict[Tuple[str, str], Exception],
# passed through to actor main
bind_addr: Tuple[str, int],
parent_addr: Tuple[str, int],
_runtime_vars: Dict[str, Any], # serialized and sent to _child
*,
task_status: TaskStatus[Portal] = trio.TASK_STATUS_IGNORED
) -> None:
"""Create a new ``multiprocessing.Process`` using the
spawn method as configured using ``try_set_start_method()``.
"""
cancel_scope = None
# mark the new actor with the global spawn method
subactor._spawn_method = _spawn_method
if _spawn_method == 'trio':
async with trio.open_nursery() as nursery:
async with spawn_subactor(
subactor,
parent_addr,
) as proc:
log.runtime(f"Started {proc}")
# wait for actor to spawn and connect back to us
# channel should have handshake completed by the
# local actor by the time we get a ref to it
try:
event, chan = await actor_nursery._actor.wait_for_peer(
subactor.uid)
except trio.Cancelled:
cancelled_during_spawn = True
# we may cancel before the child connects back in which
# case avoid clobbering the pdb tty.
if debug_mode():
with trio.CancelScope(shield=True):
# don't clobber an ongoing pdb
if is_root_process():
await maybe_wait_for_debugger()
else:
async with acquire_debug_lock(uid):
# soft wait on the proc to terminate
with trio.move_on_after(0.5):
await proc.wait()
raise
portal = Portal(chan)
actor_nursery._children[subactor.uid] = (
subactor, proc, portal)
@ -274,8 +282,9 @@ async def new_proc(
with trio.CancelScope(shield=True):
await actor_nursery._join_procs.wait()
async with trio.open_nursery() as nursery:
if portal in actor_nursery._cancel_after_result_on_exit:
cancel_scope = await nursery.start(
nursery.start_soon(
cancel_on_completion,
portal,
subactor,
@ -285,32 +294,45 @@ async def new_proc(
# Wait for proc termination but **don't yet** call
# ``trio.Process.__aexit__()`` (it tears down stdio
# which will kill any waiting remote pdb trace).
# TODO: No idea how we can enforce zombie
# reaping more stringently without the shield
# we used to have below...
# with trio.CancelScope(shield=True):
# async with proc:
# Always "hard" join sub procs since no actor zombies
# are allowed!
# this is a "light" (cancellable) join, the hard join is
# in the enclosing scope (see above).
# This is a "soft" (cancellable) join/reap.
await proc.wait()
log.debug(f"Joined {proc}")
# pop child entry to indicate we are no longer managing this subactor
subactor, proc, portal = actor_nursery._children.pop(subactor.uid)
# cancel result waiter that may have been spawned in
# tandem if not done already
if cancel_scope:
log.warning(
"Cancelling existing result waiter task for "
f"{subactor.uid}")
cancel_scope.cancel()
nursery.cancel_scope.cancel()
finally:
# The "hard" reap since no actor zombies are allowed!
# XXX: do this **after** cancellation/teardown to avoid
# killing the process too early.
log.cancel(f'Hard reap sequence starting for {uid}')
with trio.CancelScope(shield=True):
# don't clobber an ongoing pdb
if cancelled_during_spawn:
# Try again to avoid TTY clobbering.
async with acquire_debug_lock(uid):
with trio.move_on_after(0.5):
await proc.wait()
if is_root_process():
await maybe_wait_for_debugger()
if proc.poll() is None:
log.cancel(f"Attempting to hard kill {proc}")
await do_hard_kill(proc)
log.debug(f"Joined {proc}")
if not cancelled_during_spawn:
# pop child entry to indicate we are no longer managing this
# subactor
actor_nursery._children.pop(subactor.uid)
else:
# `multiprocessing`
# async with trio.open_nursery() as nursery:
@ -341,7 +363,7 @@ async def mp_new_proc(
task_status: TaskStatus[Portal] = trio.TASK_STATUS_IGNORED
) -> None:
async with trio.open_nursery() as nursery:
assert _ctx
start_method = _ctx.get_start_method()
if start_method == 'forkserver':
@ -405,6 +427,11 @@ async def mp_new_proc(
# local actor by the time we get a ref to it
event, chan = await actor_nursery._actor.wait_for_peer(
subactor.uid)
# except:
# TODO: in the case we were cancelled before the sub-proc
# registered itself back we must be sure to try and clean
# any process we may have started.
portal = Portal(chan)
actor_nursery._children[subactor.uid] = (subactor, proc, portal)
@ -420,37 +447,31 @@ async def mp_new_proc(
# while user code is still doing its thing. Only after the
# nursery block closes do we allow subactor results to be
# awaited and reported upwards to the supervisor.
with trio.CancelScope(shield=True):
await actor_nursery._join_procs.wait()
finally:
# XXX: in the case we were cancelled before the sub-proc
# registered itself back we must be sure to try and clean
# any process we may have started.
reaping_cancelled: bool = False
cancel_scope: Optional[trio.CancelScope] = None
cancel_exc: Optional[trio.Cancelled] = None
async with trio.open_nursery() as nursery:
if portal in actor_nursery._cancel_after_result_on_exit:
try:
# async with trio.open_nursery() as n:
# n.cancel_scope.shield = True
cancel_scope = await nursery.start(
nursery.start_soon(
cancel_on_completion,
portal,
subactor,
errors
)
except trio.Cancelled as err:
cancel_exc = err
# if the reaping task was cancelled we may have hit
# a race where the subproc disconnected before we
# could send it a message to cancel (classic 2 generals)
# in that case, wait shortly then kill the process.
reaping_cancelled = True
await proc_waiter(proc)
# cancel result waiter that may have been spawned in
# tandem if not done already
log.warning(
"Cancelling existing result waiter task for "
f"{subactor.uid}")
nursery.cancel_scope.cancel()
finally:
# hard reap sequence
if proc.is_alive():
log.cancel(f"Attempting to hard kill {proc}")
with trio.move_on_after(0.1) as cs:
cs.shield = True
await proc_waiter(proc)
@ -458,24 +479,8 @@ async def mp_new_proc(
if cs.cancelled_caught:
proc.terminate()
if not reaping_cancelled and proc.is_alive():
await proc_waiter(proc)
# TODO: timeout block here?
proc.join()
log.debug(f"Joined {proc}")
# pop child entry to indicate we are no longer managing subactor
subactor, proc, portal = actor_nursery._children.pop(subactor.uid)
# cancel result waiter that may have been spawned in
# tandem if not done already
if cancel_scope:
log.warning(
"Cancelling existing result waiter task for "
f"{subactor.uid}")
cancel_scope.cancel()
elif reaping_cancelled: # let the cancellation bubble up
assert cancel_exc
raise cancel_exc
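
The reap logic in the hunks above (both ``do_hard_kill()`` and the mp backend's finally block) reduces to a soft, cancellable wait followed by an escalating hard kill; here is a standalone sketch of that pattern for the trio backend (the ``reap()`` helper name and grace period are illustrative and not part of this commit):

import sys

import trio


async def reap(proc: trio.Process, grace: float = 3.0) -> int:
    # soft (cancellable) join: give the child a grace period to exit
    with trio.move_on_after(grace) as cs:
        await proc.wait()

    if cs.cancelled_caught:
        # grace period expired: escalate to a hard kill and reap the zombie
        proc.kill()
        with trio.CancelScope(shield=True):
            await proc.wait()

    return proc.returncode


async def main() -> None:
    proc = await trio.open_process(
        [sys.executable, '-c', 'import time; time.sleep(10)']
    )
    print(await reap(proc, grace=0.5))


if __name__ == '__main__':
    trio.run(main)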

View File

@ -12,6 +12,7 @@ import trio
from async_generator import asynccontextmanager
from . import _debug
from ._debug import maybe_wait_for_debugger
from ._state import current_actor, is_main_process, is_root_process
from .log import get_logger, get_loglevel
from ._actor import Actor
@ -280,26 +281,7 @@ async def _open_and_supervise_one_cancels_all_nursery(
# will make the pdb repl unusable.
# Instead try to wait for pdb to be released before
# tearing down.
if is_root_process():
# TODO: could this make things more deterministic?
# wait to see if a sub-actor task will be
# scheduled and grab the tty lock on the next
# tick?
# await trio.testing.wait_all_tasks_blocked()
debug_complete = _debug._no_remote_has_tty
if (
debug_complete and
not debug_complete.is_set()
):
log.warning(
'Root has errored but pdb is in use by '
f'child {_debug._global_actor_in_debug}\n'
'Waiting on tty lock to release..')
# with trio.CancelScope(shield=True):
await debug_complete.wait()
await maybe_wait_for_debugger()
# if the caller's scope errored then we activate our
# one-cancels-all supervisor strategy (don't