Let `ActorNursery` choose whether to raise remote errors
- Don't raise inside `result_from_portal()` and instead return a flag that indicates whether the error was remote or not. - Stick the soft reap sequence outside a `finally:`. - do error tracking in `ActorNursery._handle_err() -> bool:` to avoid duplicate raises on close. - add `ActorNursery.cancel_called: bool` - accept a cancelled soft reap and toss in some logging for now to begin figuring out races with the spawner nursery vs. the enter block being the source of an error that causes actor nursery cancellation. - cancel the spawn nursery if all procs complete but the nursery hasn't been closed (pretty sure this isn't correct nor working.. the nursery should always be closed in order for the join procs event to have arrived). - tossed in some code for the mp backend but none of it works (or is tested) and needs to be rewritten like the trio spawner likely. All still very WIP in case that wasn't clear XDzombie_lord_infinite
parent
5eb7c4c857
commit
348423ece7
|
@ -96,6 +96,7 @@ def try_set_start_method(name: str) -> Optional[mp.context.BaseContext]:
|
||||||
|
|
||||||
|
|
||||||
async def result_from_portal(
|
async def result_from_portal(
|
||||||
|
|
||||||
portal: Portal,
|
portal: Portal,
|
||||||
actor: Actor,
|
actor: Actor,
|
||||||
|
|
||||||
|
@ -103,7 +104,7 @@ async def result_from_portal(
|
||||||
cancel_on_result: bool = False,
|
cancel_on_result: bool = False,
|
||||||
task_status: TaskStatus[trio.CancelScope] = trio.TASK_STATUS_IGNORED,
|
task_status: TaskStatus[trio.CancelScope] = trio.TASK_STATUS_IGNORED,
|
||||||
|
|
||||||
) -> None:
|
) -> tuple[Optional[Any], Optional[BaseException]]:
|
||||||
"""
|
"""
|
||||||
Cancel actor gracefully once it's "main" portal's
|
Cancel actor gracefully once it's "main" portal's
|
||||||
result arrives.
|
result arrives.
|
||||||
|
@ -111,9 +112,11 @@ async def result_from_portal(
|
||||||
Should only be called for actors spawned with `run_in_actor()`.
|
Should only be called for actors spawned with `run_in_actor()`.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
__tracebackhide__ = True
|
# __tracebackhide__ = True
|
||||||
|
|
||||||
uid = portal.channel.uid
|
uid = portal.channel.uid
|
||||||
|
remote_result = None
|
||||||
|
is_remote_result = None
|
||||||
|
|
||||||
# cancel control is explicityl done by the caller
|
# cancel control is explicityl done by the caller
|
||||||
with trio.CancelScope() as cs:
|
with trio.CancelScope() as cs:
|
||||||
|
@ -129,45 +132,37 @@ async def result_from_portal(
|
||||||
# XXX: streams should never be reaped here since they should
|
# XXX: streams should never be reaped here since they should
|
||||||
# always be established and shutdown using a context manager api
|
# always be established and shutdown using a context manager api
|
||||||
result = await portal.result()
|
result = await portal.result()
|
||||||
|
is_remote_result = True
|
||||||
log.info(f"Returning final result: {result}")
|
log.info(f"Returning final result: {result}")
|
||||||
|
|
||||||
|
except RemoteActorError as rerr:
|
||||||
|
# this includes real remote errors as well as
|
||||||
|
# `ContextCancelled`
|
||||||
|
is_remote_result = True
|
||||||
|
result = rerr
|
||||||
|
|
||||||
except (Exception, trio.MultiError) as err:
|
except (Exception, trio.MultiError) as err:
|
||||||
# we reraise in the parent task via a ``trio.MultiError``
|
# we reraise in the parent task via a ``trio.MultiError``
|
||||||
result = err
|
is_remote_result = False
|
||||||
errors[actor.uid] = err
|
|
||||||
raise
|
|
||||||
|
|
||||||
except trio.Cancelled as err:
|
|
||||||
# lol, of course we need this too ;P
|
|
||||||
# TODO: merge with above?
|
|
||||||
log.warning(f"Cancelled `Portal.result()` waiter for {uid}")
|
|
||||||
result = err
|
result = err
|
||||||
# errors[actor.uid] = err
|
# errors[actor.uid] = err
|
||||||
# raise
|
# raise
|
||||||
|
|
||||||
if cancel_on_result:
|
if cs.cancelled_caught:
|
||||||
if isinstance(result, Exception):
|
log.warning(f"Cancelled `Portal.result()` waiter for {uid}")
|
||||||
# errors[actor.uid] = result
|
|
||||||
log.warning(
|
|
||||||
f"Cancelling single-task-run {uid} after error {result}"
|
|
||||||
)
|
|
||||||
# raise result
|
|
||||||
|
|
||||||
else:
|
return result, is_remote_result
|
||||||
log.runtime(
|
|
||||||
f"Cancelling {uid} gracefully "
|
|
||||||
f"after one-time-task result {result}")
|
|
||||||
|
|
||||||
# an actor that was `.run_in_actor()` executes a single task
|
# except trio.Cancelled as err:
|
||||||
# and delivers the result, then we cancel it.
|
# # lol, of course we need this too ;P
|
||||||
# TODO: likely in the future we should just implement this using
|
# # TODO: merge with above?
|
||||||
# the new `open_context()` IPC api, since it's the more general
|
# log.warning(f"Cancelled `Portal.result()` waiter for {uid}")
|
||||||
# api and can represent this form.
|
# result = err
|
||||||
# XXX: do we need this?
|
# # errors[actor.uid] = err
|
||||||
# await maybe_wait_for_debugger()
|
# raise
|
||||||
await portal.cancel_actor()
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
# return result
|
||||||
|
|
||||||
|
|
||||||
async def do_hard_kill(
|
async def do_hard_kill(
|
||||||
|
@ -209,17 +204,17 @@ async def reap_proc(
|
||||||
|
|
||||||
proc: trio.Process,
|
proc: trio.Process,
|
||||||
uid: tuple[str, str],
|
uid: tuple[str, str],
|
||||||
terminate_after: float = float('inf'),
|
terminate_after: Optional[float] = None,
|
||||||
hard_kill_after: int = 0.1,
|
hard_kill_after: int = 0.1,
|
||||||
|
|
||||||
) -> None:
|
) -> None:
|
||||||
with trio.move_on_after(terminate_after) as cs:
|
with trio.move_on_after(terminate_after or float('inf')) as cs:
|
||||||
# Wait for proc termination but **dont' yet** do
|
# Wait for proc termination but **dont' yet** do
|
||||||
# any out-of-ipc-land termination / process
|
# any out-of-ipc-land termination / process
|
||||||
# killing. This is a "light" (cancellable) join,
|
# killing. This is a "light" (cancellable) join,
|
||||||
# the hard join is below after timeout
|
# the hard join is below after timeout
|
||||||
await proc.wait()
|
await proc.wait()
|
||||||
log.info(f'{uid} terminated gracefully')
|
log.info(f'Proc for {uid} terminated gracefully')
|
||||||
|
|
||||||
if cs.cancelled_caught and terminate_after is not float('inf'):
|
if cs.cancelled_caught and terminate_after is not float('inf'):
|
||||||
# Always "hard" join lingering sub procs since no
|
# Always "hard" join lingering sub procs since no
|
||||||
|
@ -248,6 +243,7 @@ async def new_proc(
|
||||||
bind_addr: Tuple[str, int],
|
bind_addr: Tuple[str, int],
|
||||||
parent_addr: Tuple[str, int],
|
parent_addr: Tuple[str, int],
|
||||||
_runtime_vars: Dict[str, Any], # serialized and sent to _child
|
_runtime_vars: Dict[str, Any], # serialized and sent to _child
|
||||||
|
|
||||||
*,
|
*,
|
||||||
|
|
||||||
graceful_kill_timeout: int = 3,
|
graceful_kill_timeout: int = 3,
|
||||||
|
@ -357,6 +353,7 @@ async def new_proc(
|
||||||
) as cerr:
|
) as cerr:
|
||||||
|
|
||||||
log.exception(f'Relaying unexpected {cerr} to nursery')
|
log.exception(f'Relaying unexpected {cerr} to nursery')
|
||||||
|
await breakpoint()
|
||||||
|
|
||||||
# sending IPC-msg level cancel requests is expected to be
|
# sending IPC-msg level cancel requests is expected to be
|
||||||
# managed by the nursery.
|
# managed by the nursery.
|
||||||
|
@ -373,16 +370,16 @@ async def new_proc(
|
||||||
# True, # cancel_on_result
|
# True, # cancel_on_result
|
||||||
)
|
)
|
||||||
|
|
||||||
finally:
|
# Graceful reap attempt - 2 cases:
|
||||||
# 2 cases:
|
|
||||||
# - actor nursery was cancelled in which case
|
# - actor nursery was cancelled in which case
|
||||||
# we want to try a soft reap of the actor via
|
# we want to try a soft reap of the actor via
|
||||||
# ipc cancellation and then failing that do a hard
|
# ipc cancellation and then failing that do a hard
|
||||||
# reap.
|
# reap.
|
||||||
# - this is normal termination and we must wait indefinitely
|
# - this is normal termination and we must wait indefinitely
|
||||||
# for ria and daemon actors
|
# for ria to return and daemon actors to be cancelled
|
||||||
reaping_cancelled: bool = False
|
reaping_cancelled: bool = False
|
||||||
ria = portal in actor_nursery._cancel_after_result_on_exit
|
ria = portal in actor_nursery._cancel_after_result_on_exit
|
||||||
|
result = None
|
||||||
|
|
||||||
# this is the soft reap sequence. we can
|
# this is the soft reap sequence. we can
|
||||||
# either collect results:
|
# either collect results:
|
||||||
|
@ -399,80 +396,138 @@ async def new_proc(
|
||||||
try:
|
try:
|
||||||
log.cancel(f'Starting soft actor reap for {uid}')
|
log.cancel(f'Starting soft actor reap for {uid}')
|
||||||
cancel_scope = None
|
cancel_scope = None
|
||||||
# async with trio.open_nursery() as nursery:
|
reap_timeout = None
|
||||||
|
|
||||||
if portal.channel.connected() and ria:
|
if portal.channel.connected() and ria:
|
||||||
|
|
||||||
# we wait for result and cancel on completion
|
result, is_remote = await result_from_portal(
|
||||||
# if uid[0] == 'odds':
|
|
||||||
# await breakpoint()
|
|
||||||
await result_from_portal(
|
|
||||||
portal,
|
portal,
|
||||||
subactor,
|
subactor,
|
||||||
errors,
|
errors,
|
||||||
True, # cancel_on_result
|
|
||||||
)
|
|
||||||
# # collect any expected ``.run_in_actor()`` results
|
|
||||||
# cancel_scope = await nursery.start(
|
|
||||||
# result_from_portal,
|
|
||||||
# portal,
|
|
||||||
# subactor,
|
|
||||||
# errors,
|
|
||||||
# True, # cancel_on_result
|
# True, # cancel_on_result
|
||||||
# )
|
)
|
||||||
|
if is_remote:
|
||||||
|
if isinstance(result, RemoteActorError):
|
||||||
|
# errors[actor.uid] = result
|
||||||
|
if (
|
||||||
|
portal.cancel_called and
|
||||||
|
isinstance(result, ContextCancelled)
|
||||||
|
):
|
||||||
|
log.cancel(f'{uid} received expected cancel')
|
||||||
|
errors[uid] = result
|
||||||
|
|
||||||
|
# fall through to below soft proc reap
|
||||||
|
reap_timeout = 0.5
|
||||||
|
|
||||||
|
else:
|
||||||
|
log.warning(
|
||||||
|
f"Cancelling single-task-run {uid} after remote error {result}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# likely a real remote error propagation
|
||||||
|
# so pass up to nursery strat
|
||||||
|
should_raise = await actor_nursery._handle_err(
|
||||||
|
result,
|
||||||
|
portal=portal,
|
||||||
|
)
|
||||||
|
|
||||||
|
# propagate up to spawn nursery to be
|
||||||
|
# grouped into any multierror.
|
||||||
|
# if should_raise:
|
||||||
|
# raise result
|
||||||
|
|
||||||
|
else:
|
||||||
|
log.runtime(
|
||||||
|
f"Cancelling {uid} gracefully "
|
||||||
|
f"after one-time-task result {result}")
|
||||||
|
|
||||||
|
# an actor that was `.run_in_actor()` executes a single task
|
||||||
|
# and delivers the result, then we cancel it.
|
||||||
|
# TODO: likely in the future we should just implement this using
|
||||||
|
# the new `open_context()` IPC api, since it's the more general
|
||||||
|
# api and can represent this form.
|
||||||
|
# XXX: do we need this?
|
||||||
|
# await maybe_wait_for_debugger()
|
||||||
|
await portal.cancel_actor()
|
||||||
|
|
||||||
|
else:
|
||||||
|
log.exception(
|
||||||
|
f"Cancelling single-task-run {uid} after local error"
|
||||||
|
)
|
||||||
|
raise result
|
||||||
|
|
||||||
# soft & cancellable
|
# soft & cancellable
|
||||||
await reap_proc(proc, uid)
|
await reap_proc(proc, uid, terminate_after=reap_timeout)
|
||||||
|
|
||||||
# # if proc terminates before portal result
|
# except (
|
||||||
# if cancel_scope:
|
# ContextCancelled,
|
||||||
# cancel_scope.cancel()
|
# ) as err:
|
||||||
except (
|
# if portal.cancel_called:
|
||||||
ContextCancelled,
|
# log.cancel('{uid} received expected cancel')
|
||||||
) as err:
|
|
||||||
if portal.cancel_called:
|
|
||||||
log.cancel('{uid} received expected cancel')
|
|
||||||
|
|
||||||
# soft & cancellable
|
# # soft & cancellable
|
||||||
await reap_proc(proc, uid, terminate_after=0.1)
|
# await reap_proc(proc, uid, terminate_after=0.1)
|
||||||
|
|
||||||
except (
|
# except (
|
||||||
RemoteActorError,
|
# RemoteActorError,
|
||||||
) as err:
|
# ) as err:
|
||||||
reaping_cancelled = err
|
# reaping_cancelled = err
|
||||||
log.exception(f'{uid} remote error')
|
# log.exception(f'{uid} remote error')
|
||||||
await actor_nursery._handle_err(err, portal=portal)
|
# await actor_nursery._handle_err(err, portal=portal)
|
||||||
|
|
||||||
except (
|
except (
|
||||||
trio.Cancelled,
|
trio.Cancelled,
|
||||||
) as err:
|
) as err:
|
||||||
reaping_cancelled = err
|
|
||||||
if actor_nursery.cancelled:
|
|
||||||
log.cancel(f'{uid} wait cancelled by nursery')
|
|
||||||
else:
|
|
||||||
log.exception(f'{uid} soft wait error?')
|
|
||||||
|
|
||||||
except (
|
# NOTE: for now we pack the cancelleds and expect the actor
|
||||||
BaseException
|
# nursery to re-raise them in a multierror but we could
|
||||||
) as err:
|
# have also let them bubble up through the spawn nursery.
|
||||||
reaping_cancelled = err
|
|
||||||
log.exception(f'{uid} soft reap local error')
|
# in theory it's more correct to raise any
|
||||||
|
# `ContextCancelled` errors we get back from the
|
||||||
|
# `Portal.cancel_actor()` call and in that error
|
||||||
|
# have meta-data about whether we timeout out or
|
||||||
|
# actually got a cancel message back from the remote task.
|
||||||
|
|
||||||
|
# IF INSTEAD we raised *here* then this logic has to be
|
||||||
|
# handled inside the oca supervisor block and the spawn_n
|
||||||
|
# task cancelleds would have to be replaced with the remote
|
||||||
|
# task `ContextCancelled`s, *if* they ever arrive.
|
||||||
|
errors[uid] = err
|
||||||
|
# with trio.CancelScope(shield=True):
|
||||||
|
# await breakpoint()
|
||||||
|
|
||||||
|
if actor_nursery.cancel_called:
|
||||||
|
log.cancel(f'{uid} soft reap cancelled by nursery')
|
||||||
|
else:
|
||||||
|
if not actor_nursery._spawn_n.cancel_scope.cancel_called:
|
||||||
|
# this would be pretty weird and unexpected
|
||||||
|
await breakpoint()
|
||||||
|
|
||||||
|
# actor nursery wasn't cancelled before the spawn
|
||||||
|
# nursery was which likely means that there was
|
||||||
|
# an error in the actor nursery enter and the
|
||||||
|
# spawn nursery cancellation "beat" the call to
|
||||||
|
# .cancel()? that's a bug right?
|
||||||
|
|
||||||
|
# saw this with settings bugs in the ordermode pane in
|
||||||
|
# piker.
|
||||||
|
log.exception(f'{uid} soft wait error?')
|
||||||
|
raise RuntimeError(
|
||||||
|
'Task spawn nursery cancelled before actor nursery?')
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
if reaping_cancelled:
|
if reaping_cancelled:
|
||||||
|
assert actor_nursery.cancel_called
|
||||||
if actor_nursery.cancelled:
|
if actor_nursery.cancelled:
|
||||||
log.cancel(f'Nursery cancelled during soft wait for {uid}')
|
log.cancel(f'Nursery cancelled during soft wait for {uid}')
|
||||||
|
|
||||||
with trio.CancelScope(shield=True):
|
with trio.CancelScope(shield=True):
|
||||||
await maybe_wait_for_debugger()
|
await maybe_wait_for_debugger()
|
||||||
|
|
||||||
# XXX: can't do this, it'll hang some tests.. no
|
# XXX: we should probably just
|
||||||
# idea why yet.
|
# check for a `ContextCancelled` on portals
|
||||||
# with trio.CancelScope(shield=True):
|
# here and fill them in over `trio.Cancelled` right?
|
||||||
# await actor_nursery._handle_err(
|
|
||||||
# reaping_cancelled,
|
|
||||||
# portal=portal
|
|
||||||
# )
|
|
||||||
|
|
||||||
# hard reap sequence with timeouts
|
# hard reap sequence with timeouts
|
||||||
if proc.poll() is None:
|
if proc.poll() is None:
|
||||||
|
@ -519,15 +574,26 @@ async def new_proc(
|
||||||
# subactor
|
# subactor
|
||||||
subactor, proc, portal = actor_nursery._children.pop(
|
subactor, proc, portal = actor_nursery._children.pop(
|
||||||
subactor.uid)
|
subactor.uid)
|
||||||
|
|
||||||
if not actor_nursery._children:
|
if not actor_nursery._children:
|
||||||
|
# all subactor children have completed
|
||||||
log.cancel(f"{uid} reports all children complete!")
|
log.cancel(f"{uid} reports all children complete!")
|
||||||
|
|
||||||
actor_nursery._all_children_reaped.set()
|
actor_nursery._all_children_reaped.set()
|
||||||
|
|
||||||
|
spawn_n = actor_nursery._spawn_n
|
||||||
|
# with trio.CancelScope(shield=True):
|
||||||
|
# await breakpoint()
|
||||||
|
if not spawn_n._closed:
|
||||||
|
# the parent task that opened the actor nursery
|
||||||
|
# hasn't yet closed it so we cancel that task now.
|
||||||
|
spawn_n.cancel_scope.cancel()
|
||||||
|
|
||||||
# not entirely sure why we need this.. but without it
|
# not entirely sure why we need this.. but without it
|
||||||
# the reaping cancelled error is never reported upwards
|
# the reaping cancelled error is never reported upwards
|
||||||
# to the spawn nursery?
|
# to the spawn nursery?
|
||||||
if reaping_cancelled:
|
# if reaping_cancelled:
|
||||||
raise reaping_cancelled
|
# raise reaping_cancelled
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# `multiprocessing`
|
# `multiprocessing`
|
||||||
|
@ -644,6 +710,7 @@ async def mp_new_proc(
|
||||||
|
|
||||||
# no shield is required here (vs. above on the trio backend)
|
# no shield is required here (vs. above on the trio backend)
|
||||||
# since debug mode is not supported on mp.
|
# since debug mode is not supported on mp.
|
||||||
|
with trio.CancelScope(shield=True):
|
||||||
await actor_nursery._join_procs.wait()
|
await actor_nursery._join_procs.wait()
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
|
@ -659,13 +726,23 @@ async def mp_new_proc(
|
||||||
try:
|
try:
|
||||||
# async with trio.open_nursery() as n:
|
# async with trio.open_nursery() as n:
|
||||||
# n.cancel_scope.shield = True
|
# n.cancel_scope.shield = True
|
||||||
cancel_scope = await nursery.start(
|
print('soft mp reap')
|
||||||
result_from_portal,
|
# cancel_scope = await nursery.start(
|
||||||
|
result = await result_from_portal(
|
||||||
portal,
|
portal,
|
||||||
subactor,
|
subactor,
|
||||||
errors
|
errors,
|
||||||
|
# True,
|
||||||
)
|
)
|
||||||
except trio.Cancelled as err:
|
|
||||||
|
# except trio.Cancelled as err:
|
||||||
|
except BaseException as err:
|
||||||
|
|
||||||
|
log.exception('hard mp reap')
|
||||||
|
with trio.CancelScope(shield=True):
|
||||||
|
await actor_nursery._handle_err(err, portal=portal)
|
||||||
|
print('sent to nursery')
|
||||||
|
|
||||||
cancel_exc = err
|
cancel_exc = err
|
||||||
|
|
||||||
# if the reaping task was cancelled we may have hit
|
# if the reaping task was cancelled we may have hit
|
||||||
|
@ -675,23 +752,34 @@ async def mp_new_proc(
|
||||||
reaping_cancelled = True
|
reaping_cancelled = True
|
||||||
|
|
||||||
if proc.is_alive():
|
if proc.is_alive():
|
||||||
|
with trio.CancelScope(shield=True):
|
||||||
|
print('hard reaping')
|
||||||
with trio.move_on_after(0.1) as cs:
|
with trio.move_on_after(0.1) as cs:
|
||||||
cs.shield = True
|
cs.shield = True
|
||||||
await proc_waiter(proc)
|
await proc_waiter(proc)
|
||||||
|
|
||||||
if cs.cancelled_caught:
|
if cs.cancelled_caught:
|
||||||
|
print('pwning mp proc')
|
||||||
proc.terminate()
|
proc.terminate()
|
||||||
|
finally:
|
||||||
|
|
||||||
if not reaping_cancelled and proc.is_alive():
|
# if not reaping_cancelled and proc.is_alive():
|
||||||
await proc_waiter(proc)
|
# await proc_waiter(proc)
|
||||||
|
|
||||||
# TODO: timeout block here?
|
# TODO: timeout block here?
|
||||||
proc.join()
|
proc.join()
|
||||||
|
|
||||||
log.debug(f"Joined {proc}")
|
log.debug(f"Joined {proc}")
|
||||||
|
|
||||||
# pop child entry to indicate we are no longer managing subactor
|
# pop child entry to indicate we are no longer managing subactor
|
||||||
subactor, proc, portal = actor_nursery._children.pop(subactor.uid)
|
subactor, proc, portal = actor_nursery._children.pop(subactor.uid)
|
||||||
|
|
||||||
|
if not actor_nursery._children:
|
||||||
|
# all subactor children have completed
|
||||||
|
# log.cancel(f"{uid} reports all children complete!")
|
||||||
|
actor_nursery._all_children_reaped.set()
|
||||||
|
|
||||||
|
|
||||||
# cancel result waiter that may have been spawned in
|
# cancel result waiter that may have been spawned in
|
||||||
# tandem if not done already
|
# tandem if not done already
|
||||||
if cancel_scope:
|
if cancel_scope:
|
||||||
|
@ -700,6 +788,7 @@ async def mp_new_proc(
|
||||||
f"{subactor.uid}")
|
f"{subactor.uid}")
|
||||||
cancel_scope.cancel()
|
cancel_scope.cancel()
|
||||||
|
|
||||||
elif reaping_cancelled: # let the cancellation bubble up
|
if reaping_cancelled: # let the cancellation bubble up
|
||||||
|
print('raising')
|
||||||
assert cancel_exc
|
assert cancel_exc
|
||||||
raise cancel_exc
|
raise cancel_exc
|
||||||
|
|
|
@ -12,7 +12,7 @@ import trio
|
||||||
from async_generator import asynccontextmanager
|
from async_generator import asynccontextmanager
|
||||||
|
|
||||||
from . import _debug
|
from . import _debug
|
||||||
from ._debug import maybe_wait_for_debugger
|
from ._debug import maybe_wait_for_debugger, breakpoint
|
||||||
from ._state import current_actor, is_main_process, is_root_process
|
from ._state import current_actor, is_main_process, is_root_process
|
||||||
from .log import get_logger, get_loglevel
|
from .log import get_logger, get_loglevel
|
||||||
from ._actor import Actor
|
from ._actor import Actor
|
||||||
|
@ -48,10 +48,19 @@ class ActorNursery:
|
||||||
# cancelled when their "main" result arrives
|
# cancelled when their "main" result arrives
|
||||||
self._cancel_after_result_on_exit: set = set()
|
self._cancel_after_result_on_exit: set = set()
|
||||||
self.cancelled: bool = False
|
self.cancelled: bool = False
|
||||||
|
self._cancel_called: bool = False
|
||||||
self._join_procs = trio.Event()
|
self._join_procs = trio.Event()
|
||||||
self._all_children_reaped = trio.Event()
|
self._all_children_reaped = trio.Event()
|
||||||
self.errors = errors
|
self.errors = errors
|
||||||
|
|
||||||
|
@property
|
||||||
|
def cancel_called(self) -> bool:
|
||||||
|
'''
|
||||||
|
Same principle as ``trio.CancelScope.cancel_called``.
|
||||||
|
|
||||||
|
'''
|
||||||
|
return self._cancel_called
|
||||||
|
|
||||||
async def start_actor(
|
async def start_actor(
|
||||||
self,
|
self,
|
||||||
name: str,
|
name: str,
|
||||||
|
@ -177,17 +186,22 @@ class ActorNursery:
|
||||||
|
|
||||||
If ``hard_killl`` is set to ``True`` then kill the processes
|
If ``hard_killl`` is set to ``True`` then kill the processes
|
||||||
directly without any far end graceful ``trio`` cancellation.
|
directly without any far end graceful ``trio`` cancellation.
|
||||||
"""
|
|
||||||
self.cancelled = True
|
|
||||||
|
|
||||||
|
"""
|
||||||
# entries may be poppsed by the spawning backend as
|
# entries may be poppsed by the spawning backend as
|
||||||
# actors cancel individually
|
# actors cancel individually
|
||||||
childs = self._children.copy()
|
childs = self._children.copy()
|
||||||
|
|
||||||
|
if self.cancel_called:
|
||||||
|
log.warning(
|
||||||
|
f'Nursery with children {len(childs)} already cancelled')
|
||||||
|
return
|
||||||
|
|
||||||
log.cancel(
|
log.cancel(
|
||||||
f'Cancelling nursery in {self._actor.uid} with children\n'
|
f'Cancelling nursery in {self._actor.uid} with children\n'
|
||||||
f'{childs.keys()}'
|
f'{childs.keys()}'
|
||||||
)
|
)
|
||||||
|
self._cancel_called = True
|
||||||
|
|
||||||
# wake up all spawn tasks to move on as those nursery
|
# wake up all spawn tasks to move on as those nursery
|
||||||
# has ``__aexit__()``-ed
|
# has ``__aexit__()``-ed
|
||||||
|
@ -196,25 +210,45 @@ class ActorNursery:
|
||||||
await maybe_wait_for_debugger()
|
await maybe_wait_for_debugger()
|
||||||
|
|
||||||
# one-cancels-all strat
|
# one-cancels-all strat
|
||||||
|
try:
|
||||||
async with trio.open_nursery() as cancel_sender:
|
async with trio.open_nursery() as cancel_sender:
|
||||||
for subactor, proc, portal in childs.values():
|
for subactor, proc, portal in childs.values():
|
||||||
if proc.poll() is None and not portal.cancel_called:
|
if not portal.cancel_called and portal.channel.connected():
|
||||||
cancel_sender.start_soon(portal.cancel_actor)
|
cancel_sender.start_soon(portal.cancel_actor)
|
||||||
|
|
||||||
|
except trio.MultiError as err:
|
||||||
|
_err = err
|
||||||
|
log.exception(f'{self} errors during cancel')
|
||||||
|
# await breakpoint()
|
||||||
|
# # LOL, ok so multiprocessing requires this for some reason..
|
||||||
|
# with trio.CancelScope(shield=True):
|
||||||
|
# await trio.lowlevel.checkpoint()
|
||||||
|
|
||||||
# cancel all spawner tasks
|
# cancel all spawner tasks
|
||||||
# self._spawn_n.cancel_scope.cancel()
|
# self._spawn_n.cancel_scope.cancel()
|
||||||
|
self.cancelled = True
|
||||||
|
|
||||||
async def _handle_err(
|
async def _handle_err(
|
||||||
self,
|
self,
|
||||||
err: BaseException,
|
err: BaseException,
|
||||||
portal: Optional[Portal] = None,
|
portal: Optional[Portal] = None,
|
||||||
|
is_ctx_error: bool = False,
|
||||||
|
|
||||||
) -> None:
|
) -> bool:
|
||||||
# XXX: hypothetically an error could be
|
# XXX: hypothetically an error could be
|
||||||
# raised and then a cancel signal shows up
|
# raised and then a cancel signal shows up
|
||||||
# slightly after in which case the `else:`
|
# slightly after in which case the `else:`
|
||||||
# block here might not complete? For now,
|
# block here might not complete? For now,
|
||||||
# shield both.
|
# shield both.
|
||||||
|
if is_ctx_error:
|
||||||
|
assert not portal
|
||||||
|
uid = self._actor.uid
|
||||||
|
else:
|
||||||
|
uid = portal.channel.uid
|
||||||
|
|
||||||
|
if err not in self.errors.values():
|
||||||
|
self.errors[uid] = err
|
||||||
|
|
||||||
with trio.CancelScope(shield=True):
|
with trio.CancelScope(shield=True):
|
||||||
etype = type(err)
|
etype = type(err)
|
||||||
|
|
||||||
|
@ -228,13 +262,18 @@ class ActorNursery:
|
||||||
f"Nursery for {current_actor().uid} "
|
f"Nursery for {current_actor().uid} "
|
||||||
f"was cancelled with {etype}")
|
f"was cancelled with {etype}")
|
||||||
else:
|
else:
|
||||||
log.exception(
|
log.error(
|
||||||
f"Nursery for {current_actor().uid} "
|
f"Nursery for {current_actor().uid} "
|
||||||
f"errored with {err}, ")
|
f"errored from {uid} with\n{err}")
|
||||||
|
|
||||||
# cancel all subactors
|
# cancel all subactors
|
||||||
await self.cancel()
|
await self.cancel()
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
log.warning(f'Skipping duplicate error for {uid}')
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
@asynccontextmanager
|
@asynccontextmanager
|
||||||
async def _open_and_supervise_one_cancels_all_nursery(
|
async def _open_and_supervise_one_cancels_all_nursery(
|
||||||
|
@ -251,6 +290,7 @@ async def _open_and_supervise_one_cancels_all_nursery(
|
||||||
# actors spawned in "daemon mode" (aka started using
|
# actors spawned in "daemon mode" (aka started using
|
||||||
# ``ActorNursery.start_actor()``).
|
# ``ActorNursery.start_actor()``).
|
||||||
src_err: Optional[BaseException] = None
|
src_err: Optional[BaseException] = None
|
||||||
|
nurse_err: Optional[BaseException] = None
|
||||||
|
|
||||||
# errors from this daemon actor nursery bubble up to caller
|
# errors from this daemon actor nursery bubble up to caller
|
||||||
try:
|
try:
|
||||||
|
@ -303,9 +343,16 @@ async def _open_and_supervise_one_cancels_all_nursery(
|
||||||
src_err = err
|
src_err = err
|
||||||
|
|
||||||
# with trio.CancelScope(shield=True):
|
# with trio.CancelScope(shield=True):
|
||||||
await anursery._handle_err(err)
|
should_raise = await anursery._handle_err(err, is_ctx_error=True)
|
||||||
|
|
||||||
|
# XXX: raising here causes some cancellation
|
||||||
|
# / multierror tests to fail because of what appears to
|
||||||
|
# be double raise? we probably need to see how `trio`
|
||||||
|
# does this case..
|
||||||
|
if should_raise:
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
# except trio.MultiError as err:
|
||||||
except BaseException as err:
|
except BaseException as err:
|
||||||
# nursery bubble up
|
# nursery bubble up
|
||||||
nurse_err = err
|
nurse_err = err
|
||||||
|
@ -331,15 +378,15 @@ async def _open_and_supervise_one_cancels_all_nursery(
|
||||||
# collected in ``errors`` so cancel all actors, summarize
|
# collected in ``errors`` so cancel all actors, summarize
|
||||||
# all errors and re-raise.
|
# all errors and re-raise.
|
||||||
|
|
||||||
if src_err and src_err not in errors.values():
|
# await breakpoint()
|
||||||
errors[actor.uid] = src_err
|
|
||||||
|
|
||||||
if errors:
|
if errors:
|
||||||
|
# if nurse_err or src_err:
|
||||||
if anursery._children:
|
if anursery._children:
|
||||||
raise RuntimeError("WHERE TF IS THE ZOMBIE LORD!?!?!")
|
raise RuntimeError("WHERE TF IS THE ZOMBIE LORD!?!?!")
|
||||||
# with trio.CancelScope(shield=True):
|
# with trio.CancelScope(shield=True):
|
||||||
# await anursery.cancel()
|
# await anursery.cancel()
|
||||||
|
|
||||||
|
|
||||||
# use `MultiError` as needed
|
# use `MultiError` as needed
|
||||||
if len(errors) > 1:
|
if len(errors) > 1:
|
||||||
raise trio.MultiError(tuple(errors.values()))
|
raise trio.MultiError(tuple(errors.values()))
|
||||||
|
|
Loading…
Reference in New Issue