forked from goodboy/tractor
1
0
Fork 0

Tweaks to get us down to 3 failed cancel tests

The remaining errors all have to do with not collecting
`.run_in_actor()` errors into the exact same `MultiError` format as before.
Not even sure at this point whether the whole
collect-single-task-results-and-bubble behavior should be a thing, but
trying to keep the support for now I guess.

There are still issues with a hang in the pub sub tests and the one
debugger test has a different outcome due to the root getting the lock
from the breakpoint forever child too quickly.

- go back to raising portal result-that-are-errors in the spawn task
- go back to shielding the nursery close / proc join event
  - report any error on this shielded join and relay to nursery handler
    method (which should be customizable in the future for alternate
    strategies other than OCA) as well as try to collect the ria (run in actor) result
- drop async (via nursery) ria result collection, just do it sync with
  the soft `proc.wait()` reap immediately after, which should work
  presuming that the ipc connection will break on process termination
  anyway and it'll mean no multierror to deal with and no cancel scope
  to manage on the ria reaper task.
zombie_lord_infinite
Tyler Goodlet 2021-10-10 12:28:42 -04:00
parent 8a59713d48
commit 0dcffeee0f
1 changed files with 123 additions and 77 deletions

View File

@ -31,7 +31,7 @@ from .log import get_logger
from ._portal import Portal from ._portal import Portal
from ._actor import Actor from ._actor import Actor
from ._entry import _mp_main from ._entry import _mp_main
from ._exceptions import ActorFailure from ._exceptions import ActorFailure, RemoteActorError
from ._debug import maybe_wait_for_debugger from ._debug import maybe_wait_for_debugger
@ -109,6 +109,8 @@ async def result_from_portal(
""" """
__tracebackhide__ = True __tracebackhide__ = True
uid = portal.channel.uid
# cancel control is explicityl done by the caller # cancel control is explicityl done by the caller
with trio.CancelScope() as cs: with trio.CancelScope() as cs:
task_status.started(cs) task_status.started(cs)
@ -129,12 +131,12 @@ async def result_from_portal(
# we reraise in the parent task via a ``trio.MultiError`` # we reraise in the parent task via a ``trio.MultiError``
result = err result = err
errors[actor.uid] = err errors[actor.uid] = err
# raise raise
except trio.Cancelled as err: except trio.Cancelled as err:
# lol, of course we need this too ;P # lol, of course we need this too ;P
# TODO: merge with above? # TODO: merge with above?
log.warning(f"Cancelled result waiter for {portal.channel.uid}") log.warning(f"Cancelled `Portal.result()` waiter for {uid}")
result = err result = err
# errors[actor.uid] = err # errors[actor.uid] = err
# raise # raise
@ -143,14 +145,14 @@ async def result_from_portal(
if isinstance(result, Exception): if isinstance(result, Exception):
# errors[actor.uid] = result # errors[actor.uid] = result
log.warning( log.warning(
f"Cancelling {portal.channel.uid} after error {result}" f"Cancelling single-task-run {uid} after error {result}"
) )
raise result # raise result
else: else:
log.runtime( log.runtime(
f"Cancelling {portal.channel.uid} gracefully " f"Cancelling {uid} gracefully "
f"after result {result}") f"after one-time-task result {result}")
# an actor that was `.run_in_actor()` executes a single task # an actor that was `.run_in_actor()` executes a single task
# and delivers the result, then we cancel it. # and delivers the result, then we cancel it.
@ -185,9 +187,13 @@ async def do_hard_kill(
# would tear down stdstreams via ``trio.Process.aclose()``. # would tear down stdstreams via ``trio.Process.aclose()``.
async with proc: async with proc:
log.debug(f"Terminating {proc}") log.debug(f"Terminating {proc}")
# proc.terminate()
if cs.cancelled_caught: if cs.cancelled_caught:
# this is a "softer" kill that we should probably use
# eventually and let the zombie lord do the `.kill()`
# proc.terminate()
# XXX: should pretty much never get here unless we have # XXX: should pretty much never get here unless we have
# to move the bits from ``proc.__aexit__()`` out and # to move the bits from ``proc.__aexit__()`` out and
# into here. # into here.
@ -284,8 +290,10 @@ async def new_proc(
proc = await trio.open_process(spawn_cmd) proc = await trio.open_process(spawn_cmd)
log.info(f"Started {proc}") log.info(f"Started {proc}")
portal: Optional[Portal] = None portal: Optional[Portal] = None
# handle cancellation during child connect-back, kill
# any cancelled spawn sequence immediately.
try: try:
# wait for actor to spawn and connect back to us # wait for actor to spawn and connect back to us
# channel should have handshake completed by the # channel should have handshake completed by the
@ -300,9 +308,11 @@ async def new_proc(
with trio.CancelScope(shield=True): with trio.CancelScope(shield=True):
await do_hard_kill(proc, 0.1) await do_hard_kill(proc, 0.1)
# this should break here # TODO: should we have a custom error for this maybe derived
# from ``subprocess``?
raise raise
# the child successfully connected back to us.
actor_nursery_cancel_called = None actor_nursery_cancel_called = None
portal = Portal(chan) portal = Portal(chan)
actor_nursery._children[subactor.uid] = ( actor_nursery._children[subactor.uid] = (
@ -326,35 +336,36 @@ async def new_proc(
# resume caller at next checkpoint now that child is up # resume caller at next checkpoint now that child is up
task_status.started(portal) task_status.started(portal)
# wait on actor nursery to complete # this either completes or is cancelled and should only
# with trio.CancelScope(shield=True): # **and always** be set once the actor nursery has errored
# or exitted.
# this either completes or is cancelled with trio.CancelScope(shield=True):
# and should only arrive once the actor nursery
# has errored or exitted.
await actor_nursery._join_procs.wait() await actor_nursery._join_procs.wait()
except ( except (
trio.Cancelled, BaseException
# trio.Cancelled,
# KeyboardInterrupt, # KeyboardInterrupt,
# required to collect errors from multiple subactors # trio.MultiError,
trio.MultiError,
# RuntimeError, # RuntimeError,
) as cerr: ) as cerr:
actor_nursery_cancel_called = cerr
if actor_nursery.cancelled: log.exception(f'Relaying unexpected {cerr} to nursery')
log.cancel(f'{uid}: nursery cancelled before exit')
else:
log.error(f'Child {uid} was cancelled before nursery exit?')
# we were specifically cancelled by our parent nursery # sending IPC-msg level cancel requests is expected to be
# managed by the nursery.
with trio.CancelScope(shield=True): with trio.CancelScope(shield=True):
await actor_nursery._handle_err(err, portal=portal)
if portal.channel.connected(): if portal.channel.connected():
log.cancel(f'Sending cancel IPC-msg to {uid}') if ria:
# try to cancel the actor @ IPC level # this may raise which we want right?
await portal.cancel_actor() await result_from_portal(
portal,
subactor,
errors,
# True, # cancel_on_result
)
finally: finally:
# 2 cases: # 2 cases:
@ -382,68 +393,98 @@ async def new_proc(
try: try:
log.cancel(f'Starting soft actor reap for {uid}') log.cancel(f'Starting soft actor reap for {uid}')
cancel_scope = None cancel_scope = None
async with trio.open_nursery() as nursery: # async with trio.open_nursery() as nursery:
if ria:
# collect any expected ``.run_in_actor()`` results if portal.channel.connected() and ria:
cancel_scope = await nursery.start(
result_from_portal, # we wait for result and cancel on completion
await result_from_portal(
portal, portal,
subactor, subactor,
errors, errors,
True, # cancel_on_result True, # cancel_on_result
) )
# # collect any expected ``.run_in_actor()`` results
# cancel_scope = await nursery.start(
# result_from_portal,
# portal,
# subactor,
# errors,
# True, # cancel_on_result
# )
# soft & cancellable # soft & cancellable
await reap_proc(proc) await reap_proc(proc)
# if proc terminates before portal result # # if proc terminates before portal result
if cancel_scope: # if cancel_scope:
cancel_scope.cancel() # cancel_scope.cancel()
except (
RemoteActorError,
) as err:
reaping_cancelled = err
log.exception(f'{uid} remote error')
await actor_nursery._handle_err(err, portal=portal)
except ( except (
trio.Cancelled, trio.Cancelled,
# is this required to collect errors from multiple subactors? ) as err:
trio.MultiError, reaping_cancelled = err
) as rerr:
# nursery was closed but was cancelled during normal
# reaping.
reaping_cancelled = rerr
if actor_nursery.cancelled: if actor_nursery.cancelled:
log.cancel(f'Nursery cancelled during soft reap for {uid}') log.cancel(f'{uid} wait cancelled by nursery')
else:
log.exception(f'{uid} soft wait error?')
# hard reap sequence except (
if proc.poll() is None: BaseException
log.cancel('Attempting hard reap for {uid}') ) as err:
reaping_cancelled = err
log.exception(f'{uid} soft reap local error')
# hard reap sequence finally:
await maybe_wait_for_debugger() if reaping_cancelled:
if actor_nursery.cancelled:
log.cancel(f'Nursery cancelled during soft wait for {uid}')
with trio.CancelScope(shield=True): with trio.CancelScope(shield=True):
if portal.channel.connected(): await maybe_wait_for_debugger()
# cancel the process @ the IPC level
await portal.cancel_actor()
# TODO: do we need to try the ria portals # XXX: can't do this, it'll hang some tests.. no
# again? # idea why yet.
# await result_from_portal( # with trio.CancelScope(shield=True):
# portal, # await actor_nursery._handle_err(
# subactor, # reaping_cancelled,
# errors # portal=portal
# ) # )
# hard zombie lord reap, with timeout # hard reap sequence with timeouts
if proc.poll() is None:
log.cancel(f'Attempting hard reap for {uid}')
with trio.CancelScope(shield=True):
# hard reap sequence
# ``Portal.cancel_actor()`` is expected to have
# been called by the supervising nursery so we
# do **not** call it here.
await reap_proc( await reap_proc(
proc, proc,
terminate_after=2, # this is the same as previous timeout
# setting before rewriting this spawn
# section
terminate_after=3,
) )
finally:
# 2 cases:
# - the actor terminated gracefully
# - we're cancelled and likely need to re-raise
# if somehow the hard reap didn't collect the child then
# we send in the big gunz.
while proc.poll() is None: while proc.poll() is None:
log.critical("ZOMBIE LORD HAS ARRIVED for your {proc}") log.critical(
f'ZOMBIE LORD HAS ARRIVED for your {uid}:\n'
f'{proc}'
)
with trio.CancelScope(shield=True): with trio.CancelScope(shield=True):
await reap_proc( await reap_proc(
proc, proc,
@ -452,16 +493,21 @@ async def new_proc(
log.info(f"Joined {proc}") log.info(f"Joined {proc}")
# 2 cases:
# - the actor terminated gracefully
# - we're cancelled and likely need to re-raise
# pop child entry to indicate we no longer managing this # pop child entry to indicate we no longer managing this
# subactor # subactor
subactor, proc, portal = actor_nursery._children.pop( subactor, proc, portal = actor_nursery._children.pop(
subactor.uid) subactor.uid)
if not actor_nursery._children: if not actor_nursery._children:
log.cancel(f"{uid} reports all children complete!")
actor_nursery._all_children_reaped.set() actor_nursery._all_children_reaped.set()
if actor_nursery_cancel_called: # not entirely sure why we need this.. but without it
raise actor_nursery_cancel_called # the reaping cancelled error is never reported upwards
# to the spawn nursery?
if reaping_cancelled: if reaping_cancelled:
raise reaping_cancelled raise reaping_cancelled