Add `._implicit_runtime_started` mark, better logs

After some deep logging improvements to many parts of `._runtime`,
I realized a silly detail where we are always waiting on any opened
`local_nursery: ActorNursery` to signal exit from
`Actor._stream_handler()` even in the case of being an implicitly opened
root actor (`open_root_actor()` wasn't called by user/app code) via
`._supervise.open_nursery()`..

So, to address this add a `ActorNursery._implicit_runtime_started: bool`
that can be set and then checked to avoid doing the unnecessary
`.exited.wait()` (and any subsequent warn logging on an exit timeout) in
that special but most common case XD

Matching with other subsys log format refinements, improve readability
and simplicity of the actor-nursery supervisory log msgs, including:
- simplify and/or remove any content that more or less duplicates msg
  content found in emissions from lower-level primitives and sub-systems
  (like `._runtime`, `_context`, `_portal` etc.).
- add a specific `._open_and_supervise_one_cancels_all_nursery()`
  handler block for `ContextCancelled` to log with `.cancel()` level
  noting that the case is a "remote cancellation".
- put the nursery-exit and actor-tree shutdown status into a single msg
  in the `implicit_runtime` case.
modden_spawn_from_client_req
Tyler Goodlet 2024-03-01 15:44:01 -05:00
parent 50465d4b34
commit 08a6a51cb8
1 changed files with 73 additions and 23 deletions

View File

@ -34,7 +34,10 @@ from ._state import current_actor, is_main_process
from .log import get_logger, get_loglevel from .log import get_logger, get_loglevel
from ._runtime import Actor from ._runtime import Actor
from ._portal import Portal from ._portal import Portal
from ._exceptions import is_multi_cancelled from ._exceptions import (
is_multi_cancelled,
ContextCancelled,
)
from ._root import open_root_actor from ._root import open_root_actor
from . import _state from . import _state
from . import _spawn from . import _spawn
@ -104,6 +107,14 @@ class ActorNursery:
self.errors = errors self.errors = errors
self.exited = trio.Event() self.exited = trio.Event()
# NOTE: when no explicit call is made to
# `.open_root_actor()` by application code,
# `.open_nursery()` will implicitly call it to start the
# actor-tree runtime. In this case we mark ourselves as
# such so that runtime components can be aware for logging
# and syncing purposes to any actor opened nurseries.
self._implicit_runtime_started: bool = False
async def start_actor( async def start_actor(
self, self,
name: str, name: str,
@ -249,10 +260,11 @@ class ActorNursery:
''' '''
self.cancelled = True self.cancelled = True
log.cancel( # TODO: impl a repr for spawn more compact
'Cancelling actor nursery\n' # then `._children`..
f'|_{self._children}\n' children: dict = self._children
) child_count: int = len(children)
msg: str = f'Cancelling actor nursery with {child_count} children\n'
with trio.move_on_after(3) as cs: with trio.move_on_after(3) as cs:
async with trio.open_nursery() as tn: async with trio.open_nursery() as tn:
@ -263,7 +275,7 @@ class ActorNursery:
subactor, subactor,
proc, proc,
portal, portal,
) in self._children.values(): ) in children.values():
# TODO: are we ever even going to use this or # TODO: are we ever even going to use this or
# is the spawning backend responsible for such # is the spawning backend responsible for such
@ -275,12 +287,13 @@ class ActorNursery:
if portal is None: # actor hasn't fully spawned yet if portal is None: # actor hasn't fully spawned yet
event = self._actor._peer_connected[subactor.uid] event = self._actor._peer_connected[subactor.uid]
log.warning( log.warning(
f"{subactor.uid} wasn't finished spawning?") f"{subactor.uid} never 't finished spawning?"
)
await event.wait() await event.wait()
# channel/portal should now be up # channel/portal should now be up
_, _, portal = self._children[subactor.uid] _, _, portal = children[subactor.uid]
# XXX should be impossible to get here # XXX should be impossible to get here
# unless method was called from within # unless method was called from within
@ -299,11 +312,13 @@ class ActorNursery:
if portal.channel.connected(): if portal.channel.connected():
tn.start_soon(portal.cancel_actor) tn.start_soon(portal.cancel_actor)
log.cancel(msg)
# if we cancelled the cancel (we hung cancelling remote actors) # if we cancelled the cancel (we hung cancelling remote actors)
# then hard kill all sub-processes # then hard kill all sub-processes
if cs.cancelled_caught: if cs.cancelled_caught:
log.error( log.error(
f'Failed to cancel {self}\nHard killing process tree!' f'Failed to cancel {self}?\n'
'Hard killing underlying subprocess tree!\n'
) )
subactor: Actor subactor: Actor
proc: trio.Process proc: trio.Process
@ -312,7 +327,7 @@ class ActorNursery:
subactor, subactor,
proc, proc,
portal, portal,
) in self._children.values(): ) in children.values():
log.warning(f"Hard killing process {proc}") log.warning(f"Hard killing process {proc}")
proc.terminate() proc.terminate()
@ -390,26 +405,39 @@ async def _open_and_supervise_one_cancels_all_nursery(
# worry more are coming). # worry more are coming).
an._join_procs.set() an._join_procs.set()
# XXX: hypothetically an error could be # XXX NOTE XXX: hypothetically an error could
# raised and then a cancel signal shows up # be raised and then a cancel signal shows up
# slightly after in which case the `else:` # slightly after in which case the `else:`
# block here might not complete? For now, # block here might not complete? For now,
# shield both. # shield both.
with trio.CancelScope(shield=True): with trio.CancelScope(shield=True):
etype = type(inner_err) etype: type = type(inner_err)
if etype in ( if etype in (
trio.Cancelled, trio.Cancelled,
KeyboardInterrupt KeyboardInterrupt,
) or ( ) or (
is_multi_cancelled(inner_err) is_multi_cancelled(inner_err)
): ):
log.cancel( log.cancel(
f"Nursery for {current_actor().uid} " f'Actor-nursery cancelled by {etype}\n\n'
f"was cancelled with {etype}")
f'{current_actor().uid}\n'
f' |_{an}\n\n'
# TODO: show tb str?
# f'{tb_str}'
)
elif etype in {
ContextCancelled,
}:
log.cancel(
'Actor-nursery caught remote cancellation\n\n'
f'{inner_err.tb_str}'
)
else: else:
log.exception( log.exception(
f"Nursery for {current_actor().uid} " 'Nursery errored with:\n'
"errored with:"
# TODO: same thing as in # TODO: same thing as in
# `._invoke()` to compute how to # `._invoke()` to compute how to
@ -450,11 +478,15 @@ async def _open_and_supervise_one_cancels_all_nursery(
# ".run_in_actor()" actors then we also want to cancel all # ".run_in_actor()" actors then we also want to cancel all
# remaining sub-actors (due to our lone strategy: # remaining sub-actors (due to our lone strategy:
# one-cancels-all). # one-cancels-all).
log.cancel(f"Nursery cancelling due to {err}")
if an._children: if an._children:
log.cancel(
'Actor-nursery cancelling due error type:\n'
f'{err}\n'
)
with trio.CancelScope(shield=True): with trio.CancelScope(shield=True):
await an.cancel() await an.cancel()
raise raise
finally: finally:
# No errors were raised while awaiting ".run_in_actor()" # No errors were raised while awaiting ".run_in_actor()"
# actors but those actors may have returned remote errors as # actors but those actors may have returned remote errors as
@ -500,7 +532,7 @@ async def open_nursery(
which cancellation scopes correspond to each spawned subactor set. which cancellation scopes correspond to each spawned subactor set.
''' '''
implicit_runtime = False implicit_runtime: bool = False
actor = current_actor(err_on_no_runtime=False) actor = current_actor(err_on_no_runtime=False)
@ -512,7 +544,7 @@ async def open_nursery(
log.info("Starting actor runtime!") log.info("Starting actor runtime!")
# mark us for teardown on exit # mark us for teardown on exit
implicit_runtime = True implicit_runtime: bool = True
async with open_root_actor(**kwargs) as actor: async with open_root_actor(**kwargs) as actor:
assert actor is current_actor() assert actor is current_actor()
@ -521,8 +553,21 @@ async def open_nursery(
async with _open_and_supervise_one_cancels_all_nursery( async with _open_and_supervise_one_cancels_all_nursery(
actor actor
) as an: ) as an:
# NOTE: mark this nursery as having
# implicitly started the root actor so
# that `._runtime` machinery can avoid
# certain teardown synchronization
# blocking/waits and any associated (warn)
# logging when it's known that this
# nursery shouldn't be exited before the
# root actor is.
an._implicit_runtime_started = True
yield an yield an
finally: finally:
# XXX: this event will be set after the root actor
# runtime is already torn down, so we want to
# avoid any blocking on it.
an.exited.set() an.exited.set()
else: # sub-nursery case else: # sub-nursery case
@ -536,8 +581,13 @@ async def open_nursery(
an.exited.set() an.exited.set()
finally: finally:
log.debug("Nursery teardown complete") msg: str = (
'Actor-nursery exited\n'
f'|_{an}\n\n'
)
# shutdown runtime if it was started # shutdown runtime if it was started
if implicit_runtime: if implicit_runtime:
log.info("Shutting down actor tree") msg += '=> Shutting down actor runtime <=\n'
log.info(msg)