Merge pull request #234 from goodboy/root_tty_hangs

Root tty hangs
alpha2
goodboy 2021-09-02 16:58:25 -04:00 committed by GitHub
commit 87ce6c8eb3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 70 additions and 26 deletions

View File

@ -0,0 +1,6 @@
Handle broken channel/stream faults where the root's tty lock is left acquired by some
child actor who went MIA and the root ends up hanging indefinitely.
There's two parts here:
- Don't shield wait on the lock
- Always do our best to release the lock on the expected worst case connection faults

View File

@ -23,6 +23,7 @@ from ._exceptions import (
from ._debug import breakpoint, post_mortem
from . import msg
from ._root import run, run_daemon, open_root_actor
from ._portal import Portal
__all__ = [
@ -40,6 +41,7 @@ __all__ = [
'msg',
'open_nursery',
'open_root_actor',
'Portal',
'post_mortem',
'run',
'run_daemon',

View File

@ -196,6 +196,22 @@ async def _acquire_debug_lock(
log.debug(f"TTY lock released, remote task: {task_name}:{uid}")
def handler(signum, frame, *args):
"""Specialized debugger compatible SIGINT handler.
In childred we always ignore to avoid deadlocks since cancellation
should always be managed by the parent supervising actor. The root
is always cancelled on ctrl-c.
"""
if is_root_process():
tractor.current_actor().cancel_soon()
else:
print(
"tractor ignores SIGINT while in debug mode\n"
"If you have a special need for it please open an issue.\n"
)
@tractor.context
async def _hijack_stdin_for_child(
@ -222,24 +238,36 @@ async def _hijack_stdin_for_child(
with trio.CancelScope(shield=True):
async with _acquire_debug_lock(subactor_uid):
try:
lock = None
async with _acquire_debug_lock(subactor_uid) as lock:
# indicate to child that we've locked stdio
await ctx.started('Locked')
log.pdb(f"Actor {subactor_uid} ACQUIRED stdin hijack lock")
# indicate to child that we've locked stdio
await ctx.started('Locked')
log.pdb(f"Actor {subactor_uid} ACQUIRED stdin hijack lock")
# wait for unlock pdb by child
async with ctx.open_stream() as stream:
try:
# wait for unlock pdb by child
async with ctx.open_stream() as stream:
assert await stream.receive() == 'pdb_unlock'
except trio.BrokenResourceError:
# XXX: there may be a race with the portal teardown
# with the calling actor which we can safely ignore.
# The alternative would be sending an ack message
# and allowing the client to wait for us to teardown
# first?
pass
# try:
# assert await stream.receive() == 'pdb_unlock'
except (
trio.BrokenResourceError,
trio.Cancelled, # by local cancellation
trio.ClosedResourceError, # by self._rx_chan
) as err:
# XXX: there may be a race with the portal teardown
# with the calling actor which we can safely ignore.
# The alternative would be sending an ack message
# and allowing the client to wait for us to teardown
# first?
if lock and lock.locked():
lock.release()
if isinstance(err, trio.Cancelled):
raise
log.debug(f"TTY lock released, remote task: {task_name}:{subactor_uid}")
@ -283,7 +311,7 @@ async def _breakpoint(
try:
async with get_root() as portal:
log.error('got portal')
log.pdb('got portal')
# this syncs to child's ``Context.started()`` call.
async with portal.open_context(
@ -293,7 +321,7 @@ async def _breakpoint(
) as (ctx, val):
log.error('locked context')
log.pdb('locked context')
assert val == 'Locked'
async with ctx.open_stream() as stream:
@ -376,10 +404,17 @@ async def _breakpoint(
# callbacks. Can't think of a nicer way then this atm.
if _debug_lock.locked():
log.warning(
'Root actor attempting to acquire active tty lock'
'Root actor attempting to shield-acquire active tty lock'
f' owned by {_global_actor_in_debug}')
await _debug_lock.acquire()
with trio.CancelScope(shield=True):
# must shield here to avoid hitting a ``Cancelled`` and
# a child getting stuck bc we clobbered the tty
await _debug_lock.acquire()
else:
# may be cancelled
await _debug_lock.acquire()
_global_actor_in_debug = actor.uid
_local_task_in_debug = task_name
@ -402,12 +437,13 @@ async def _breakpoint(
debug_func(actor)
def _mk_pdb():
def _mk_pdb() -> PdbwTeardown:
# XXX: setting these flags on the pdb instance are absolutely
# critical to having ctrl-c work in the ``trio`` standard way!
# The stdlib's pdb supports entering the current sync frame
# on a SIGINT, with ``trio`` we pretty much never want this
# and we did we can handle it in the ``tractor`` task runtime.
# critical to having ctrl-c work in the ``trio`` standard way! The
# stdlib's pdb supports entering the current sync frame on a SIGINT,
# with ``trio`` we pretty much never want this and if we did we can
# handle it in the ``tractor`` task runtime.
pdb = PdbwTeardown()
pdb.allow_kbdint = True

View File

@ -62,7 +62,7 @@ def _trio_main(
"""
# Disable sigint handling in children;
# we don't need it thanks to our cancellation machinery.
signal.signal(signal.SIGINT, signal.SIG_IGN)
# signal.signal(signal.SIGINT, signal.SIG_IGN)
log.info(f"Started new trio process for {actor.uid}")

View File

@ -298,8 +298,8 @@ async def _open_and_supervise_one_cancels_all_nursery(
f'child {_debug._global_actor_in_debug}\n'
'Waiting on tty lock to release..')
with trio.CancelScope(shield=True):
await debug_complete.wait()
# with trio.CancelScope(shield=True):
await debug_complete.wait()
# if the caller's scope errored then we activate our
# one-cancels-all supervisor strategy (don't