Merge pull request #266 from goodboy/faster_daemon_cancels

Faster graceful daemon cancels
agpl
goodboy 2021-11-30 09:29:13 -05:00 committed by GitHub
commit d05885d650
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 73 additions and 1 deletions

View File

@ -0,0 +1,12 @@
Fix graceful cancellation of daemon actors
Previously, his was a bug where if the soft wait on a sub-process (the
``await .proc.wait()``) in the reaper task teardown was cancelled we
would fail over to the hard reaping sequence (meant for culling off any
potential zombies via system kill signals). The hard reap has a timeout
of 3s (currently though in theory we could make it shorter?) before
system signalling kicks in. This means that any daemon actor still
running during nursery exit would get hard reaped (3s later) instead of
cancelled via IPC message. Now we catch the ``trio.Cancelled``, call
``Portal.cancel_actor()`` on the daemon and expect the child to
self-terminate after the runtime cancels and shuts down the process.

View File

@ -501,3 +501,53 @@ def test_cancel_while_childs_child_in_sync_sleep(
with pytest.raises(AssertionError):
trio.run(main)
def test_fast_graceful_cancel_when_spawn_task_in_soft_proc_wait_for_daemon(
start_method,
):
'''
This is a very subtle test which demonstrates how cancellation
during process collection can result in non-optimal teardown
performance on daemon actors. The fix for this test was to handle
``trio.Cancelled`` specially in the spawn task waiting in
`proc.wait()` such that ``Portal.cancel_actor()`` is called before
executing the "hard reap" sequence (which has an up to 3 second
delay currently).
In other words, if we can cancel the actor using a graceful remote
cancellation, and it's faster, we might as well do it.
'''
kbi_delay = 0.2
async def main():
start = time.time()
try:
async with trio.open_nursery() as nurse:
async with tractor.open_nursery() as tn:
p = await tn.start_actor(
'fast_boi',
enable_modules=[__name__],
)
async def delayed_kbi():
await trio.sleep(kbi_delay)
print(f'RAISING KBI after {kbi_delay} s')
raise KeyboardInterrupt
# start task which raises a kbi **after**
# the actor nursery ``__aexit__()`` has
# been run.
nurse.start_soon(delayed_kbi)
await p.run(do_nuthin)
finally:
duration = time.time() - start
if duration > 2.9:
raise trio.TooSlowError(
'daemon cancel was slower then necessary..'
)
with pytest.raises(KeyboardInterrupt):
trio.run(main)

View File

@ -295,7 +295,17 @@ async def new_proc(
# ``trio.Process.__aexit__()`` (it tears down stdio
# which will kill any waiting remote pdb trace).
# This is a "soft" (cancellable) join/reap.
try:
await proc.wait()
except trio.Cancelled:
# if cancelled during a soft wait, cancel the child
# actor before entering the hard reap sequence
# below. This means we try to do a graceful teardown
# via sending a cancel message before getting out
# zombie killing tools.
with trio.CancelScope(shield=True):
await portal.cancel_actor()
raise
# cancel result waiter that may have been spawned in
# tandem if not done already