diff --git a/newsfragments/266.bug.rst b/newsfragments/266.bug.rst new file mode 100644 index 0000000..a246baf --- /dev/null +++ b/newsfragments/266.bug.rst @@ -0,0 +1,12 @@ +Fix graceful cancellation of daemon actors + +Previously, his was a bug where if the soft wait on a sub-process (the +``await .proc.wait()``) in the reaper task teardown was cancelled we +would fail over to the hard reaping sequence (meant for culling off any +potential zombies via system kill signals). The hard reap has a timeout +of 3s (currently though in theory we could make it shorter?) before +system signalling kicks in. This means that any daemon actor still +running during nursery exit would get hard reaped (3s later) instead of +cancelled via IPC message. Now we catch the ``trio.Cancelled``, call +``Portal.cancel_actor()`` on the daemon and expect the child to +self-terminate after the runtime cancels and shuts down the process. diff --git a/tests/test_cancellation.py b/tests/test_cancellation.py index a589f81..9f8ae0d 100644 --- a/tests/test_cancellation.py +++ b/tests/test_cancellation.py @@ -501,3 +501,53 @@ def test_cancel_while_childs_child_in_sync_sleep( with pytest.raises(AssertionError): trio.run(main) + + +def test_fast_graceful_cancel_when_spawn_task_in_soft_proc_wait_for_daemon( + start_method, +): + ''' + This is a very subtle test which demonstrates how cancellation + during process collection can result in non-optimal teardown + performance on daemon actors. The fix for this test was to handle + ``trio.Cancelled`` specially in the spawn task waiting in + `proc.wait()` such that ``Portal.cancel_actor()`` is called before + executing the "hard reap" sequence (which has an up to 3 second + delay currently). + + In other words, if we can cancel the actor using a graceful remote + cancellation, and it's faster, we might as well do it. + + ''' + kbi_delay = 0.2 + + async def main(): + start = time.time() + try: + async with trio.open_nursery() as nurse: + async with tractor.open_nursery() as tn: + p = await tn.start_actor( + 'fast_boi', + enable_modules=[__name__], + ) + + async def delayed_kbi(): + await trio.sleep(kbi_delay) + print(f'RAISING KBI after {kbi_delay} s') + raise KeyboardInterrupt + + # start task which raises a kbi **after** + # the actor nursery ``__aexit__()`` has + # been run. + nurse.start_soon(delayed_kbi) + + await p.run(do_nuthin) + finally: + duration = time.time() - start + if duration > 2.9: + raise trio.TooSlowError( + 'daemon cancel was slower then necessary..' + ) + + with pytest.raises(KeyboardInterrupt): + trio.run(main) diff --git a/tractor/_spawn.py b/tractor/_spawn.py index bca812d..6ede566 100644 --- a/tractor/_spawn.py +++ b/tractor/_spawn.py @@ -295,7 +295,17 @@ async def new_proc( # ``trio.Process.__aexit__()`` (it tears down stdio # which will kill any waiting remote pdb trace). # This is a "soft" (cancellable) join/reap. - await proc.wait() + try: + await proc.wait() + except trio.Cancelled: + # if cancelled during a soft wait, cancel the child + # actor before entering the hard reap sequence + # below. This means we try to do a graceful teardown + # via sending a cancel message before getting out + # zombie killing tools. + with trio.CancelScope(shield=True): + await portal.cancel_actor() + raise # cancel result waiter that may have been spawned in # tandem if not done already