From 121f7fd8440c69c7822dde0d839a836e090e7e83 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 29 Nov 2021 12:41:40 -0500 Subject: [PATCH 1/4] Draft test that shows a slow daemon cancellation Currently if the spawn task is waiting on a daemon actor it is likely in `await proc.wait()`, however, if the actor nursery is subsequently cancelled this checkpoint will be abandoned and the hard proc reaping sequence will execute which results in a up to 3 second wait before a "hard" system signal is sent to the child. Ideally such a cancelled-during-daemon-actor-wait condition is instead handled by first trying to cancel the remote actor using `Portal.cancel_actor()` (a "graceful" remote cancel request) which should (presuming normal runtime operation) result in an immediate collection of the process after normal actor (remotely triggered) runtime cancellation. --- tests/test_cancellation.py | 50 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/tests/test_cancellation.py b/tests/test_cancellation.py index a589f81..726ffd9 100644 --- a/tests/test_cancellation.py +++ b/tests/test_cancellation.py @@ -501,3 +501,53 @@ def test_cancel_while_childs_child_in_sync_sleep( with pytest.raises(AssertionError): trio.run(main) + + +def test_fast_graceful_cancel_when_spawn_task_in_soft_proc_wait_for_daemon( + start_method, +): + ''' + This is a very subtle test which demonstrates how cancellation + during process collection can result in non-optimal teardown + performance on daemon actors. The fix for this test was to handle + ``trio.Cancelled`` specially in the spawn task waiting in + `proc.wait()` such that ``Portal.cancel_actor()`` is called before + executing the "hard reap" sequence (which has an up to 3 second + delay currently). + + In other words, if we can cancel the actor using a graceful remote + cancellation, and it's faster, we might as well do it. + + ''' + kbi_delay = 0.2 + + async def main(): + start = time.time() + try: + async with trio.open_nursery() as nurse: + async with tractor.open_nursery() as tn: + p = await tn.start_actor( + 'fast_boi', + enable_modules=[__name__], + ) + + async def delayed_kbi(): + await trio.sleep(kbi_delay) + print(f'RAISING KBI after {kbi_delay} s') + raise KeyboardInterrupt + + # start task which raises a kbi **after** + # the actor nursery ``__aexit__()`` has + # been run. + nurse.start_soon(delayed_kbi) + + await p.run(do_nuthin) + finally: + duration = time.time() - start + if duration > 1: + raise trio.TooSlowError( + 'daemon cancel was slower then necessary..' + ) + + with pytest.raises(KeyboardInterrupt): + trio.run(main) From 7eb465a69941535e0c9407fbd84187f0c828ac61 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 25 Nov 2021 17:14:16 -0500 Subject: [PATCH 2/4] Graceful cancel actors before hard reaping --- tractor/_spawn.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tractor/_spawn.py b/tractor/_spawn.py index bca812d..4fb05d7 100644 --- a/tractor/_spawn.py +++ b/tractor/_spawn.py @@ -295,7 +295,17 @@ async def new_proc( # ``trio.Process.__aexit__()`` (it tears down stdio # which will kill any waiting remote pdb trace). # This is a "soft" (cancellable) join/reap. - await proc.wait() + try: + await proc.wait() + except trio.Cancelled: + # if cancelled during a soft wait cancel the child + # actor before entering the hard reap + # sequence below. This means we try to do a + # graceful teardown before getting out zombie + # killing tools. + with trio.CancelScope(shield=True): + await portal.cancel_actor() + raise # cancel result waiter that may have been spawned in # tandem if not done already From 16a3321a3883863ecc8201f9d47d38845183a340 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 29 Nov 2021 19:43:10 -0500 Subject: [PATCH 3/4] Increase timeout for windows.. --- tests/test_cancellation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_cancellation.py b/tests/test_cancellation.py index 726ffd9..9f8ae0d 100644 --- a/tests/test_cancellation.py +++ b/tests/test_cancellation.py @@ -544,7 +544,7 @@ def test_fast_graceful_cancel_when_spawn_task_in_soft_proc_wait_for_daemon( await p.run(do_nuthin) finally: duration = time.time() - start - if duration > 1: + if duration > 2.9: raise trio.TooSlowError( 'daemon cancel was slower then necessary..' ) From 77fc705b1f0f3840683b95488cbb06f4ad3065bc Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 29 Nov 2021 22:52:19 -0500 Subject: [PATCH 4/4] Add nooz --- newsfragments/266.bug.rst | 12 ++++++++++++ tractor/_spawn.py | 10 +++++----- 2 files changed, 17 insertions(+), 5 deletions(-) create mode 100644 newsfragments/266.bug.rst diff --git a/newsfragments/266.bug.rst b/newsfragments/266.bug.rst new file mode 100644 index 0000000..a246baf --- /dev/null +++ b/newsfragments/266.bug.rst @@ -0,0 +1,12 @@ +Fix graceful cancellation of daemon actors + +Previously, his was a bug where if the soft wait on a sub-process (the +``await .proc.wait()``) in the reaper task teardown was cancelled we +would fail over to the hard reaping sequence (meant for culling off any +potential zombies via system kill signals). The hard reap has a timeout +of 3s (currently though in theory we could make it shorter?) before +system signalling kicks in. This means that any daemon actor still +running during nursery exit would get hard reaped (3s later) instead of +cancelled via IPC message. Now we catch the ``trio.Cancelled``, call +``Portal.cancel_actor()`` on the daemon and expect the child to +self-terminate after the runtime cancels and shuts down the process. diff --git a/tractor/_spawn.py b/tractor/_spawn.py index 4fb05d7..6ede566 100644 --- a/tractor/_spawn.py +++ b/tractor/_spawn.py @@ -298,11 +298,11 @@ async def new_proc( try: await proc.wait() except trio.Cancelled: - # if cancelled during a soft wait cancel the child - # actor before entering the hard reap - # sequence below. This means we try to do a - # graceful teardown before getting out zombie - # killing tools. + # if cancelled during a soft wait, cancel the child + # actor before entering the hard reap sequence + # below. This means we try to do a graceful teardown + # via sending a cancel message before getting out + # zombie killing tools. with trio.CancelScope(shield=True): await portal.cancel_actor() raise