From a2171c7e7109a190f6786d27b5e298e43d33e8d2 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 22 Dec 2021 14:00:34 -0500 Subject: [PATCH 1/6] Cancel the `.cancel_actor()` request on proc death Adjust the `soft_wait()` strategy to avoid sending needless cancel requests if it is known that a child process is already terminated or does so before the cancel request times out. This should be no slower and should avoid needless waits on either closure-in-progress or already closed channels. Basic strategy is, - request child actor to cancel - if process termination is detected, cancel the cancel - if the process is still alive after a cancel request timeout warn the user and yield back to the hard reap handling --- tractor/_spawn.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/tractor/_spawn.py b/tractor/_spawn.py index ead91df..139b222 100644 --- a/tractor/_spawn.py +++ b/tractor/_spawn.py @@ -218,7 +218,9 @@ async def soft_wait( # ``trio.Process.__aexit__()`` (it tears down stdio # which will kill any waiting remote pdb trace). # This is a "soft" (cancellable) join/reap. + uid = portal.channel.uid try: + log.cancel(f'Soft waiting on actor:\n{uid}') await wait_func(proc) except trio.Cancelled: # if cancelled during a soft wait, cancel the child @@ -226,8 +228,26 @@ async def soft_wait( # below. This means we try to do a graceful teardown # via sending a cancel message before getting out # zombie killing tools. - with trio.CancelScope(shield=True): + async with trio.open_nursery() as n: + n.cancel_scope.shield = True + + async def cancel_on_proc_deth(): + ''' + Cancel the actor cancel request if we detect that + that the process terminated. + + ''' + await wait_func(proc) + n.cancel_scope.cancel() + + n.start_soon(cancel_on_proc_deth) await portal.cancel_actor() + + if proc.poll() is None: + log.warning( + f'Process still alive after cancel request:\n{uid}') + + n.cancel_scope.cancel() raise @@ -373,9 +393,8 @@ async def new_proc( # The "hard" reap since no actor zombies are allowed! # XXX: do this **after** cancellation/tearfown to avoid # killing the process too early. - log.cancel(f'Hard reap sequence starting for {uid}') - if proc: + log.cancel(f'Hard reap sequence starting for {uid}') with trio.CancelScope(shield=True): # don't clobber an ongoing pdb From b1d72b77c9ee6ffba7d8cd352c85f6418d5ae581 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 22 Dec 2021 14:53:36 -0500 Subject: [PATCH 2/6] Patch mp procs with a `.poll()` Not sure why they don't already expose this from the `Popen` backends but, k. --- tractor/_spawn.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tractor/_spawn.py b/tractor/_spawn.py index 139b222..f15f7b3 100644 --- a/tractor/_spawn.py +++ b/tractor/_spawn.py @@ -82,14 +82,16 @@ else: def try_set_start_method(name: str) -> Optional[mp.context.BaseContext]: - """Attempt to set the method for process starting, aka the "actor + ''' + Attempt to set the method for process starting, aka the "actor spawning backend". If the desired method is not supported this function will error. On Windows only the ``multiprocessing`` "spawn" method is offered besides the default ``trio`` which uses async wrapping around ``subprocess.Popen``. - """ + + ''' global _ctx global _spawn_method @@ -243,7 +245,7 @@ async def soft_wait( n.start_soon(cancel_on_proc_deth) await portal.cancel_actor() - if proc.poll() is None: + if proc.poll() is None: # type: ignore log.warning( f'Process still alive after cancel request:\n{uid}') @@ -502,6 +504,7 @@ async def mp_new_proc( # daemon=True, name=name, ) + # `multiprocessing` only (since no async interface): # register the process before start in case we get a cancel # request before the actor has fully spawned - then we can wait @@ -520,6 +523,11 @@ async def mp_new_proc( # local actor by the time we get a ref to it event, chan = await actor_nursery._actor.wait_for_peer( subactor.uid) + + # XXX: monkey patch poll API to match the ``subprocess`` API.. + # not sure why they don't expose this but kk. + proc.poll = proc._popen.poll # type: ignore + # except: # TODO: in the case we were cancelled before the sub-proc # registered itself back we must be sure to try and clean From 532974fb90522fa2eeda09ba5a6e37e729389e0d Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 23 Dec 2021 11:54:24 -0500 Subject: [PATCH 3/6] Drop leftover print --- tractor/trionics/_mngrs.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tractor/trionics/_mngrs.py b/tractor/trionics/_mngrs.py index 3834983..76f6467 100644 --- a/tractor/trionics/_mngrs.py +++ b/tractor/trionics/_mngrs.py @@ -170,7 +170,6 @@ async def maybe_open_context( await _Cache.lock.acquire() ctx_key = (id(acm_func), key or tuple(kwargs.items())) - print(ctx_key) value = None try: @@ -180,7 +179,7 @@ async def maybe_open_context( value = _Cache.values[ctx_key] except KeyError: - log.info(f'Allocating new resource for {ctx_key}') + log.info(f'Allocating new {acm_func} for {ctx_key}') mngr = acm_func(**kwargs) # TODO: avoid pulling from ``tractor`` internals and From 9650055519a325ac057240b64996cd1f7a4b3088 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 21 Jan 2022 09:05:35 -0500 Subject: [PATCH 4/6] Use `.exitcode` which is poll + error handling --- tractor/_spawn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tractor/_spawn.py b/tractor/_spawn.py index f15f7b3..3d7e6b1 100644 --- a/tractor/_spawn.py +++ b/tractor/_spawn.py @@ -526,7 +526,7 @@ async def mp_new_proc( # XXX: monkey patch poll API to match the ``subprocess`` API.. # not sure why they don't expose this but kk. - proc.poll = proc._popen.poll # type: ignore + proc.poll = lambda: proc.exitcode # type: ignore # except: # TODO: in the case we were cancelled before the sub-proc From 41296448e8c0d0351e3c372693384818ee8da9bf Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 21 Jan 2022 12:16:08 -0500 Subject: [PATCH 5/6] Add nooz --- nooz/248.misc.rst | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 nooz/248.misc.rst diff --git a/nooz/248.misc.rst b/nooz/248.misc.rst new file mode 100644 index 0000000..453c829 --- /dev/null +++ b/nooz/248.misc.rst @@ -0,0 +1,8 @@ +Adjust the `tractor._spawn.soft_wait()` strategy to avoid sending an +actor cancel request (via `Portal.cancel_actor()`) if either the child +process is detected as having terminated or the IPC channel is detected +to be closed. + +This ensures (even) more deterministic inter-actor cancellation by +avoiding the timeout condition where possible when a whild never +sucessfully spawned, crashed, or became un-contactable over IPC. From 4bf7992200c1e601d445be0d69cd2df3be5a9b1c Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 21 Jan 2022 13:05:26 -0500 Subject: [PATCH 6/6] Bump to alpha 5 dev --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 14a65f7..b17bb57 100755 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ with open('docs/README.rst', encoding='utf-8') as f: setup( name="tractor", - version='0.1.0a4', # alpha zone + version='0.1.0a5.dev', # alpha zone description='structured concurrrent "actors"', long_description=readme, license='GPLv3',