diff --git a/tractor/_spawn.py b/tractor/_spawn.py index e55e59f..2936220 100644 --- a/tractor/_spawn.py +++ b/tractor/_spawn.py @@ -204,6 +204,21 @@ async def do_hard_kill( # terminate_after: int = 99999, ) -> None: + ''' + Un-gracefully terminate an OS level `trio.Process` after timeout. + + Used in 2 main cases: + + - "unknown remote runtime state": a hanging/stalled actor that + isn't responding after sending a (graceful) runtime cancel + request via an IPC msg. + - "cancelled during spawn": a process who's actor runtime was + cancelled before full startup completed (such that + cancel-request-handling machinery was never fully + initialized) and thus a "cancel request msg" is never going + to be handled. + + ''' # NOTE: this timeout used to do nothing since we were shielding # the ``.wait()`` inside ``new_proc()`` which will pretty much # never release until the process exits, now it acts as @@ -219,6 +234,9 @@ async def do_hard_kill( # and wait for it to exit. If cancelled, kills the process and # waits for it to finish exiting before propagating the # cancellation. + # + # This code was originally triggred by ``proc.__aexit__()`` + # but now must be called manually. with trio.CancelScope(shield=True): if proc.stdin is not None: await proc.stdin.aclose() @@ -234,10 +252,14 @@ async def do_hard_kill( with trio.CancelScope(shield=True): await proc.wait() + # XXX NOTE XXX: zombie squad dispatch: + # (should ideally never, but) If we do get here it means + # graceful termination of a process failed and we need to + # resort to OS level signalling to interrupt and cancel the + # (presumably stalled or hung) actor. Since we never allow + # zombies (as a feature) we ask the OS to do send in the + # removal swad as the last resort. if cs.cancelled_caught: - # XXX: should pretty much never get here unless we have - # to move the bits from ``proc.__aexit__()`` out and - # into here. log.critical(f"#ZOMBIE_LORD_IS_HERE: {proc}") proc.kill() @@ -252,10 +274,13 @@ async def soft_wait( portal: Portal, ) -> None: - # Wait for proc termination but **dont' yet** call - # ``trio.Process.__aexit__()`` (it tears down stdio - # which will kill any waiting remote pdb trace). - # This is a "soft" (cancellable) join/reap. + ''' + Wait for proc termination but **dont' yet** teardown + std-streams (since it will clobber any ongoing pdb REPL + session). This is our "soft" (and thus itself cancellable) + join/reap on an actor-runtime-in-process. + + ''' uid = portal.channel.uid try: log.cancel(f'Soft waiting on actor:\n{uid}') @@ -278,7 +303,13 @@ async def soft_wait( await wait_func(proc) n.cancel_scope.cancel() + # start a task to wait on the termination of the + # process by itself waiting on a (caller provided) wait + # function which should unblock when the target process + # has terminated. n.start_soon(cancel_on_proc_deth) + + # send the actor-runtime a cancel request. await portal.cancel_actor() if proc.poll() is None: # type: ignore