From fc95c6719f7aa686d7a46752de7c21ff913b0e6f Mon Sep 17 00:00:00 2001
From: Tyler Goodlet
Date: Mon, 8 Jul 2024 20:57:41 -0400
Subject: [PATCH] Get multi-threaded sync-pausing fully working!

The final issue was making sure we do the same thing on ctl-c/SIGINT
from the user: that is, if there's already a bg-thread in the REPL, we
`log.pdb()` about the SIGINT shielding and re-draw the prompt; the same
UX as normal actor-runtime-task behaviour.

Reasons this wasn't working, and the fixes:
- `.pause_from_sync()` was overriding the local `repl` var with the
  `None` delivered by (transitive) calls to `_pause(debug_func=None)`;
  so remove all that and only assign it OAOO, prior to the thread-type
  case branching.
- always call `DebugStatus.shield_sigint()` as needed from all
  requesting threads/tasks:
  - in `_pause_from_bg_root_thread()` BEFORE calling `._pause()` AND
    BEFORE yielding back to the bg-thread via `.started(out)` to ensure
    we're definitely overriding the handler in the `trio`-main-thread
    task before unblocking the requesting bg-thread.
  - from any requesting bg-thread in the root actor such that both its
    main-`trio`-thread scheduled task (as per the above bullet) AND the
    bg-thread itself are SIGINT shielded.
  - always call `.shield_sigint()` BEFORE any `greenback._await()` case
    (don't entirely grok why yet, but it works).
  - for the `greenback._await()` case, always set `bg_task` to the
    current one.
- tweaks to the `SIGINT` handler, now renamed `sigint_shield()` so as
  not to name-collide with the methods when editor-searching:
  - always try to `repr()` the REPL thread/task "owner" as well as the
    active `PdbREPL` instance.
  - add `.devx()` notes around the prompt-flushing details and comments
    for any root-actor-bg-thread edge cases.

Related/supporting refinements:
- add `get_lock()`/`get_debug_req()` factory funcs since the plan is to
  eventually implement both as `@singleton` instances per actor.
- fix `acquire_debug_lock()`'s call-sig bug when scheduling
  `request_root_stdio_lock()`.
- in `._pause()` only call `mk_pdb()` when `debug_func != None`.
- add some todo/warning notes around the `cls.repl = None` in
  `DebugStatus.release()`.

`test_pause_from_sync()` tweaks:
- don't use an `attach_patts.copy()`, since we always `break` on match.
- do a `pytest.fail()` on that ^ loop's fallthrough.
- pass `do_ctlc(child, patt=attach_key)` such that we always match the
  current thread's name with the ctl-c triggered `.pdb()` emission.
- oh yeah, return the last `before: str` from `do_ctlc()`.
- in the example script, flip to `abandon_on_cancel=True` since when
  `False` it seems to cause `trio.run()` to hang on exit from the last
  bg-thread case?!
---
 examples/debugging/sync_bp.py |  24 ++++-
 tests/test_debugger.py        |  57 +++++++++--
 tractor/devx/__init__.py      |   2 +-
 tractor/devx/_debug.py        | 184 ++++++++++++++++++++++++++--------
 4 files changed, 211 insertions(+), 56 deletions(-)

diff --git a/examples/debugging/sync_bp.py b/examples/debugging/sync_bp.py
index 137710f..95472c9 100644
--- a/examples/debugging/sync_bp.py
+++ b/examples/debugging/sync_bp.py
@@ -4,6 +4,13 @@ import time
 import trio
 import tractor
 
+# TODO: only import these when not running from test harness?
+# can we detect `pexpect` usage maybe?
+# from tractor.devx._debug import (
+#     get_lock,
+#     get_debug_req,
+# )
+
 
 def sync_pause(
     use_builtin: bool = False,
@@ -18,7 +25,13 @@ def sync_pause(
         breakpoint(hide_tb=hide_tb)
 
     else:
+        # TODO: maybe for testing some kind of cm style interface
+        # where the `._set_trace()` call doesn't happen until block
+        # exit?
+ # assert get_lock().ctx_in_debug is None + # assert get_debug_req().repl is None tractor.pause_from_sync() + # assert get_debug_req().repl is None if error: raise RuntimeError('yoyo sync code error') @@ -41,10 +54,11 @@ async def start_n_sync_pause( async def main() -> None: async with ( tractor.open_nursery( - # NOTE: required for pausing from sync funcs - maybe_enable_greenback=True, debug_mode=True, - # loglevel='cancel', + maybe_enable_greenback=True, + enable_stack_on_sig=True, + # loglevel='warning', + # loglevel='devx', ) as an, trio.open_nursery() as tn, ): @@ -138,7 +152,9 @@ async def main() -> None: # the case 2. from above still exists! use_builtin=True, ), - abandon_on_cancel=False, + # TODO: with this `False` we can hang!??! + # abandon_on_cancel=False, + abandon_on_cancel=True, thread_name='inline_root_bg_thread', ) diff --git a/tests/test_debugger.py b/tests/test_debugger.py index 71c691a..7acf984 100644 --- a/tests/test_debugger.py +++ b/tests/test_debugger.py @@ -299,7 +299,9 @@ def do_ctlc( # needs some further investigation potentially... expect_prompt: bool = not _ci_env, -) -> None: +) -> str|None: + + before: str|None = None # make sure ctl-c sends don't do anything but repeat output for _ in range(count): @@ -309,15 +311,18 @@ def do_ctlc( # TODO: figure out why this makes CI fail.. # if you run this test manually it works just fine.. if expect_prompt: - before = str(child.before.decode()) time.sleep(delay) child.expect(PROMPT) + before = str(child.before.decode()) time.sleep(delay) if patt: # should see the last line on console assert patt in before + # return the console content up to the final prompt + return before + def test_root_actor_bp_forever( spawn, @@ -1085,10 +1090,10 @@ def test_pause_from_sync( ) if ctlc: do_ctlc(child) + # ^NOTE^ subactor not spawned yet; don't need extra delay. child.sendline('c') - # first `await tractor.pause()` inside `p.open_context()` body child.expect(PROMPT) @@ -1109,7 +1114,27 @@ def test_pause_from_sync( ) if ctlc: - do_ctlc(child) + do_ctlc( + child, + # NOTE: setting this to 0 (or some other sufficient + # small val) can cause the test to fail since the + # `subactor` suffers a race where the root/parent + # sends an actor-cancel prior to it hitting its pause + # point; by def the value is 0.1 + delay=0.3, + ) + + # XXX, fwiw without a brief sleep here the SIGINT might actually + # trigger "subactor" cancellation by its parent before the + # shield-handler is engaged. + # + # => similar to the `delay` input to `do_ctlc()` below, setting + # this too low can cause the test to fail since the `subactor` + # suffers a race where the root/parent sends an actor-cancel + # prior to the context task hitting its pause point (and thus + # engaging the `sigint_shield()` handler in time); this value + # seems be good enuf? 
+ time.sleep(0.6) # one of the bg thread or subactor should have # `Lock.acquire()`-ed @@ -1128,29 +1153,45 @@ def test_pause_from_sync( "('root'", ], } + conts: int = 0 # for debugging below matching logic on failure while attach_patts: child.sendline('c') + conts += 1 child.expect(PROMPT) before = str(child.before.decode()) - for key in attach_patts.copy(): + for key in attach_patts: if key in before: + attach_key: str = key expected_patts: str = attach_patts.pop(key) assert_before( child, - [_pause_msg] + expected_patts + [_pause_msg] + + + expected_patts ) break + else: + pytest.fail( + f'No keys found?\n\n' + f'{attach_patts.keys()}\n\n' + f'{before}\n' + ) # ensure no other task/threads engaged a REPL # at the same time as the one that was detected above. - for key, other_patts in attach_patts.items(): + for key, other_patts in attach_patts.copy().items(): assert not in_prompt_msg( before, other_patts, ) if ctlc: - do_ctlc(child) + do_ctlc( + child, + patt=attach_key, + # NOTE same as comment above + delay=0.3, + ) child.sendline('c') child.expect(pexpect.EOF) diff --git a/tractor/devx/__init__.py b/tractor/devx/__init__.py index 86dd128..bb6791a 100644 --- a/tractor/devx/__init__.py +++ b/tractor/devx/__init__.py @@ -26,7 +26,7 @@ from ._debug import ( breakpoint as breakpoint, pause as pause, pause_from_sync as pause_from_sync, - shield_sigint_handler as shield_sigint_handler, + sigint_shield as sigint_shield, open_crash_handler as open_crash_handler, maybe_open_crash_handler as maybe_open_crash_handler, maybe_init_greenback as maybe_init_greenback, diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index ae1c46d..02551fa 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -409,9 +409,9 @@ class Lock: repl_task ) message += ( - f'\nA non-caller task still owns this lock on behalf of ' - f'{behalf_of_task}\n' - f'|_{lock_stats.owner}\n' + f'A non-caller task still owns this lock on behalf of ' + f'`{behalf_of_task}`\n' + f'lock owner task: {lock_stats.owner}\n' ) if ( @@ -523,6 +523,10 @@ class Lock: ) +def get_lock() -> Lock: + return Lock + + @tractor.context( # enable the locking msgspec pld_spec=__pld_spec__, @@ -788,13 +792,13 @@ class DebugStatus: cls._orig_sigint_handler: Callable = trio.from_thread.run_sync( signal.signal, signal.SIGINT, - shield_sigint_handler, + sigint_shield, ) else: cls._orig_sigint_handler = signal.signal( signal.SIGINT, - shield_sigint_handler, + sigint_shield, ) @classmethod @@ -900,12 +904,30 @@ class DebugStatus: # actor-local state, irrelevant for non-root. cls.repl_task = None + + # XXX WARNING needs very special caughtion, and we should + # prolly make a more explicit `@property` API? + # + # - if unset in root multi-threaded case can cause + # issues with detecting that some root thread is + # using a REPL, + # + # - what benefit is there to unsetting, it's always + # set again for the next task in some actor.. + # only thing would be to avoid in the sigint-handler + # logging when we don't need to? cls.repl = None # restore original sigint handler cls.unshield_sigint() + +# TODO: use the new `@lowlevel.singleton` for this! 
+def get_debug_req() -> DebugStatus|None: + return DebugStatus + + class TractorConfig(pdbp.DefaultConfig): ''' Custom `pdbp` config which tries to use the best tradeoff @@ -1311,7 +1333,7 @@ def any_connected_locker_child() -> bool: return False -def shield_sigint_handler( +def sigint_shield( signum: int, frame: 'frame', # type: ignore # noqa *args, @@ -1351,13 +1373,17 @@ def shield_sigint_handler( # root actor branch that reports whether or not a child # has locked debugger. if is_root_process(): + # log.warning( + log.devx( + 'Handling SIGINT in root actor\n' + f'{Lock.repr()}' + f'{DebugStatus.repr()}\n' + ) # try to see if the supposed (sub)actor in debug still # has an active connection to *this* actor, and if not # it's likely they aren't using the TTY lock / debugger # and we should propagate SIGINT normally. any_connected: bool = any_connected_locker_child() - # if not any_connected: - # return do_cancel() problem = ( f'root {actor.uid} handling SIGINT\n' @@ -1406,19 +1432,25 @@ def shield_sigint_handler( # an actor using the `Lock` (a bug state) ?? # => so immediately cancel any stale lock cs and revert # the handler! - if not repl: + if not DebugStatus.repl: # TODO: WHEN should we revert back to ``trio`` # handler if this one is stale? # -[ ] maybe after a counts work of ctl-c mashes? # -[ ] use a state var like `stale_handler: bool`? problem += ( - '\n' 'No subactor is using a `pdb` REPL according `Lock.ctx_in_debug`?\n' - 'BUT, the root should be using it, WHY this handler ??\n' + 'BUT, the root should be using it, WHY this handler ??\n\n' + 'So either..\n' + '- some root-thread is using it but has no `.repl` set?, OR\n' + '- something else weird is going on outside the runtime!?\n' ) else: + # NOTE: since we emit this msg on ctl-c, we should + # also always re-print the prompt the tail block! log.pdb( 'Ignoring SIGINT while pdb REPL in use by root actor..\n' + f'{DebugStatus.repl_task}\n' + f' |_{repl}\n' ) problem = None @@ -1468,7 +1500,6 @@ def shield_sigint_handler( 'Allowing SIGINT propagation..' ) DebugStatus.unshield_sigint() - # do_cancel() repl_task: str|None = DebugStatus.repl_task req_task: str|None = DebugStatus.req_task @@ -1483,10 +1514,15 @@ def shield_sigint_handler( f' |_{repl}\n' ) elif req_task: - log.pdb( - f'Ignoring SIGINT while debug request task is open\n' + log.debug( + 'Ignoring SIGINT while debug request task is open but either,\n' + '- someone else is already REPL-in and has the `Lock`, or\n' + '- some other local task already is replin?\n' f'|_{req_task}\n' ) + + # TODO can we remove this now? + # -[ ] does this path ever get hit any more? else: msg: str = ( 'SIGINT shield handler still active BUT, \n\n' @@ -1522,31 +1558,47 @@ def shield_sigint_handler( # https://github.com/goodboy/tractor/issues/320 # elif debug_mode(): - # NOTE: currently (at least on ``fancycompleter`` 0.9.2) - # it looks to be that the last command that was run (eg. ll) - # will be repeated by default. - # maybe redraw/print last REPL output to console since # we want to alert the user that more input is expect since # nothing has been done dur to ignoring sigint. 
if ( - repl # only when current actor has a REPL engaged + DebugStatus.repl # only when current actor has a REPL engaged ): + flush_status: str = ( + 'Flushing stdout to ensure new prompt line!\n' + ) + # XXX: yah, mega hack, but how else do we catch this madness XD - if repl.shname == 'xonsh': + if ( + repl.shname == 'xonsh' + ): + flush_status += ( + '-> ALSO re-flushing due to `xonsh`..\n' + ) repl.stdout.write(repl.prompt) + # log.warning( + log.devx( + flush_status + ) repl.stdout.flush() - # TODO: make this work like sticky mode where if there is output - # detected as written to the tty we redraw this part underneath - # and erase the past draw of this same bit above? + # TODO: better console UX to match the current "mode": + # -[ ] for example if in sticky mode where if there is output + # detected as written to the tty we redraw this part underneath + # and erase the past draw of this same bit above? # repl.sticky = True # repl._print_if_sticky() - # also see these links for an approach from ``ptk``: + # also see these links for an approach from `ptk`: # https://github.com/goodboy/tractor/issues/130#issuecomment-663752040 # https://github.com/prompt-toolkit/python-prompt-toolkit/blob/c2c6af8a0308f9e5d7c0e28cb8a02963fe0ce07a/prompt_toolkit/patch_stdout.py + else: + log.devx( + # log.warning( + 'Not flushing stdout since not needed?\n' + f'|_{repl}\n' + ) # XXX only for tracing this handler log.devx('exiting SIGINT') @@ -1617,7 +1669,7 @@ async def _pause( # 'directly (infected) `asyncio` tasks!' # ) from rte - raise + raise rte if debug_func is not None: debug_func = partial(debug_func) @@ -1625,9 +1677,13 @@ async def _pause( # XXX NOTE XXX set it here to avoid ctl-c from cancelling a debug # request from a subactor BEFORE the REPL is entered by that # process. - if not repl: + if ( + not repl + and + debug_func + ): + repl: PdbREPL = mk_pdb() DebugStatus.shield_sigint() - repl: PdbREPL = repl or mk_pdb() # TODO: move this into a `open_debug_request()` @acm? # -[ ] prolly makes the most sense to do the request @@ -1662,7 +1718,13 @@ async def _pause( # recurrent entries/requests from the same # actor-local task. DebugStatus.repl_task = task - DebugStatus.repl = repl + if repl: + DebugStatus.repl = repl + else: + log.error( + 'No REPl instance set before entering `debug_func`?\n' + f'{debug_func}\n' + ) # invoke the low-level REPL activation routine which itself # should call into a `Pdb.set_trace()` of some sort. @@ -2001,7 +2063,7 @@ async def _pause( DebugStatus.release(cancel_req_task=True) # sanity checks for ^ on request/status teardown - assert DebugStatus.repl is None + # assert DebugStatus.repl is None # XXX no more bc bg thread cases? assert DebugStatus.repl_task is None # sanity, for when hackin on all this? @@ -2240,7 +2302,12 @@ async def _pause_from_bg_root_thread( 'Trying to acquire `Lock` on behalf of bg thread\n' f'|_{behalf_of_thread}\n' ) - # DebugStatus.repl_task = behalf_of_thread + + # NOTE: this is already a task inside the main-`trio`-thread, so + # we don't need to worry about calling it another time from the + # bg thread on which who's behalf this task is operating. 
+ DebugStatus.shield_sigint() + out = await _pause( debug_func=None, repl=repl, @@ -2249,6 +2316,8 @@ async def _pause_from_bg_root_thread( called_from_bg_thread=True, **_pause_kwargs ) + DebugStatus.repl_task = behalf_of_thread + lock: trio.FIFOLock = Lock._debug_lock stats: trio.LockStatistics= lock.statistics() assert stats.owner is task @@ -2282,7 +2351,6 @@ async def _pause_from_bg_root_thread( f'|_{behalf_of_thread}\n' ) task_status.started(out) - DebugStatus.shield_sigint() # wait for bg thread to exit REPL sesh. try: @@ -2323,7 +2391,7 @@ def pause_from_sync( err_on_no_runtime=False, ) message: str = ( - f'{actor.uid} task called `tractor.pause_from_sync()`\n\n' + f'{actor.uid} task called `tractor.pause_from_sync()`\n' ) if not actor: raise RuntimeError( @@ -2347,7 +2415,6 @@ def pause_from_sync( 'for infected `asyncio` mode!' ) - DebugStatus.shield_sigint() repl: PdbREPL = mk_pdb() # message += f'-> created local REPL {repl}\n' @@ -2365,6 +2432,10 @@ def pause_from_sync( # thread which will call `._pause()` manually with special # handling for root-actor caller usage. if not DebugStatus.is_main_trio_thread(): + + # TODO: `threading.Lock()` this so we don't get races in + # multi-thr cases where they're acquiring/releasing the + # REPL and setting request/`Lock` state, etc.. thread: threading.Thread = threading.current_thread() repl_owner = thread @@ -2372,9 +2443,16 @@ def pause_from_sync( if is_root: message += ( f'-> called from a root-actor bg {thread}\n' - f'-> scheduling `._pause_from_sync_thread()`..\n' + f'-> scheduling `._pause_from_bg_root_thread()`..\n' ) - bg_task, repl = trio.from_thread.run( + # XXX SUBTLE BADNESS XXX that should really change! + # don't over-write the `repl` here since when + # this behalf-of-bg_thread-task calls pause it will + # pass `debug_func=None` which will result in it + # returing a `repl==None` output and that get's also + # `.started(out)` back here! So instead just ignore + # that output and assign the `repl` created above! + bg_task, _ = trio.from_thread.run( afn=partial( actor._service_n.start, partial( @@ -2386,8 +2464,9 @@ def pause_from_sync( ), ) ) + DebugStatus.shield_sigint() message += ( - f'-> `._pause_from_sync_thread()` started bg task {bg_task}\n' + f'-> `._pause_from_bg_root_thread()` started bg task {bg_task}\n' ) else: message += f'-> called from a bg {thread}\n' @@ -2396,7 +2475,7 @@ def pause_from_sync( # `request_root_stdio_lock()` and we don't need to # worry about all the special considerations as with # the root-actor per above. - bg_task, repl = trio.from_thread.run( + bg_task, _ = trio.from_thread.run( afn=partial( _pause, debug_func=None, @@ -2411,6 +2490,9 @@ def pause_from_sync( **_pause_kwargs ), ) + # ?TODO? XXX where do we NEED to call this in the + # subactor-bg-thread case? + DebugStatus.shield_sigint() assert bg_task is not DebugStatus.repl_task else: # we are presumably the `trio.run()` + main thread @@ -2423,6 +2505,11 @@ def pause_from_sync( # greenback: ModuleType = await maybe_init_greenback() message += f'-> imported {greenback}\n' + + # NOTE XXX seems to need to be set BEFORE the `_pause()` + # invoke using gb below? 
+ DebugStatus.shield_sigint() + repl_owner: Task = current_task() message += '-> calling `greenback.await_(_pause(debug_func=None))` from sync caller..\n' try: @@ -2448,9 +2535,12 @@ def pause_from_sync( raise if out: - bg_task, repl = out - assert repl is repl - assert bg_task is repl_owner + bg_task, _ = out + else: + bg_task: Task = current_task() + + # assert repl is repl + assert bg_task is repl_owner # NOTE: normally set inside `_enter_repl_sync()` DebugStatus.repl_task: str = repl_owner @@ -2464,7 +2554,10 @@ def pause_from_sync( ) log.devx(message) + # NOTE set as late as possible to avoid state clobbering + # in the multi-threaded case! DebugStatus.repl = repl + _set_trace( api_frame=api_frame or inspect.currentframe(), repl=repl, @@ -2665,7 +2758,8 @@ async def acquire_debug_lock( tuple, ]: ''' - Request to acquire the TTY `Lock` in the root actor, release on exit. + Request to acquire the TTY `Lock` in the root actor, release on + exit. This helper is for actor's who don't actually need to acquired the debugger but want to wait until the lock is free in the @@ -2677,10 +2771,14 @@ async def acquire_debug_lock( yield None return + task: Task = current_task() async with trio.open_nursery() as n: ctx: Context = await n.start( - request_root_stdio_lock, - subactor_uid, + partial( + request_root_stdio_lock, + actor_uid=subactor_uid, + task_uid=(task.name, id(task)), + ) ) yield ctx ctx.cancel()
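
For reference, a minimal sketch (not part of the patch) of the root-actor
bg-thread sync-pause usage this change targets, adapted from the patched
`examples/debugging/sync_bp.py` above; the nursery flags and
`abandon_on_cancel=True` mirror that example, the function/thread names
are illustrative only, and the exact `tractor` debug API may vary by
version:

# sketch only: adapted from `examples/debugging/sync_bp.py` as
# modified in this patch.
import trio
import tractor


def sync_pause_from_thread() -> None:
    # plain sync code (no `await` available) which wants to drop
    # into the debugger REPL from a non-`trio` (bg) thread.
    tractor.pause_from_sync()


async def main() -> None:
    async with tractor.open_nursery(
        debug_mode=True,
        # per the example above, needed for `.pause_from_sync()`
        maybe_enable_greenback=True,
    ):
        # pause from a root-actor bg thread; with this patch a ctl-c
        # while the thread owns the REPL should log the SIGINT
        # shielding and re-draw the prompt instead of crashing.
        await trio.to_thread.run_sync(
            sync_pause_from_thread,
            abandon_on_cancel=True,
            thread_name='example_root_bg_thread',
        )


if __name__ == '__main__':
    trio.run(main)

Per the commit notes above, `abandon_on_cancel=True` is used here since
the `False` case seemed to cause `trio.run()` to hang on exit from the
last bg-thread case.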