tractor/tractor/_debug.py

"""
Multi-core debugging for da peeps!

"""
import bdb
import sys
from functools import partial
from contextlib import asynccontextmanager
from typing import Tuple, Optional, Callable, AsyncIterator

import tractor
import trio

from .log import get_logger
from . import _state
from ._discovery import get_root
from ._state import is_root_process
from ._exceptions import is_multi_cancelled

try:
    # wtf: only exported when installed in dev mode?
    import pdbpp
except ImportError:
    # pdbpp is installed in regular mode...it monkey patches stuff
    import pdb
    assert pdb.xpm, "pdbpp is not installed?"  # type: ignore
    pdbpp = pdb

log = get_logger(__name__)


__all__ = ['breakpoint', 'post_mortem']


# TODO: wrap all these in a static global class: ``DebugLock`` maybe?

# placeholder for function to set a ``trio.Event`` on debugger exit
_pdb_release_hook: Optional[Callable] = None

# actor-wide variable pointing to current task name using debugger
_local_task_in_debug: Optional[str] = None

# actor tree-wide actor uid that supposedly has the tty lock
_global_actor_in_debug: Optional[Tuple[str, str]] = None

# lock in root actor preventing multi-access to local tty
_debug_lock: trio.StrictFIFOLock = trio.StrictFIFOLock()
_pdb_complete: Optional[trio.Event] = None

# XXX: set by the current task waiting on the root tty lock
# and must be cancelled if this actor is cancelled via message
# otherwise deadlocks with the parent actor may ensure
_debugger_request_cs: Optional[trio.CancelScope] = None


class TractorConfig(pdbpp.DefaultConfig):
    """Custom ``pdbpp`` goodness.
    """
    # sticky_by_default = True


class PdbwTeardown(pdbpp.Pdb):
    """Add teardown hooks to the regular ``pdbpp.Pdb``.
    """
    # override the pdbpp config with our coolio one
    DefaultConfig = TractorConfig

    # TODO: figure out how to dissallow recursive .set_trace() entry
    # since that'll cause deadlock for us.
    def set_continue(self):
        try:
            super().set_continue()
        finally:
            global _local_task_in_debug
            _local_task_in_debug = None
            _pdb_release_hook()

    def set_quit(self):
        try:
            super().set_quit()
        finally:
            global _local_task_in_debug
            _local_task_in_debug = None
            _pdb_release_hook()


# TODO: will be needed whenever we get to true remote debugging.
# XXX see https://github.com/goodboy/tractor/issues/130

# # TODO: is there some way to determine this programatically?
# _pdb_exit_patterns = tuple(
#     str.encode(patt + "\n") for patt in (
#         'c', 'cont', 'continue', 'q', 'quit')
# )

# def subactoruid2proc(
#     actor: 'Actor',  # noqa
#     uid: Tuple[str, str]
# ) -> trio.Process:
#     n = actor._actoruid2nursery[uid]
#     _, proc, _ = n._children[uid]
#     return proc

# async def hijack_stdin():
#     log.info(f"Hijacking stdin from {actor.uid}")

#     trap std in and relay to subproc
#     async_stdin = trio.wrap_file(sys.stdin)

#     async with aclosing(async_stdin):
#         async for msg in async_stdin:
#             log.trace(f"Stdin input:\n{msg}")
#             # encode to bytes
#             bmsg = str.encode(msg)

#             # relay bytes to subproc over pipe
#             # await proc.stdin.send_all(bmsg)

#             if bmsg in _pdb_exit_patterns:
#                 log.info("Closing stdin hijack")
#                 break


@asynccontextmanager
async def _acquire_debug_lock(uid: Tuple[str, str]) -> AsyncIterator[None]:
    """Acquire a actor local FIFO lock meant to mutex entry to a local
    debugger entry point to avoid tty clobbering by multiple processes.
    """
    global _debug_lock, _global_actor_in_debug

    task_name = trio.lowlevel.current_task().name

    log.debug(
        f"Attempting to acquire TTY lock, remote task: {task_name}:{uid}")

    async with _debug_lock:

        # _debug_lock._uid = uid
        _global_actor_in_debug = uid
        log.debug(f"TTY lock acquired, remote task: {task_name}:{uid}")
        yield

    _global_actor_in_debug = None
    log.debug(f"TTY lock released, remote task: {task_name}:{uid}")


# @contextmanager
# def _disable_sigint():
#     try:
#         # disable sigint handling while in debug
#         prior_handler = signal.signal(signal.SIGINT, handler)
#         yield
#     finally:
#         # restore SIGINT handling
#         signal.signal(signal.SIGINT, prior_handler)


@tractor.context
async def _hijack_stdin_relay_to_child(

    ctx: tractor.Context,
    subactor_uid: Tuple[str, str]

) -> str:

    global _pdb_complete

    task_name = trio.lowlevel.current_task().name

    # TODO: when we get to true remote debugging
    # this will deliver stdin data?

    log.debug(
        "Attempting to acquire TTY lock, "
        f"remote task: {task_name}:{subactor_uid}"
    )

    log.debug(f"Actor {subactor_uid} is WAITING on stdin hijack lock")

    async with _acquire_debug_lock(subactor_uid):

        # XXX: only shield the context sync step!
        with trio.CancelScope(shield=True):

            # indicate to child that we've locked stdio
            await ctx.started('Locked')
            log.runtime(  # type: ignore
                f"Actor {subactor_uid} ACQUIRED stdin hijack lock")

        # wait for unlock pdb by child
        async with ctx.open_stream() as stream:
            try:
                assert await stream.receive() == 'pdb_unlock'

            except trio.BrokenResourceError:
                # XXX: there may be a race with the portal teardown
                # with the calling actor which we can safely ignore
                # the alternative would be sending an ack message
                # and allowing the client to wait for us to teardown
                # first?
                pass

    log.debug(
        f"TTY lock released, remote task: {task_name}:{subactor_uid}")

    log.debug(f"Actor {subactor_uid} RELEASED stdin hijack lock")
    return "pdb_unlock_complete"


async def _breakpoint(debug_func) -> None:
    """``tractor`` breakpoint entry for engaging pdb machinery
    in subactors.

    """
    actor = tractor.current_actor()
    task_name = trio.lowlevel.current_task().name

    global _pdb_complete, _pdb_release_hook
    global _local_task_in_debug, _global_actor_in_debug

    async def wait_for_parent_stdin_hijack(
        task_status=trio.TASK_STATUS_IGNORED
    ):
        global _debugger_request_cs

        with trio.CancelScope() as cs:
            _debugger_request_cs = cs

            try:
                async with get_root() as portal:

                    # this syncs to child's ``Context.started()`` call.
                    async with portal.open_context(

                        tractor._debug._hijack_stdin_relay_to_child,
                        subactor_uid=actor.uid,

                    ) as (ctx, val):

                        assert val == 'Locked'

                        async with ctx.open_stream() as stream:

                            # unblock local caller
                            task_status.started()

                            # TODO: shielding currently can cause hangs...
                            # with trio.CancelScope(shield=True):

                            await _pdb_complete.wait()
                            await stream.send('pdb_unlock')

                            # sync with callee termination
                            assert await ctx.result() == "pdb_unlock_complete"

            except tractor.ContextCancelled:
                log.warning('Root actor cancelled debug lock')

            finally:
                log.debug(f"Exiting debugger for actor {actor}")
                global _local_task_in_debug
                _local_task_in_debug = None
                log.debug(f"Child {actor} released parent stdio lock")

    if not _pdb_complete or _pdb_complete.is_set():
        _pdb_complete = trio.Event()

    # TODO: need a more robust check for the "root" actor
    if actor._parent_chan and not is_root_process():
        if _local_task_in_debug:
            if _local_task_in_debug == task_name:
                # this task already has the lock and is
                # likely recurrently entering a breakpoint
                return

            # if **this** actor is already in debug mode block here
            # waiting for the control to be released - this allows
            # support for recursive entries to `tractor.breakpoint()`
            log.warning(f"{actor.uid} already has a debug lock, waiting...")

            await _pdb_complete.wait()
            await trio.sleep(0.1)

        # mark local actor as "in debug mode" to avoid recurrent
        # entries/requests to the root process
        _local_task_in_debug = task_name

        # assign unlock callback for debugger teardown hooks
        _pdb_release_hook = _pdb_complete.set

        # this **must** be awaited by the caller and is done using the
        # root nursery so that the debugger can continue to run without
        # being restricted by the scope of a new task nursery.
        await actor._service_n.start(wait_for_parent_stdin_hijack)

    elif is_root_process():

        # we also wait in the root-parent for any child that
        # may have the tty locked prior
        global _debug_lock

        # TODO: wait, what about multiple root tasks acquiring
        # it though.. shrug?
        # root process (us) already has it; ignore
        if _global_actor_in_debug == actor.uid:
            return

        # XXX: since we need to enter pdb synchronously below,
        # we have to release the lock manually from pdb completion
        # callbacks. Can't think of a nicer way then this atm.
        await _debug_lock.acquire()

        _global_actor_in_debug = actor.uid
        _local_task_in_debug = task_name

        # the lock must be released on pdb completion
        def teardown():
            global _pdb_complete, _debug_lock
            global _global_actor_in_debug, _local_task_in_debug

            _debug_lock.release()
            _global_actor_in_debug = None
            _local_task_in_debug = None
            _pdb_complete.set()

        _pdb_release_hook = teardown

    # block here one (at the appropriate frame *up* where
    # ``breakpoint()`` was awaited and begin handling stdio
    log.debug("Entering the synchronous world of pdb")
    debug_func(actor)


def _mk_pdb():
    # XXX: setting these flags on the pdb instance are absolutely
    # critical to having ctrl-c work in the ``trio`` standard way!
    # The stdlib's pdb supports entering the current sync frame
    # on a SIGINT, with ``trio`` we pretty much never want this
    # and we did we can handle it in the ``tractor`` task runtime.

    pdb = PdbwTeardown()
    pdb.allow_kbdint = True
    pdb.nosigint = True

    return pdb


def _set_trace(actor=None):
    pdb = _mk_pdb()

    if actor is not None:
        log.runtime(f"\nAttaching pdb to actor: {actor.uid}\n")  # type: ignore

        pdb.set_trace(
            # start 2 levels up in user code
            frame=sys._getframe().f_back.f_back,
        )

    else:
        # we entered the global ``breakpoint()`` built-in from sync code
        global _local_task_in_debug, _pdb_release_hook
        _local_task_in_debug = 'sync'

        def nuttin():
            pass

        _pdb_release_hook = nuttin

        pdb.set_trace(
            # start 2 levels up in user code
            frame=sys._getframe().f_back,
        )


breakpoint = partial(
    _breakpoint,
    _set_trace,
)


def _post_mortem(actor):
    log.runtime(f"\nAttaching to pdb in crashed actor: {actor.uid}\n")
    pdb = _mk_pdb()

    # custom Pdb post-mortem entry
    pdbpp.xpm(Pdb=lambda: pdb)


post_mortem = partial(
    _breakpoint,
    _post_mortem,
)


async def _maybe_enter_pm(err):
    if (
        _state.debug_mode()

        # NOTE: don't enter debug mode recursively after quitting pdb
        # Iow, don't re-enter the repl if the `quit` command was issued
        # by the user.
        and not isinstance(err, bdb.BdbQuit)

        # XXX: if the error is the likely result of runtime-wide
        # cancellation, we don't want to enter the debugger since
        # there's races between when the parent actor has killed all
        # comms and when the child tries to contact said parent to
        # acquire the tty lock.

        # Really we just want to mostly avoid catching KBIs here so there
        # might be a simpler check we can do?
        and not is_multi_cancelled(err)
    ):
        log.debug("Actor crashed, entering debug mode")
        await post_mortem()
        return True

    else:
        return False