7 changed files with 61 additions and 271 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@ -93,7 +93,7 @@ testing = [
 repl = [
  "pyperclip>=1.9.0",
  "prompt-toolkit>=3.0.50",
-  "xonsh>=0.23.0",
+  "xonsh>=0.22.8",
  "psutil>=7.0.0",
 ]
 lint = [
@ -134,7 +134,7 @@ sync_pause = {requires-python = ">=3.13, <3.14"}
 # xonsh = { git = 'https://github.com/anki-code/xonsh.git', branch = 'prompt_next_suggestion' }
 # ^ https://github.com/xonsh/xonsh/pull/6048
 # xonsh = { git = 'https://github.com/xonsh/xonsh.git', branch = 'main' }
-# xonsh = { path = "../xonsh", editable = true }
+xonsh = { path = "../xonsh", editable = true }

 # [tool.uv.sources.pdbp]
 # XXX, in case we need to tmp patch again.
@ -203,35 +203,7 @@ all_bullets = true

 [tool.pytest.ini_options]
 minversion = '6.0'
-# NOTE: `pytest-timeout`'s global per-test cap is intentionally
-# NOT set — both of its enforcement methods break trio's
-# runtime under our fork-based spawn backends:
-#
-# - `method='signal'` (the default; SIGALRM) raises `Failed`
-#   synchronously from the signal handler in trio's main
-#   thread, which leaves `GLOBAL_RUN_CONTEXT` half-installed
-#   ("Trio guest run got abandoned"). EVERY subsequent
-#   `trio.run()` in the same pytest session then bails with
-#   `RuntimeError: Attempted to call run() from inside a
-#   run()` — full-session poison: a single 200s hang
-#   cascades into 30+ false-positive failures across
-#   downstream test files.
-#
-# - `method='thread'` calls `_thread.interrupt_main()` which
-#   can let the resulting `KeyboardInterrupt` escape trio's
-#   `KIManager` under fork-cascade teardown races, killing
-#   the whole pytest session.
-#
-# For tests that legitimately need a wall-clock cap, use
-# `with trio.fail_after(N):` INSIDE the test — trio's own
-# Cancelled machinery handles the timeout cleanly through
-# the actor nursery without disturbing global state. See
-# `tests/test_advanced_streaming.py::test_dynamic_pub_sub`'s
-# module-level NOTE for the canonical pattern.
-#
-# CI environments should rely on job-level wall-clock
-# timeouts (e.g. GitHub Actions `timeout-minutes`) for an
-# escape hatch on genuinely-stuck suites.
+timeout = 200  # per-test hard limit
 # https://docs.pytest.org/en/stable/reference/reference.html#configuration-options
 testpaths = [
  'tests'
--- a/tests/msg/test_pldrx_limiting.py
+++ b/tests/msg/test_pldrx_limiting.py
@ -55,37 +55,12 @@ async def maybe_expect_raises(
    raises: BaseException|None = None,
    ensure_in_message: list[str]|None = None,
    post_mortem: bool = False,
-    # NOTE, `None` selects a backend-aware default below —
-    # see `_BACKEND_TIMEOUT_DEFAULTS` for rationale. Caller
-    # can override with an explicit value to opt out.
-    timeout: int|None = None,
+    timeout: int = 3,
 ) -> None:
    '''
    Async wrapper for ensuring errors propagate from the inner scope.

    '''
-    if timeout is None:
-        # Pick a backend-aware default. Fork-based backends
-        # (`main_thread_forkserver`) need much more headroom
-        # because actor spawn + IPC ctx-exit + msg-validation
-        # error path takes longer than under `trio` backend
-        # — especially under cross-pytest-stream contention
-        # (#451). `test_basic_payload_spec` empirically:
-        #   - 3s flaked all-valid variant (`TooSlowError`)
-        #   - 8s flaked `invalid-return` variant
-        #     (`Cancelled` surfaced instead of `MsgTypeError`
-        #     because `fail_after` fired mid-error-path)
-        #   - 15s flaked under cross-stream contention
-        # 30s for fork-based gives plenty of headroom while
-        # still failing-loud on a genuine hang. Other
-        # backends keep the original 3s.
-        from tractor.spawn import _spawn as _spawn_mod
-        timeout = (
-            30
-            if _spawn_mod._spawn_method == 'main_thread_forkserver'
-            else 3
-        )
-
    if tractor.debug_mode():
        timeout += 999

--- a/tests/test_advanced_streaming.py
+++ b/tests/test_advanced_streaming.py
@ -157,26 +157,17 @@ def test_dynamic_pub_sub(
    from multiprocessing import cpu_count
    cpus = cpu_count()

-    # Hard safety cap via trio's own cancellation — see the
-    # module-level NOTE on why we avoid `pytest-timeout` for
-    # this test. Picked backend-aware: under `trio` backend
-    # spawn is cheap (~1s for `cpus` actors) but fork-based
-    # backends pay a per-spawn cost (forkserver round-trip +
-    # IPC peer-handshake) that can stack up over `cpus - 1`
-    # sequential `n.run_in_actor()` calls — especially on UDS
-    # under cross-pytest contention (#451 / #452). Empirically
-    # 12s flakes on `main_thread_forkserver`; 30s gives
-    # plenty of headroom while still failing-loud on a real
-    # hang.
-    from tractor.spawn import _spawn as _spawn_mod
-    fail_after_s: int = (
-        30
-        if _spawn_mod._spawn_method == 'main_thread_forkserver'
-        else 12
-    )
-
    async def main():
-        with trio.fail_after(fail_after_s):
+        # Hard safety cap via trio's own cancellation — see
+        # the module-level NOTE on why we avoid `pytest-timeout`
+        # for this test. Total expected runtime: ~1s spawn + 3s
+        # sleep + ~1-2s cancel cascade ≈ 5-6s. 12s gives plenty
+        # of headroom; if exceeded, trio raises `TooSlowError`
+        # which the outer `try` block treats as a hang report
+        # (or, if `expect_cancel_exc is trio.TooSlowError`, as
+        # the test passing — either way, no global state
+        # corruption).
+        with trio.fail_after(12):
            async with tractor.open_nursery(
                registry_addrs=[reg_addr],
                debug_mode=debug_mode,
--- a/tractor/_testing/pytest.py
+++ b/tractor/_testing/pytest.py
@ -213,21 +213,6 @@ def pytest_addoption(
        ),
    )

-    parser.addoption(
-        "--enable-stackscope",
-        action="store_true",
-        dest='tractor_enable_stackscope',
-        default=False,
-        help=(
-            'Install `stackscope` SIGUSR1 handler in pytest + '
-            'every spawned subactor for live trio task-tree '
-            'dumps during hang investigations. Lighter than '
-            '`--tpdb` (no pdb machinery / tty-lock contention) '
-            '— use when you only need stack visibility. To '
-            'capture: `kill -USR1 <pytest-or-subactor-pid>`.'
-        ),
-    )
-
    # provide which IPC transport protocols opting-in test suites
    # should accumulatively run against.
    parser.addoption(
@ -268,37 +253,6 @@ def pytest_configure(
        'in `ai/conc-anal/subint_sigint_starvation_issue.md`).'
    )

-    # `--enable-stackscope`: install SIGUSR1 → trio task-tree
-    # dump in pytest itself + propagate to every subactor via
-    # an env var that fork-children inherit and the runtime
-    # gate honors. Lighter than `--tpdb` (no pdb machinery) —
-    # purely for hang-investigation stack visibility.
-    if getattr(
-        config.option, 'tractor_enable_stackscope', False
-    ):
-        import os
-        # Env var inherited via fork → subactor's runtime
-        # picks it up at `Actor.async_main` startup. See the
-        # gate in `tractor.runtime._runtime` matching this
-        # var name.
-        os.environ['TRACTOR_ENABLE_STACKSCOPE'] = '1'
-
-        # Install in pytest itself so `kill -USR1 <pytest>`
-        # dumps the parent trio task-tree (which is where
-        # most Mode-A-class hangs park).
-        try:
-            from tractor.devx._stackscope import (
-                enable_stack_on_sig,
-            )
-            enable_stack_on_sig()
-        except ImportError:
-            import warnings
-            warnings.warn(
-                '`stackscope` not installed — '
-                '--enable-stackscope is a no-op. '
-                'Install via the `devx` dep group.'
-            )
-

 def pytest_collection_modifyitems(
    config: pytest.Config,
--- a/tractor/devx/_stackscope.py
+++ b/tractor/devx/_stackscope.py
@ -66,20 +66,7 @@ def dump_task_tree() -> None:
    Do a classic `stackscope.extract()` task-tree dump to console at
    `.devx()` level.

-    Also unconditionally tee the rendered tree to two
-    capture-bypassing sinks so SIGUSR1 dumps remain visible
-    when the parent process has captured stdio (e.g. pytest's
-    default `--capture=fd`):
-
-    - `/tmp/tractor-stackscope-<pid>.log` (append-mode, always
-      written) — guaranteed-readable artifact even under CI
-      / `nohup` / no-tty conditions. `tail -f` to follow.
-    - `/dev/tty` if a controlling terminal is attached —
-      best-effort, ignored if the device is missing or write
-      fails. pytest never captures the tty.
-
    '''
-    import os
    import stackscope
    tree_str: str = str(
        stackscope.extract(
@ -109,7 +96,7 @@ def dump_task_tree() -> None:
    # |_{Supervisor/Scope
    # |_[Storage/Memory/IPC-Stream/Data-Struct

-    full_dump: str = (
+    log.devx(
        f'Dumping `stackscope` tree for actor\n'
        f'(>: {actor.uid!r}\n'
        f' |_{mp.current_process()}\n'
@ -118,35 +105,33 @@ def dump_task_tree() -> None:
        f'\n'
        f'{sigint_handler_report}\n'
        f'signal.getsignal(SIGINT) -> {current_sigint_handler!r}\n'
+        # f'\n'
+        # start-of-trace-tree delimiter (mostly for testing)
+        # f'------ {actor.uid!r} ------\n'
        f'\n'
        f'------ start-of-{actor.uid!r} ------\n'
        f'|\n'
        f'{tree_str}'
+        # end-of-trace-tree delimiter (mostly for testing)
        f'|\n'
        f'|_____ end-of-{actor.uid!r} ______\n'
    )
-    log.devx(full_dump)
-
-    # NOTE, capture-bypass sinks. Pytest's default
-    # `--capture=fd` swallows `log.devx()` above; the
-    # following two writes guarantee the dump reaches the
-    # human even when stdio is captured.
-    fpath: str = f'/tmp/tractor-stackscope-{os.getpid()}.log'
-    try:
-        with open(fpath, 'a') as f:
-            f.write(full_dump + '\n')
-    except OSError:
-        log.exception(
-            f'Failed to tee stackscope dump to {fpath!r}'
-        )
-
-    try:
-        with open('/dev/tty', 'w') as tty:
-            tty.write(full_dump + '\n')
-    except OSError:
-        # no controlling tty (CI / nohup / detached) —
-        # silently fall through; the file sink covers it.
-        pass
+    # TODO: can remove this right?
+    # -[ ] was original code from author
+    #
+    # print(
+    #     'DUMPING FROM PRINT\n'
+    #     +
+    #     content
+    # )
+    # import logging
+    # try:
+    #     with open("/dev/tty", "w") as tty:
+    #         tty.write(tree_str)
+    # except BaseException:
+    #     logging.getLogger(
+    #         "task_tree"
+    #     ).exception("Error printing task tree")

 _handler_lock = RLock()
 _tree_dumped: bool = False
@ -248,20 +233,7 @@ def enable_stack_on_sig(

    '''
    try:
-        # NOTE, `stackscope._glue` does intentional async-gen type
-        # introspection at import-time which trips
-        # `RuntimeWarning: coroutine method 'asend'/'athrow' was
-        # never awaited`. Benign — they only want the wrapper
-        # type — but visible to users. Squelch the import-only
-        # warning so SIGUSR1 setup stays quiet.
-        import warnings
-        with warnings.catch_warnings():
-            warnings.filterwarnings(
-                'ignore',
-                category=RuntimeWarning,
-                message=r"coroutine method '(asend|athrow)' .* was never awaited",
-            )
-            import stackscope
+        import stackscope
    except ImportError:
        log.warning(
            'The `stackscope` lib is not installed!\n'
--- a/tractor/runtime/_runtime.py
+++ b/tractor/runtime/_runtime.py
@ -932,20 +932,7 @@ class Actor:
                # => update process-wide globals
                # TODO! -[ ] another `Struct` for rtvs..
                rvs: dict[str, Any] = spawnspec._runtime_vars
-
-                # `stackscope` SIGUSR1 handler: install when EITHER
-                # `_debug_mode=True` (full multi-actor pdb support
-                # path) OR the `TRACTOR_ENABLE_STACKSCOPE` env var
-                # is set (lighter test-time hang-debug path; see
-                # `tractor._testing.pytest`'s `--enable-stackscope`
-                # CLI flag — env var propagates via fork-inherited
-                # environ).
-                import os
-                if (
-                    rvs['_debug_mode']
-                    or
-                    os.environ.get('TRACTOR_ENABLE_STACKSCOPE')
-                ):
+                if rvs['_debug_mode']:
                    from ..devx import (
                        enable_stack_on_sig,
                        maybe_init_greenback,
@ -961,8 +948,7 @@ class Actor:

                    except ImportError:
                        log.warning(
-                            '`stackscope` not installed for use in '
-                            'debug mode / `--enable-stackscope`!'
+                            '`stackscope` not installed for use in debug mode!'
                        )

                    if rvs.get('use_greenback', False):
--- a/tractor/spawn/_main_thread_forkserver.py
+++ b/tractor/spawn/_main_thread_forkserver.py
@ -38,7 +38,6 @@ Two empirical CPython properties drive the design:
   the forked child otherwise (`Fatal Python error: not main
   interpreter`). Full source-level walkthrough:
   `ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md`.
-
 2. **`os.fork()` from a regular `threading.Thread` attached to
   the *main* interpreter — i.e. a worker thread that has never
   entered a subint — works cleanly.** Empirically validated
@ -87,11 +86,9 @@ costs:

 - **Sidecar lifecycle**: a second long-lived process per
  parent, with its own start/stop/health-check semantics.
-
 - **IPC overhead per spawn**: every actor-spawn round-trips
  an `mp` request message through a unix socket before any
  child code runs.
-
 - **State isolation by process boundary**: the sidecar can't
  share parent state at all — every spawn is a "cold" child
  re-importing modules from disk.
@ -109,7 +106,6 @@ For the full variant-2 picture see
 1) we already get costs 1 + 2 collapsed; cost 3 will land
 when msgspec#1026 unblocks isolated-mode subints.

-
 What survives the fork? — POSIX semantics
 -----------------------------------------

@ -117,58 +113,33 @@ A natural worry when forking from a parent that's running
 `trio.run()` on another thread: does that trio thread (and
 any other threads in the parent) keep running in the child?

-**No** — but with a precise meaning that's worth pinning
-down, since the canonical trio framing
-([python-trio/trio#1614](https://github.com/python-trio/trio/issues/1614))
-puts it the opposite-sounding way:
-
-> If you use `fork()` in a process with multiple threads,
-> all the other thread stacks are just leaked: there's
-> nothing else you can reasonably do with them.
-
-Both statements describe the same POSIX reality from
-opposite sides:
-
- **Execution-side ("gone")**: POSIX `fork()` only
-  preserves the *calling* thread as a runnable thread in
-  the child. Every other thread in the parent — trio's
-  runner thread, any `to_thread` cache threads, anything
-  else — never executes another instruction post-fork.
-
- **Memory-side ("leaked")**: those non-running threads'
-  *stacks* and per-thread heap structures are still
-  COW-inherited into the child's address space. They
-  persist as orphaned bytes with no owning thread, no
-  scheduler entry, and no way for the child to clean
-  them up — hence trio's word "leaked".
+**No.** POSIX `fork()` only preserves the *calling* thread
+in the child. Every other thread in the parent — trio's
+runner thread, any `to_thread` cache threads, anything else
+— is gone the instant `fork()` returns in the child.

 Concretely, after the forkserver worker calls `os.fork()`:

-| thread              | parent    | child (executing) | child (memory)              |
-|---------------------|-----------|-------------------|-----------------------------|
-| forkserver worker   | continues | sole survivor     | live stack                  |
-| `trio.run()` thread | continues | not running       | leaked stack (zombie bytes) |
-| any other thread    | continues | not running       | leaked stack (zombie bytes) |
+| thread                | parent    | child         |
+|-----------------------|-----------|---------------|
+| forkserver worker     | continues | sole survivor |
+| `trio.run()` thread   | continues | gone          |
+| any other thread      | continues | gone          |

 The forkserver worker becomes the new "main" execution
 context in the child; `trio.run()` and every other parent
-thread never executes a single instruction post-fork.
-Their stack memory rides along as inert COW pages until
-the child's fresh `trio.run()` boots and overwrites/GCs
-it (or until the child `exec()`s and discards the entire
-image).
+thread never executes a single instruction post-fork in the
+child.

 This is exactly *why* `os.fork()` is delegated to a
 dedicated worker thread that has provably never entered
 trio: we want that trio-free thread to be the surviving
-*executing* thread in the child, with the leaked trio
-stack reduced to inert COW pages we don't touch.
+one in the child.

-The leaked-stack residue is one slice of the broader
-"fork in a multithreaded program is dangerous" hazard
-class (see `man pthread_atfork`). Other dead-thread
-artifacts that cross the fork boundary, and how we handle
-each:
+That said, dead-thread *artifacts* still cross the fork
+boundary (the canonical "fork in a multithreaded program
+is dangerous" hazard — see `man pthread_atfork`). What
+persists, and how we handle each:

 - **Inherited file descriptors** — the dead trio thread's
  epoll fd, signal-wakeup-fd, eventfds, sockets, IPC
@ -177,20 +148,16 @@ each:
  `_close_inherited_fds()` in the child prelude — walks
  `/proc/self/fd` and closes everything except stdio +
  the channel pipe to the forkserver.
-
 - **Memory image** — trio's internal data structures
  (scheduler, task queues, runner state) sit in COW
-  memory alongside the leaked stacks above. Nobody's
-  executing them; they get GC'd / overwritten when the
-  child's fresh `trio.run()` boots.
-
+  memory but nobody's executing them. They get GC'd /
+  overwritten when the child's fresh `trio.run()` boots.
 - **Python thread state** — handled automatically by
  CPython. `PyOS_AfterFork_Child()` calls
  `_PyThreadState_DeleteExceptCurrent()`, so dead
  `PyThreadState` objects are cleaned and
  `threading.enumerate()` returns just the surviving
  thread.
-
 - **User-level locks (`threading.Lock`)** —
  held-by-dead-thread state is the canonical fork hazard.
  Not an issue in practice for tractor: trio doesn't hold
@ -199,7 +166,6 @@ each:
  either direction). CPython's GIL is auto-reset by the
  fork callback.

-
 FYI: how this dodges the `trio.run()` × `fork()` hazards
 --------------------------------------------------------

@ -217,16 +183,13 @@ design dodges each class explicitly:
  reader. *Dodge*: the inherited wakeup-fd is closed by
  `_close_inherited_fds()`, then the child's own
  `trio.run()` installs a fresh one.
-
 - **`epoll`/`kqueue` instance**: trio's I/O backend holds
  one. Inherited as a dead fd; same fix as above.
-
 - **Threadpool cache threads** (`trio.to_thread`): worker
  threads with cached tstate. Don't exist in the child
  (POSIX); cache state is meaningless garbage that gets
  reset when the child's trio.run() initializes its own
  thread cache.
-
 - **Cancel scopes / nurseries / open `trio.Process` /
  open sockets**: these are trio-runtime objects, not
  kernel objects. The runtime that owns them is gone in
@ -234,11 +197,9 @@ design dodges each class explicitly:
  in COW memory and get overwritten as the child runs.
  Inherited *kernel* fds those objects wrapped (sockets,
  proc pipes) are caught by `_close_inherited_fds()`.
-
 - **`atexit` handlers**: trio doesn't register any that
  would mis-fire post-fork; trio's lifetime-stack is
  all `with`-block-scoped and dies with the runner.
-
 - **Foreign-language I/O state** (libcurl, OpenSSL session
  caches, etc.): out of scope — same hazard as any
  fork-without-exec; users layering those on top of
@ -250,7 +211,6 @@ isolation + `_close_inherited_fds()` cleanup gives the
 forked child a clean trio environment. Everything else
 falls under the standard fork-without-exec disclaimer.

-
 Implementation status
 ---------------------

@ -271,11 +231,10 @@ follow-up) including the

 Still-open work (tracked on tractor #379):

- [ ] no cancellation / hard-kill stress coverage yet
+- no cancellation / hard-kill stress coverage yet
  (counterpart to `tests/test_subint_cancellation.py` for
  the plain `subint` backend),
-
- [ ] `child_sigint='trio'` mode (flag scaffolded below; default
+- `child_sigint='trio'` mode (flag scaffolded below; default
  is `'ipc'`). Originally intended as a manual SIGINT →
  trio-cancel bridge, but investigation showed trio's
  handler IS already correctly installed in the fork-child
@ -328,22 +287,18 @@ See also
 - `tractor.spawn._subint_forkserver` — variant-2 placeholder
  module; reserved for the future subint-isolated-child
  runtime once jcrist/msgspec#1026 unblocks.
-
 - `tractor.spawn._subint_fork` — the stub for the
  fork-from-non-main-subint strategy that DIDN'T work (kept
  in-tree as documentation of the attempt + the CPython-level
  block).
-
 - `ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md`
  — CPython source walkthrough of why fork-from-subint is dead.
-
 - `ai/conc-anal/subint_fork_from_main_thread_smoketest.py`
  — standalone feasibility check (delegates to this module
  for the primitives it exercises).

 '''
 from __future__ import annotations
-import errno
 import os
 import signal
 import sys
@ -468,24 +423,9 @@ def _close_inherited_fds(
        try:
            os.close(fd)
            closed += 1
-        except OSError as oserr:
-            # `EBADF` is the benign-and-expected case: the
-            # `os.listdir('/proc/self/fd')` call above itself
-            # opens a transient dirfd that ends up in
-            # `candidates`, then auto-closes before this loop
-            # reaches it. Same for any fd whose Python wrapper
-            # was GC'd between `listdir` and `os.close`.
-            # Suppress at debug-level — surfacing every
-            # EBADF as a full traceback (prior `log.exception`
-            # behavior) drowned the post-fork log channel.
-            if oserr.errno == errno.EBADF:
-                log.debug(
-                    f'Skip already-closed inherited fd {fd!r} '
-                    f'(EBADF, benign race with listdir)\n'
-                )
-                continue
-            # Other errnos (EIO / EPERM / EINTR / ...) are
-            # genuinely unexpected — keep the loud surface.
+        except OSError:
+            # fd was already closed (race with listdir) or otherwise
+            # unclosable — either is fine.
            log.exception(
                f'Failed to close inherited fd in child ??\n'
                f'{fd!r}\n'