Compare commits
5 Commits
f8178df0fd
...
8c730193f9
| Author | SHA1 | Date |
|---|---|---|
|
|
8c730193f9 | |
|
|
5418f2dc3c | |
|
|
383b0fdd75 | |
|
|
060f7d24c4 | |
|
|
3c366cac13 |
|
|
@ -93,7 +93,7 @@ testing = [
|
||||||
repl = [
|
repl = [
|
||||||
"pyperclip>=1.9.0",
|
"pyperclip>=1.9.0",
|
||||||
"prompt-toolkit>=3.0.50",
|
"prompt-toolkit>=3.0.50",
|
||||||
"xonsh>=0.22.8",
|
"xonsh>=0.23.0",
|
||||||
"psutil>=7.0.0",
|
"psutil>=7.0.0",
|
||||||
]
|
]
|
||||||
lint = [
|
lint = [
|
||||||
|
|
@ -134,7 +134,7 @@ sync_pause = {requires-python = ">=3.13, <3.14"}
|
||||||
# xonsh = { git = 'https://github.com/anki-code/xonsh.git', branch = 'prompt_next_suggestion' }
|
# xonsh = { git = 'https://github.com/anki-code/xonsh.git', branch = 'prompt_next_suggestion' }
|
||||||
# ^ https://github.com/xonsh/xonsh/pull/6048
|
# ^ https://github.com/xonsh/xonsh/pull/6048
|
||||||
# xonsh = { git = 'https://github.com/xonsh/xonsh.git', branch = 'main' }
|
# xonsh = { git = 'https://github.com/xonsh/xonsh.git', branch = 'main' }
|
||||||
xonsh = { path = "../xonsh", editable = true }
|
# xonsh = { path = "../xonsh", editable = true }
|
||||||
|
|
||||||
# [tool.uv.sources.pdbp]
|
# [tool.uv.sources.pdbp]
|
||||||
# XXX, in case we need to tmp patch again.
|
# XXX, in case we need to tmp patch again.
|
||||||
|
|
@ -203,7 +203,35 @@ all_bullets = true
|
||||||
|
|
||||||
[tool.pytest.ini_options]
|
[tool.pytest.ini_options]
|
||||||
minversion = '6.0'
|
minversion = '6.0'
|
||||||
timeout = 200 # per-test hard limit
|
# NOTE: `pytest-timeout`'s global per-test cap is intentionally
|
||||||
|
# NOT set — both of its enforcement methods break trio's
|
||||||
|
# runtime under our fork-based spawn backends:
|
||||||
|
#
|
||||||
|
# - `method='signal'` (the default; SIGALRM) raises `Failed`
|
||||||
|
# synchronously from the signal handler in trio's main
|
||||||
|
# thread, which leaves `GLOBAL_RUN_CONTEXT` half-installed
|
||||||
|
# ("Trio guest run got abandoned"). EVERY subsequent
|
||||||
|
# `trio.run()` in the same pytest session then bails with
|
||||||
|
# `RuntimeError: Attempted to call run() from inside a
|
||||||
|
# run()` — full-session poison: a single 200s hang
|
||||||
|
# cascades into 30+ false-positive failures across
|
||||||
|
# downstream test files.
|
||||||
|
#
|
||||||
|
# - `method='thread'` calls `_thread.interrupt_main()` which
|
||||||
|
# can let the resulting `KeyboardInterrupt` escape trio's
|
||||||
|
# `KIManager` under fork-cascade teardown races, killing
|
||||||
|
# the whole pytest session.
|
||||||
|
#
|
||||||
|
# For tests that legitimately need a wall-clock cap, use
|
||||||
|
# `with trio.fail_after(N):` INSIDE the test — trio's own
|
||||||
|
# Cancelled machinery handles the timeout cleanly through
|
||||||
|
# the actor nursery without disturbing global state. See
|
||||||
|
# `tests/test_advanced_streaming.py::test_dynamic_pub_sub`'s
|
||||||
|
# module-level NOTE for the canonical pattern.
|
||||||
|
#
|
||||||
|
# CI environments should rely on job-level wall-clock
|
||||||
|
# timeouts (e.g. GitHub Actions `timeout-minutes`) for an
|
||||||
|
# escape hatch on genuinely-stuck suites.
|
||||||
# https://docs.pytest.org/en/stable/reference/reference.html#configuration-options
|
# https://docs.pytest.org/en/stable/reference/reference.html#configuration-options
|
||||||
testpaths = [
|
testpaths = [
|
||||||
'tests'
|
'tests'
|
||||||
|
|
|
||||||
|
|
@ -55,12 +55,37 @@ async def maybe_expect_raises(
|
||||||
raises: BaseException|None = None,
|
raises: BaseException|None = None,
|
||||||
ensure_in_message: list[str]|None = None,
|
ensure_in_message: list[str]|None = None,
|
||||||
post_mortem: bool = False,
|
post_mortem: bool = False,
|
||||||
timeout: int = 3,
|
# NOTE, `None` selects a backend-aware default below —
|
||||||
|
# see `_BACKEND_TIMEOUT_DEFAULTS` for rationale. Caller
|
||||||
|
# can override with an explicit value to opt out.
|
||||||
|
timeout: int|None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
'''
|
'''
|
||||||
Async wrapper for ensuring errors propagate from the inner scope.
|
Async wrapper for ensuring errors propagate from the inner scope.
|
||||||
|
|
||||||
'''
|
'''
|
||||||
|
if timeout is None:
|
||||||
|
# Pick a backend-aware default. Fork-based backends
|
||||||
|
# (`main_thread_forkserver`) need much more headroom
|
||||||
|
# because actor spawn + IPC ctx-exit + msg-validation
|
||||||
|
# error path takes longer than under `trio` backend
|
||||||
|
# — especially under cross-pytest-stream contention
|
||||||
|
# (#451). `test_basic_payload_spec` empirically:
|
||||||
|
# - 3s flaked all-valid variant (`TooSlowError`)
|
||||||
|
# - 8s flaked `invalid-return` variant
|
||||||
|
# (`Cancelled` surfaced instead of `MsgTypeError`
|
||||||
|
# because `fail_after` fired mid-error-path)
|
||||||
|
# - 15s flaked under cross-stream contention
|
||||||
|
# 30s for fork-based gives plenty of headroom while
|
||||||
|
# still failing-loud on a genuine hang. Other
|
||||||
|
# backends keep the original 3s.
|
||||||
|
from tractor.spawn import _spawn as _spawn_mod
|
||||||
|
timeout = (
|
||||||
|
30
|
||||||
|
if _spawn_mod._spawn_method == 'main_thread_forkserver'
|
||||||
|
else 3
|
||||||
|
)
|
||||||
|
|
||||||
if tractor.debug_mode():
|
if tractor.debug_mode():
|
||||||
timeout += 999
|
timeout += 999
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -157,17 +157,26 @@ def test_dynamic_pub_sub(
|
||||||
from multiprocessing import cpu_count
|
from multiprocessing import cpu_count
|
||||||
cpus = cpu_count()
|
cpus = cpu_count()
|
||||||
|
|
||||||
|
# Hard safety cap via trio's own cancellation — see the
|
||||||
|
# module-level NOTE on why we avoid `pytest-timeout` for
|
||||||
|
# this test. Picked backend-aware: under `trio` backend
|
||||||
|
# spawn is cheap (~1s for `cpus` actors) but fork-based
|
||||||
|
# backends pay a per-spawn cost (forkserver round-trip +
|
||||||
|
# IPC peer-handshake) that can stack up over `cpus - 1`
|
||||||
|
# sequential `n.run_in_actor()` calls — especially on UDS
|
||||||
|
# under cross-pytest contention (#451 / #452). Empirically
|
||||||
|
# 12s flakes on `main_thread_forkserver`; 30s gives
|
||||||
|
# plenty of headroom while still failing-loud on a real
|
||||||
|
# hang.
|
||||||
|
from tractor.spawn import _spawn as _spawn_mod
|
||||||
|
fail_after_s: int = (
|
||||||
|
30
|
||||||
|
if _spawn_mod._spawn_method == 'main_thread_forkserver'
|
||||||
|
else 12
|
||||||
|
)
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
# Hard safety cap via trio's own cancellation — see
|
with trio.fail_after(fail_after_s):
|
||||||
# the module-level NOTE on why we avoid `pytest-timeout`
|
|
||||||
# for this test. Total expected runtime: ~1s spawn + 3s
|
|
||||||
# sleep + ~1-2s cancel cascade ≈ 5-6s. 12s gives plenty
|
|
||||||
# of headroom; if exceeded, trio raises `TooSlowError`
|
|
||||||
# which the outer `try` block treats as a hang report
|
|
||||||
# (or, if `expect_cancel_exc is trio.TooSlowError`, as
|
|
||||||
# the test passing — either way, no global state
|
|
||||||
# corruption).
|
|
||||||
with trio.fail_after(12):
|
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
registry_addrs=[reg_addr],
|
registry_addrs=[reg_addr],
|
||||||
debug_mode=debug_mode,
|
debug_mode=debug_mode,
|
||||||
|
|
|
||||||
|
|
@ -213,6 +213,21 @@ def pytest_addoption(
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
parser.addoption(
|
||||||
|
"--enable-stackscope",
|
||||||
|
action="store_true",
|
||||||
|
dest='tractor_enable_stackscope',
|
||||||
|
default=False,
|
||||||
|
help=(
|
||||||
|
'Install `stackscope` SIGUSR1 handler in pytest + '
|
||||||
|
'every spawned subactor for live trio task-tree '
|
||||||
|
'dumps during hang investigations. Lighter than '
|
||||||
|
'`--tpdb` (no pdb machinery / tty-lock contention) '
|
||||||
|
'— use when you only need stack visibility. To '
|
||||||
|
'capture: `kill -USR1 <pytest-or-subactor-pid>`.'
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
# provide which IPC transport protocols opting-in test suites
|
# provide which IPC transport protocols opting-in test suites
|
||||||
# should accumulatively run against.
|
# should accumulatively run against.
|
||||||
parser.addoption(
|
parser.addoption(
|
||||||
|
|
@ -253,6 +268,37 @@ def pytest_configure(
|
||||||
'in `ai/conc-anal/subint_sigint_starvation_issue.md`).'
|
'in `ai/conc-anal/subint_sigint_starvation_issue.md`).'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# `--enable-stackscope`: install SIGUSR1 → trio task-tree
|
||||||
|
# dump in pytest itself + propagate to every subactor via
|
||||||
|
# an env var that fork-children inherit and the runtime
|
||||||
|
# gate honors. Lighter than `--tpdb` (no pdb machinery) —
|
||||||
|
# purely for hang-investigation stack visibility.
|
||||||
|
if getattr(
|
||||||
|
config.option, 'tractor_enable_stackscope', False
|
||||||
|
):
|
||||||
|
import os
|
||||||
|
# Env var inherited via fork → subactor's runtime
|
||||||
|
# picks it up at `Actor.async_main` startup. See the
|
||||||
|
# gate in `tractor.runtime._runtime` matching this
|
||||||
|
# var name.
|
||||||
|
os.environ['TRACTOR_ENABLE_STACKSCOPE'] = '1'
|
||||||
|
|
||||||
|
# Install in pytest itself so `kill -USR1 <pytest>`
|
||||||
|
# dumps the parent trio task-tree (which is where
|
||||||
|
# most Mode-A-class hangs park).
|
||||||
|
try:
|
||||||
|
from tractor.devx._stackscope import (
|
||||||
|
enable_stack_on_sig,
|
||||||
|
)
|
||||||
|
enable_stack_on_sig()
|
||||||
|
except ImportError:
|
||||||
|
import warnings
|
||||||
|
warnings.warn(
|
||||||
|
'`stackscope` not installed — '
|
||||||
|
'--enable-stackscope is a no-op. '
|
||||||
|
'Install via the `devx` dep group.'
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def pytest_collection_modifyitems(
|
def pytest_collection_modifyitems(
|
||||||
config: pytest.Config,
|
config: pytest.Config,
|
||||||
|
|
|
||||||
|
|
@ -66,7 +66,20 @@ def dump_task_tree() -> None:
|
||||||
Do a classic `stackscope.extract()` task-tree dump to console at
|
Do a classic `stackscope.extract()` task-tree dump to console at
|
||||||
`.devx()` level.
|
`.devx()` level.
|
||||||
|
|
||||||
|
Also unconditionally tee the rendered tree to two
|
||||||
|
capture-bypassing sinks so SIGUSR1 dumps remain visible
|
||||||
|
when the parent process has captured stdio (e.g. pytest's
|
||||||
|
default `--capture=fd`):
|
||||||
|
|
||||||
|
- `/tmp/tractor-stackscope-<pid>.log` (append-mode, always
|
||||||
|
written) — guaranteed-readable artifact even under CI
|
||||||
|
/ `nohup` / no-tty conditions. `tail -f` to follow.
|
||||||
|
- `/dev/tty` if a controlling terminal is attached —
|
||||||
|
best-effort, ignored if the device is missing or write
|
||||||
|
fails. pytest never captures the tty.
|
||||||
|
|
||||||
'''
|
'''
|
||||||
|
import os
|
||||||
import stackscope
|
import stackscope
|
||||||
tree_str: str = str(
|
tree_str: str = str(
|
||||||
stackscope.extract(
|
stackscope.extract(
|
||||||
|
|
@ -96,7 +109,7 @@ def dump_task_tree() -> None:
|
||||||
# |_{Supervisor/Scope
|
# |_{Supervisor/Scope
|
||||||
# |_[Storage/Memory/IPC-Stream/Data-Struct
|
# |_[Storage/Memory/IPC-Stream/Data-Struct
|
||||||
|
|
||||||
log.devx(
|
full_dump: str = (
|
||||||
f'Dumping `stackscope` tree for actor\n'
|
f'Dumping `stackscope` tree for actor\n'
|
||||||
f'(>: {actor.uid!r}\n'
|
f'(>: {actor.uid!r}\n'
|
||||||
f' |_{mp.current_process()}\n'
|
f' |_{mp.current_process()}\n'
|
||||||
|
|
@ -105,33 +118,35 @@ def dump_task_tree() -> None:
|
||||||
f'\n'
|
f'\n'
|
||||||
f'{sigint_handler_report}\n'
|
f'{sigint_handler_report}\n'
|
||||||
f'signal.getsignal(SIGINT) -> {current_sigint_handler!r}\n'
|
f'signal.getsignal(SIGINT) -> {current_sigint_handler!r}\n'
|
||||||
# f'\n'
|
|
||||||
# start-of-trace-tree delimiter (mostly for testing)
|
|
||||||
# f'------ {actor.uid!r} ------\n'
|
|
||||||
f'\n'
|
f'\n'
|
||||||
f'------ start-of-{actor.uid!r} ------\n'
|
f'------ start-of-{actor.uid!r} ------\n'
|
||||||
f'|\n'
|
f'|\n'
|
||||||
f'{tree_str}'
|
f'{tree_str}'
|
||||||
# end-of-trace-tree delimiter (mostly for testing)
|
|
||||||
f'|\n'
|
f'|\n'
|
||||||
f'|_____ end-of-{actor.uid!r} ______\n'
|
f'|_____ end-of-{actor.uid!r} ______\n'
|
||||||
)
|
)
|
||||||
# TODO: can remove this right?
|
log.devx(full_dump)
|
||||||
# -[ ] was original code from author
|
|
||||||
#
|
# NOTE, capture-bypass sinks. Pytest's default
|
||||||
# print(
|
# `--capture=fd` swallows `log.devx()` above; the
|
||||||
# 'DUMPING FROM PRINT\n'
|
# following two writes guarantee the dump reaches the
|
||||||
# +
|
# human even when stdio is captured.
|
||||||
# content
|
fpath: str = f'/tmp/tractor-stackscope-{os.getpid()}.log'
|
||||||
# )
|
try:
|
||||||
# import logging
|
with open(fpath, 'a') as f:
|
||||||
# try:
|
f.write(full_dump + '\n')
|
||||||
# with open("/dev/tty", "w") as tty:
|
except OSError:
|
||||||
# tty.write(tree_str)
|
log.exception(
|
||||||
# except BaseException:
|
f'Failed to tee stackscope dump to {fpath!r}'
|
||||||
# logging.getLogger(
|
)
|
||||||
# "task_tree"
|
|
||||||
# ).exception("Error printing task tree")
|
try:
|
||||||
|
with open('/dev/tty', 'w') as tty:
|
||||||
|
tty.write(full_dump + '\n')
|
||||||
|
except OSError:
|
||||||
|
# no controlling tty (CI / nohup / detached) —
|
||||||
|
# silently fall through; the file sink covers it.
|
||||||
|
pass
|
||||||
|
|
||||||
_handler_lock = RLock()
|
_handler_lock = RLock()
|
||||||
_tree_dumped: bool = False
|
_tree_dumped: bool = False
|
||||||
|
|
@ -233,7 +248,20 @@ def enable_stack_on_sig(
|
||||||
|
|
||||||
'''
|
'''
|
||||||
try:
|
try:
|
||||||
import stackscope
|
# NOTE, `stackscope._glue` does intentional async-gen type
|
||||||
|
# introspection at import-time which trips
|
||||||
|
# `RuntimeWarning: coroutine method 'asend'/'athrow' was
|
||||||
|
# never awaited`. Benign — they only want the wrapper
|
||||||
|
# type — but visible to users. Squelch the import-only
|
||||||
|
# warning so SIGUSR1 setup stays quiet.
|
||||||
|
import warnings
|
||||||
|
with warnings.catch_warnings():
|
||||||
|
warnings.filterwarnings(
|
||||||
|
'ignore',
|
||||||
|
category=RuntimeWarning,
|
||||||
|
message=r"coroutine method '(asend|athrow)' .* was never awaited",
|
||||||
|
)
|
||||||
|
import stackscope
|
||||||
except ImportError:
|
except ImportError:
|
||||||
log.warning(
|
log.warning(
|
||||||
'The `stackscope` lib is not installed!\n'
|
'The `stackscope` lib is not installed!\n'
|
||||||
|
|
|
||||||
|
|
@ -932,7 +932,20 @@ class Actor:
|
||||||
# => update process-wide globals
|
# => update process-wide globals
|
||||||
# TODO! -[ ] another `Struct` for rtvs..
|
# TODO! -[ ] another `Struct` for rtvs..
|
||||||
rvs: dict[str, Any] = spawnspec._runtime_vars
|
rvs: dict[str, Any] = spawnspec._runtime_vars
|
||||||
if rvs['_debug_mode']:
|
|
||||||
|
# `stackscope` SIGUSR1 handler: install when EITHER
|
||||||
|
# `_debug_mode=True` (full multi-actor pdb support
|
||||||
|
# path) OR the `TRACTOR_ENABLE_STACKSCOPE` env var
|
||||||
|
# is set (lighter test-time hang-debug path; see
|
||||||
|
# `tractor._testing.pytest`'s `--enable-stackscope`
|
||||||
|
# CLI flag — env var propagates via fork-inherited
|
||||||
|
# environ).
|
||||||
|
import os
|
||||||
|
if (
|
||||||
|
rvs['_debug_mode']
|
||||||
|
or
|
||||||
|
os.environ.get('TRACTOR_ENABLE_STACKSCOPE')
|
||||||
|
):
|
||||||
from ..devx import (
|
from ..devx import (
|
||||||
enable_stack_on_sig,
|
enable_stack_on_sig,
|
||||||
maybe_init_greenback,
|
maybe_init_greenback,
|
||||||
|
|
@ -948,7 +961,8 @@ class Actor:
|
||||||
|
|
||||||
except ImportError:
|
except ImportError:
|
||||||
log.warning(
|
log.warning(
|
||||||
'`stackscope` not installed for use in debug mode!'
|
'`stackscope` not installed for use in '
|
||||||
|
'debug mode / `--enable-stackscope`!'
|
||||||
)
|
)
|
||||||
|
|
||||||
if rvs.get('use_greenback', False):
|
if rvs.get('use_greenback', False):
|
||||||
|
|
|
||||||
|
|
@ -38,6 +38,7 @@ Two empirical CPython properties drive the design:
|
||||||
the forked child otherwise (`Fatal Python error: not main
|
the forked child otherwise (`Fatal Python error: not main
|
||||||
interpreter`). Full source-level walkthrough:
|
interpreter`). Full source-level walkthrough:
|
||||||
`ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md`.
|
`ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md`.
|
||||||
|
|
||||||
2. **`os.fork()` from a regular `threading.Thread` attached to
|
2. **`os.fork()` from a regular `threading.Thread` attached to
|
||||||
the *main* interpreter — i.e. a worker thread that has never
|
the *main* interpreter — i.e. a worker thread that has never
|
||||||
entered a subint — works cleanly.** Empirically validated
|
entered a subint — works cleanly.** Empirically validated
|
||||||
|
|
@ -86,9 +87,11 @@ costs:
|
||||||
|
|
||||||
- **Sidecar lifecycle**: a second long-lived process per
|
- **Sidecar lifecycle**: a second long-lived process per
|
||||||
parent, with its own start/stop/health-check semantics.
|
parent, with its own start/stop/health-check semantics.
|
||||||
|
|
||||||
- **IPC overhead per spawn**: every actor-spawn round-trips
|
- **IPC overhead per spawn**: every actor-spawn round-trips
|
||||||
an `mp` request message through a unix socket before any
|
an `mp` request message through a unix socket before any
|
||||||
child code runs.
|
child code runs.
|
||||||
|
|
||||||
- **State isolation by process boundary**: the sidecar can't
|
- **State isolation by process boundary**: the sidecar can't
|
||||||
share parent state at all — every spawn is a "cold" child
|
share parent state at all — every spawn is a "cold" child
|
||||||
re-importing modules from disk.
|
re-importing modules from disk.
|
||||||
|
|
@ -106,6 +109,7 @@ For the full variant-2 picture see
|
||||||
1) we already get costs 1 + 2 collapsed; cost 3 will land
|
1) we already get costs 1 + 2 collapsed; cost 3 will land
|
||||||
when msgspec#1026 unblocks isolated-mode subints.
|
when msgspec#1026 unblocks isolated-mode subints.
|
||||||
|
|
||||||
|
|
||||||
What survives the fork? — POSIX semantics
|
What survives the fork? — POSIX semantics
|
||||||
-----------------------------------------
|
-----------------------------------------
|
||||||
|
|
||||||
|
|
@ -113,33 +117,58 @@ A natural worry when forking from a parent that's running
|
||||||
`trio.run()` on another thread: does that trio thread (and
|
`trio.run()` on another thread: does that trio thread (and
|
||||||
any other threads in the parent) keep running in the child?
|
any other threads in the parent) keep running in the child?
|
||||||
|
|
||||||
**No.** POSIX `fork()` only preserves the *calling* thread
|
**No** — but with a precise meaning that's worth pinning
|
||||||
in the child. Every other thread in the parent — trio's
|
down, since the canonical trio framing
|
||||||
runner thread, any `to_thread` cache threads, anything else
|
([python-trio/trio#1614](https://github.com/python-trio/trio/issues/1614))
|
||||||
— is gone the instant `fork()` returns in the child.
|
puts it the opposite-sounding way:
|
||||||
|
|
||||||
|
> If you use `fork()` in a process with multiple threads,
|
||||||
|
> all the other thread stacks are just leaked: there's
|
||||||
|
> nothing else you can reasonably do with them.
|
||||||
|
|
||||||
|
Both statements describe the same POSIX reality from
|
||||||
|
opposite sides:
|
||||||
|
|
||||||
|
- **Execution-side ("gone")**: POSIX `fork()` only
|
||||||
|
preserves the *calling* thread as a runnable thread in
|
||||||
|
the child. Every other thread in the parent — trio's
|
||||||
|
runner thread, any `to_thread` cache threads, anything
|
||||||
|
else — never executes another instruction post-fork.
|
||||||
|
|
||||||
|
- **Memory-side ("leaked")**: those non-running threads'
|
||||||
|
*stacks* and per-thread heap structures are still
|
||||||
|
COW-inherited into the child's address space. They
|
||||||
|
persist as orphaned bytes with no owning thread, no
|
||||||
|
scheduler entry, and no way for the child to clean
|
||||||
|
them up — hence trio's word "leaked".
|
||||||
|
|
||||||
Concretely, after the forkserver worker calls `os.fork()`:
|
Concretely, after the forkserver worker calls `os.fork()`:
|
||||||
|
|
||||||
| thread | parent | child |
|
| thread | parent | child (executing) | child (memory) |
|
||||||
|-----------------------|-----------|---------------|
|
|---------------------|-----------|-------------------|-----------------------------|
|
||||||
| forkserver worker | continues | sole survivor |
|
| forkserver worker | continues | sole survivor | live stack |
|
||||||
| `trio.run()` thread | continues | gone |
|
| `trio.run()` thread | continues | not running | leaked stack (zombie bytes) |
|
||||||
| any other thread | continues | gone |
|
| any other thread | continues | not running | leaked stack (zombie bytes) |
|
||||||
|
|
||||||
The forkserver worker becomes the new "main" execution
|
The forkserver worker becomes the new "main" execution
|
||||||
context in the child; `trio.run()` and every other parent
|
context in the child; `trio.run()` and every other parent
|
||||||
thread never executes a single instruction post-fork in the
|
thread never executes a single instruction post-fork.
|
||||||
child.
|
Their stack memory rides along as inert COW pages until
|
||||||
|
the child's fresh `trio.run()` boots and overwrites/GCs
|
||||||
|
it (or until the child `exec()`s and discards the entire
|
||||||
|
image).
|
||||||
|
|
||||||
This is exactly *why* `os.fork()` is delegated to a
|
This is exactly *why* `os.fork()` is delegated to a
|
||||||
dedicated worker thread that has provably never entered
|
dedicated worker thread that has provably never entered
|
||||||
trio: we want that trio-free thread to be the surviving
|
trio: we want that trio-free thread to be the surviving
|
||||||
one in the child.
|
*executing* thread in the child, with the leaked trio
|
||||||
|
stack reduced to inert COW pages we don't touch.
|
||||||
|
|
||||||
That said, dead-thread *artifacts* still cross the fork
|
The leaked-stack residue is one slice of the broader
|
||||||
boundary (canonical "fork in a multithreaded program is
|
"fork in a multithreaded program is dangerous" hazard
|
||||||
dangerous" — see `man pthread_atfork`). What persists, and
|
class (see `man pthread_atfork`). Other dead-thread
|
||||||
how we handle each:
|
artifacts that cross the fork boundary, and how we handle
|
||||||
|
each:
|
||||||
|
|
||||||
- **Inherited file descriptors** — the dead trio thread's
|
- **Inherited file descriptors** — the dead trio thread's
|
||||||
epoll fd, signal-wakeup-fd, eventfds, sockets, IPC
|
epoll fd, signal-wakeup-fd, eventfds, sockets, IPC
|
||||||
|
|
@ -148,16 +177,20 @@ how we handle each:
|
||||||
`_close_inherited_fds()` in the child prelude — walks
|
`_close_inherited_fds()` in the child prelude — walks
|
||||||
`/proc/self/fd` and closes everything except stdio +
|
`/proc/self/fd` and closes everything except stdio +
|
||||||
the channel pipe to the forkserver.
|
the channel pipe to the forkserver.
|
||||||
|
|
||||||
- **Memory image** — trio's internal data structures
|
- **Memory image** — trio's internal data structures
|
||||||
(scheduler, task queues, runner state) sit in COW
|
(scheduler, task queues, runner state) sit in COW
|
||||||
memory but nobody's executing them. Get GC'd /
|
memory alongside the leaked stacks above. Nobody's
|
||||||
overwritten when the child's fresh `trio.run()` boots.
|
executing them; they get GC'd / overwritten when the
|
||||||
|
child's fresh `trio.run()` boots.
|
||||||
|
|
||||||
- **Python thread state** — handled automatically by
|
- **Python thread state** — handled automatically by
|
||||||
CPython. `PyOS_AfterFork_Child()` calls
|
CPython. `PyOS_AfterFork_Child()` calls
|
||||||
`_PyThreadState_DeleteExceptCurrent()`, so dead
|
`_PyThreadState_DeleteExceptCurrent()`, so dead
|
||||||
`PyThreadState` objects are cleaned and
|
`PyThreadState` objects are cleaned and
|
||||||
`threading.enumerate()` returns just the surviving
|
`threading.enumerate()` returns just the surviving
|
||||||
thread.
|
thread.
|
||||||
|
|
||||||
- **User-level locks (`threading.Lock`)** —
|
- **User-level locks (`threading.Lock`)** —
|
||||||
held-by-dead-thread state is the canonical fork hazard.
|
held-by-dead-thread state is the canonical fork hazard.
|
||||||
Not an issue in practice for tractor: trio doesn't hold
|
Not an issue in practice for tractor: trio doesn't hold
|
||||||
|
|
@ -166,6 +199,7 @@ how we handle each:
|
||||||
either direction). CPython's GIL is auto-reset by the
|
either direction). CPython's GIL is auto-reset by the
|
||||||
fork callback.
|
fork callback.
|
||||||
|
|
||||||
|
|
||||||
FYI: how this dodges the `trio.run()` × `fork()` hazards
|
FYI: how this dodges the `trio.run()` × `fork()` hazards
|
||||||
--------------------------------------------------------
|
--------------------------------------------------------
|
||||||
|
|
||||||
|
|
@ -183,13 +217,16 @@ design dodges each class explicitly:
|
||||||
reader. *Dodge*: the inherited wakeup-fd is closed by
|
reader. *Dodge*: the inherited wakeup-fd is closed by
|
||||||
`_close_inherited_fds()`, then the child's own
|
`_close_inherited_fds()`, then the child's own
|
||||||
`trio.run()` installs a fresh one.
|
`trio.run()` installs a fresh one.
|
||||||
|
|
||||||
- **`epoll`/`kqueue` instance**: trio's I/O backend holds
|
- **`epoll`/`kqueue` instance**: trio's I/O backend holds
|
||||||
one. Inherited as a dead fd; same fix as above.
|
one. Inherited as a dead fd; same fix as above.
|
||||||
|
|
||||||
- **Threadpool cache threads** (`trio.to_thread`): worker
|
- **Threadpool cache threads** (`trio.to_thread`): worker
|
||||||
threads with cached tstate. Don't exist in the child
|
threads with cached tstate. Don't exist in the child
|
||||||
(POSIX); cache state is meaningless garbage that gets
|
(POSIX); cache state is meaningless garbage that gets
|
||||||
reset when the child's trio.run() initializes its own
|
reset when the child's trio.run() initializes its own
|
||||||
thread cache.
|
thread cache.
|
||||||
|
|
||||||
- **Cancel scopes / nurseries / open `trio.Process` /
|
- **Cancel scopes / nurseries / open `trio.Process` /
|
||||||
open sockets**: these are trio-runtime objects, not
|
open sockets**: these are trio-runtime objects, not
|
||||||
kernel objects. The runtime that owns them is gone in
|
kernel objects. The runtime that owns them is gone in
|
||||||
|
|
@ -197,9 +234,11 @@ design dodges each class explicitly:
|
||||||
in COW memory and get overwritten as the child runs.
|
in COW memory and get overwritten as the child runs.
|
||||||
Inherited *kernel* fds those objects wrapped (sockets,
|
Inherited *kernel* fds those objects wrapped (sockets,
|
||||||
proc pipes) are caught by `_close_inherited_fds()`.
|
proc pipes) are caught by `_close_inherited_fds()`.
|
||||||
|
|
||||||
- **`atexit` handlers**: trio doesn't register any that
|
- **`atexit` handlers**: trio doesn't register any that
|
||||||
would mis-fire post-fork; trio's lifetime-stack is
|
would mis-fire post-fork; trio's lifetime-stack is
|
||||||
all `with`-block-scoped and dies with the runner.
|
all `with`-block-scoped and dies with the runner.
|
||||||
|
|
||||||
- **Foreign-language I/O state** (libcurl, OpenSSL session
|
- **Foreign-language I/O state** (libcurl, OpenSSL session
|
||||||
caches, etc.): out of scope — same hazard as any
|
caches, etc.): out of scope — same hazard as any
|
||||||
fork-without-exec; users layering those on top of
|
fork-without-exec; users layering those on top of
|
||||||
|
|
@ -211,6 +250,7 @@ isolation + `_close_inherited_fds()` cleanup gives the
|
||||||
forked child a clean trio environment. Everything else
|
forked child a clean trio environment. Everything else
|
||||||
falls under the standard fork-without-exec disclaimer.
|
falls under the standard fork-without-exec disclaimer.
|
||||||
|
|
||||||
|
|
||||||
Implementation status
|
Implementation status
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
|
|
@ -231,10 +271,11 @@ follow-up) including the
|
||||||
|
|
||||||
Still-open work (tracked on tractor #379):
|
Still-open work (tracked on tractor #379):
|
||||||
|
|
||||||
- no cancellation / hard-kill stress coverage yet
|
- [ ] no cancellation / hard-kill stress coverage yet
|
||||||
(counterpart to `tests/test_subint_cancellation.py` for
|
(counterpart to `tests/test_subint_cancellation.py` for
|
||||||
the plain `subint` backend),
|
the plain `subint` backend),
|
||||||
- `child_sigint='trio'` mode (flag scaffolded below; default
|
|
||||||
|
- [ ] `child_sigint='trio'` mode (flag scaffolded below; default
|
||||||
is `'ipc'`). Originally intended as a manual SIGINT →
|
is `'ipc'`). Originally intended as a manual SIGINT →
|
||||||
trio-cancel bridge, but investigation showed trio's
|
trio-cancel bridge, but investigation showed trio's
|
||||||
handler IS already correctly installed in the fork-child
|
handler IS already correctly installed in the fork-child
|
||||||
|
|
@ -287,18 +328,22 @@ See also
|
||||||
- `tractor.spawn._subint_forkserver` — variant-2 placeholder
|
- `tractor.spawn._subint_forkserver` — variant-2 placeholder
|
||||||
module; reserved for the future subint-isolated-child
|
module; reserved for the future subint-isolated-child
|
||||||
runtime once jcrist/msgspec#1026 unblocks.
|
runtime once jcrist/msgspec#1026 unblocks.
|
||||||
|
|
||||||
- `tractor.spawn._subint_fork` — the stub for the
|
- `tractor.spawn._subint_fork` — the stub for the
|
||||||
fork-from-non-main-subint strategy that DIDN'T work (kept
|
fork-from-non-main-subint strategy that DIDN'T work (kept
|
||||||
in-tree as documentation of the attempt + the CPython-level
|
in-tree as documentation of the attempt + the CPython-level
|
||||||
block).
|
block).
|
||||||
|
|
||||||
- `ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md`
|
- `ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md`
|
||||||
— CPython source walkthrough of why fork-from-subint is dead.
|
— CPython source walkthrough of why fork-from-subint is dead.
|
||||||
|
|
||||||
- `ai/conc-anal/subint_fork_from_main_thread_smoketest.py`
|
- `ai/conc-anal/subint_fork_from_main_thread_smoketest.py`
|
||||||
— standalone feasibility check (delegates to this module
|
— standalone feasibility check (delegates to this module
|
||||||
for the primitives it exercises).
|
for the primitives it exercises).
|
||||||
|
|
||||||
'''
|
'''
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
import errno
|
||||||
import os
|
import os
|
||||||
import signal
|
import signal
|
||||||
import sys
|
import sys
|
||||||
|
|
@ -423,9 +468,24 @@ def _close_inherited_fds(
|
||||||
try:
|
try:
|
||||||
os.close(fd)
|
os.close(fd)
|
||||||
closed += 1
|
closed += 1
|
||||||
except OSError:
|
except OSError as oserr:
|
||||||
# fd was already closed (race with listdir) or otherwise
|
# `EBADF` is the benign-and-expected case: the
|
||||||
# unclosable — either is fine.
|
# `os.listdir('/proc/self/fd')` call above itself
|
||||||
|
# opens a transient dirfd that ends up in
|
||||||
|
# `candidates`, then auto-closes before this loop
|
||||||
|
# reaches it. Same for any fd whose Python wrapper
|
||||||
|
# was GC'd between `listdir` and `os.close`.
|
||||||
|
# Suppress at debug-level — surfacing every
|
||||||
|
# EBADF as a full traceback (prior `log.exception`
|
||||||
|
# behavior) drowned the post-fork log channel.
|
||||||
|
if oserr.errno == errno.EBADF:
|
||||||
|
log.debug(
|
||||||
|
f'Skip already-closed inherited fd {fd!r} '
|
||||||
|
f'(EBADF, benign race with listdir)\n'
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
# Other errnos (EIO / EPERM / EINTR / ...) are
|
||||||
|
# genuinely unexpected — keep the loud surface.
|
||||||
log.exception(
|
log.exception(
|
||||||
f'Failed to close inherited fd in child ??\n'
|
f'Failed to close inherited fd in child ??\n'
|
||||||
f'{fd!r}\n'
|
f'{fd!r}\n'
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue