Compare commits
399 Commits
main ... subint_for
| Author | SHA1 | Date |
|---|---|---|
| | 7ee0dc2e8f | |
| | b10011a36e | |
| | 7d0a53d205 | |
| | 75d5b4cf7b | |
| | 8aa07a7932 | |
| | 10db117864 | |
| | 83b6a3373a | |
| | 9bbb6f796b | |
| | a24600f1a7 | |
| | 92443dc4ef | |
| | d3cbc92751 | |
| | 099104e0af | |
| | abd3950ba6 | |
| | 7d1e4462d4 | |
| | 522b57570b | |
| | d60245777e | |
| | caebf60f4e | |
| | 3b0724eba8 | |
| | cec6cc2a56 | |
| | 34f333a026 | |
| | 38ffb875bd | |
| | 4c00913b3b | |
| | 5cd06810db | |
| | 255c9c3a7c | |
| | 0f4e671862 | |
| | d036ef7d7f | |
| | 7882c37ce0 | |
| | 2ee44a6fdd | |
| | 7b14fdcd96 | |
| | e4953851de | |
| | c4082be876 | |
| | ec8c4659c4 | |
| | 29f9928524 | |
| | 086e9f2c07 | |
| | 9031605807 | |
| | c4885f9d99 | |
| | 60ce713016 | |
| | 0ef549fadb | |
| | e9712dcaeb | |
| | 5cf0312c78 | |
| | 32e89c67ee | |
| | d549c72052 | |
| | 5a9926fc32 | |
| | 72a0465c52 | |
| | 9431a81d37 | |
| | fc2e298a29 | |
| | 48523358cf | |
| | e2b790a70d | |
| | 61d4525137 | |
| | 0996a83655 | |
| | 1cdc7fb302 | |
| | 486249d74f | |
| | 8bc304f094 | |
| | fc5e80fea5 | |
| | b7115fc875 | |
| | 208e7c0926 | |
| | 22cdf15b73 | |
| | 532a9834f3 | |
| | 2917b74ba4 | |
| | 2d4995e08d | |
| | 8c730193f9 | |
| | 5418f2dc3c | |
| | 383b0fdd75 | |
| | 060f7d24c4 | |
| | 3c366cac13 | |
| | f8178df0fd | |
| | 530160fa69 | |
| | b376eb0332 | |
| | 7c5dd4d033 | |
| | cbdf1eb6db | |
| | 205382a39b | |
| | 9f0709eee2 | |
| | 5e83881f10 | |
| | 57dae0e4a6 | |
| | 99dade0fb3 | |
| | 4b5176e2c3 | |
| | 3ab99d557a | |
| | 54561959e6 | |
| | 66f1941f46 | |
| | 9b05f659b3 | |
| | 65fcfbf224 | |
| | 4f12d69b41 | |
| | aa3e230926 | |
| | c99d475d03 | |
| | 6d76b60404 | |
| | eae478f3d5 | |
| | 44bdb1697c | |
| | 2ca0f41e61 | |
| | b350aa09ee | |
| | d6e70e9de4 | |
| | 4c133ab541 | |
| | 4106ba73ea | |
| | eceed29d4a | |
| | e312a68d8a | |
| | 4d0555435b | |
| | ab86f7613d | |
| | 458a35cf09 | |
| | 7cd47ef7fb | |
| | 76d12060aa | |
| | 506617c695 | |
| | 8ac3dfeb85 | |
| | c20b05e181 | |
| | 9993db0193 | |
| | 35da808905 | |
| | 70d58c4bd2 | |
| | 1af2121057 | |
| | e3f4f5a387 | |
| | d093c31979 | |
| | 1e357dcf08 | |
| | e31eb8d7c9 | |
| | 8bcbe730bf | |
| | 5e85f184e0 | |
| | f5f37b69e6 | |
| | a72deef709 | |
| | dcd5c1ff40 | |
| | 76605d5609 | |
| | 7804a9feac | |
| | 63ab7c986b | |
| | 26914fde75 | |
| | cf2e71d87f | |
| | 25e400d526 | |
| | 82332fbceb | |
| | de4f470b6c | |
| | 0f48ed2eb9 | |
| | eee79a0357 | |
| | 4b2a0886c3 | |
| | 3b26b59dad | |
| | f3cea714bc | |
| | 985ea76de5 | |
| | 5998774535 | |
| | a6cbac954d | |
| | 189f4e3ffc | |
| | a65fded4c6 | |
| | 4a3254583b | |
| | 2ed5e6a6e8 | |
| | 34d9d482e4 | |
| | 09466a1e9d | |
| | 99541feec7 | |
| | c041518bdb | |
| | 31cbd11a5b | |
| | 8a8d01e076 | |
| | 03bf2b931e | |
| | b8f243e98d | |
| | d2ea8aa2de | |
| | d318f1f8f4 | |
| | 64ddc42ad8 | |
| | b524ee4633 | |
| | b1a0753a3f | |
| | ba86d482e3 | |
| | d3d6f646f9 | |
| | 9cf3d588e7 | |
| | e75e29b1dc | |
| | a7b1ee34ef | |
| | ae5b63c0bc | |
| | f75865fb2e | |
| | e0b8f23cbc | |
| | 8d662999a4 | |
| | d7ca68cf61 | |
| | b5b0504918 | |
| | de78a6445b | |
| | 5c98ab1fb6 | |
| | 3867403fab | |
| | 7c8e5a6732 | |
| | 3152f423d8 | |
| | ed65301d32 | |
| | 8817032c90 | |
| | 70dc60a199 | |
| | cd287c7e93 | |
| | 7b04b2cdfc | |
| | 75b07c4b7c | |
| | 86d4e0d3ed | |
| | ccb013a615 | |
| | c3d6cc9007 | |
| | cb7b76c44f | |
| | 23677f8a3c | |
| | 06ff2dd5f2 | |
| | a891e003b2 | |
| | e90241baaa | |
| | 7079a597c5 | |
| | bc60aa1ec5 | |
| | f881683c97 | |
| | 490fac432c | |
| | 5f6e45e1d4 | |
| | 5c4438bacc | |
| | 90ba0e3658 | |
| | 1f1e09a786 | |
| | 7cf3b5d00d | |
| | c72d495d68 | |
| | 926e861f52 | |
| | d9cb38372f | |
| | b4c975b48b | |
| | 8344537aa6 | |
| | 3a9f4ea383 | |
| | ca1b01f926 | |
| | 570c975f14 | |
| | a0a7668670 | |
| | 27bf566d75 | |
| | e8f1eca8d2 | |
| | 656c6c30d1 | |
| | acf6568275 | |
| | c6c591e61a | |
| | b883b27646 | |
| | 00637764d9 | |
| | ea971d25aa | |
| | 83b6c4270a | |
| | 6309c2e6fc | |
| | f5301d3fb0 | |
| | 9f8e9eb739 | |
| | 6b04650187 | |
| | a0607f363e | |
| | 2ee86ddb1a | |
| | 0286d36ed7 | |
| | 2db6f97130 | |
| | 9af6adc181 | |
| | 452a32fb23 | |
| | f47010d7e9 | |
| | 3f198bc86c | |
| | 391c8d3566 | |
| | 4fc477cfd6 | |
| | cd1cd03725 | |
| | febe587c6c | |
| | 4d3c5b9163 | |
| | 8b106b9144 | |
| | d6ece8eab3 | |
| | 8494eb9b8a | |
| | cab366cd65 | |
| | 85f9c5df6f | |
| | ebe9d5e4b5 | |
| | bbf01d5161 | |
| | ec8e8a2786 | |
| | c3d1ec22eb | |
| | 8f44efa327 | |
| | 5968a3c773 | |
| | 80597b80bf | |
| | a41c6d5c70 | |
| | 9c37b3f956 | |
| | 8f6bc56174 | |
| | b14dbde77b | |
| | cd6509b724 | |
| | 93d99ed2eb | |
| | 6215e3b2dd | |
| | be5d8da8c0 | |
| | 21ed181835 | |
| | 9ec2749ab7 | |
| | f3441a6790 | |
| | cc42d38284 | |
| | 6827ceba12 | |
| | 94458807ce | |
| | be5e7e446b | |
| | 571b2b320e | |
| | c7b5d00f19 | |
| | 1049f7bf38 | |
| | cc3bfac741 | |
| | e71eec07de | |
| | b557ec20a7 | |
| | 85457cb839 | |
| | 850219f60c | |
| | d929fb75b5 | |
| | 403c2174a1 | |
| | 528012f35f | |
| | 0dfa6f4a8a | |
| | a0d3741fac | |
| | 149b800c9f | |
| | 03f458a45c | |
| | e77198bb64 | |
| | 5b8f6cf4c7 | |
| | 8868ff19f3 | |
| | 066011b83d | |
| | b1d003d850 | |
| | 8991ec2bf5 | |
| | dfc153c228 | |
| | 52e8fb43ee | |
| | 99577b719a | |
| | 4092db60b2 | |
| | 4f333dee05 | |
| | 8a2f74da2c | |
| | 2bf155131d | |
| | 0f29f5717a | |
| | 5ea721683b | |
| | f84ef44992 | |
| | 1e0c57c6c5 | |
| | 65660c77c7 | |
| | c9b415475f | |
| | 359bcf691f | |
| | b3ce5ab4f6 | |
| | e89fe03da7 | |
| | 417b796169 | |
| | 36cbc07602 | |
| | 1f2fad22ee | |
| | ca5f6f50a8 | |
| | a7ff1387c7 | |
| | abbb4a79c8 | |
| | 1529095c32 | |
| | 8215a7ba34 | |
| | c1c4d85958 | |
| | 88b084802f | |
| | bf1dcea9d1 | |
| | 5c270b89d5 | |
| | 6ee0149e8d | |
| | 9c4cd869fb | |
| | afd66ce3b7 | |
| | f9bdb1b35d | |
| | d135ce94af | |
| | fb94aa0095 | |
| | b71e8575e5 | |
| | bbc028e84c | |
| | 016306adf5 | |
| | 712c009790 | |
| | 79396b4a26 | |
| | 5b2905b702 | |
| | 776af3fce6 | |
| | 4639685770 | |
| | 98a7d69341 | |
| | ab6c955949 | |
| | a72bb9321e | |
| | 0e2949ea59 | |
| | fb73935dbc | |
| | 94dfeb1441 | |
| | 9c1bcb23af | |
| | a1ea373f34 | |
| | e8f3d64e71 | |
| | b30faaca82 | |
| | 51701fc8dc | |
| | 7b89204afd | |
| | 82d02ef404 | |
| | b7546fd221 | |
| | 86c95539ca | |
| | 706a4b761b | |
| | c5af2fa778 | |
| | 86489cc453 | |
| | 2631fb4ff3 | |
| | aee86f2544 | |
| | 83c8a8ad78 | |
| | daae196048 | |
| | 70efcb09a0 | |
| | a7e74acdff | |
| | 9c3d3bcec1 | |
| | 521fb97fe9 | |
| | d8a3969048 | |
| | 01c0db651a | |
| | 7bcd7aca2b | |
| | 920d0043b4 | |
| | 93b9a6cd97 | |
| | e7cefba67f | |
| | 683476cc96 | |
| | ad24df0ed7 | |
| | a1622c0b94 | |
| | a385d20810 | |
| | 7f9044c1ef | |
| | d0618e3cb4 | |
| | a5bebf76d5 | |
| | 814b2e7e62 | |
| | 1704f73504 | |
| | c735fc8544 | |
| | c5ea6040bf | |
| | d4f2fa547a | |
| | 20896bfbab | |
| | 70bb77280e | |
| | 916f88a070 | |
| | 91f2f3ec10 | |
| | 3e5124e184 | |
| | fa86269e30 | |
| | d0b92bbeba | |
| | 9470815f5a | |
| | 592d918394 | |
| | 0cddc67bdb | |
| | 052fe2435f | |
| | 28819bf5d3 | |
| | 07c2ba5c0d | |
| | 50f40f427b | |
| | bf6de55865 | |
| | 5ded99a886 | |
| | 7145fa364f | |
| | f8e25688c7 | |
| | c3f455a8ec | |
| | f78e842fba | |
| | 3638b80c9d | |
| | 2ed9e65530 | |
| | 6cab363c51 | |
| | 8aee24e83f | |
| | cdcc1b42fc | |
| | 51ac0c623e | |
| | 3f0bde1bf8 | |
| | fa1a15dce8 | |
| | 5850844297 | |
| | ff02939213 | |
| | d61e8caab2 | |
| | 0b0c83e9da | |
| | 5e7c0f264d | |
| | edf1189fe0 | |
| | de24bfe052 | |
| | e235b96894 | |
| | dea4b9fd93 | |
| | 557e2cec6a | |
| | 0e3229f16d | |
| | 448d25aef4 | |
| | 343c9e0034 | |
| | 1dc27c5161 | |
| | 14aefa4b11 | |
@@ -0,0 +1,38 @@
# Docs TODOs

## Auto-sync README code examples with source

The `docs/README.rst` has inline code blocks that
duplicate actual example files (e.g.
`examples/infected_asyncio_echo_server.py`). Every time
the public API changes we have to manually sync both.

Sphinx's `literalinclude` directive can pull code directly
from source files:

```rst
.. literalinclude:: ../examples/infected_asyncio_echo_server.py
   :language: python
   :caption: examples/infected_asyncio_echo_server.py
```

Or to include only a specific function/section:

```rst
.. literalinclude:: ../examples/infected_asyncio_echo_server.py
   :language: python
   :pyobject: aio_echo_server
```

This way the docs always reflect the actual code without
manual syncing.

### Considerations
- `README.rst` is also rendered on GitHub/PyPI which do
  NOT support `literalinclude`, so we'd need a build
  step or a separate `_sphinx_readme.rst` (which already
  exists at `docs/github_readme/_sphinx_readme.rst`).
- Could use a pre-commit hook or CI step to extract code
  from examples into the README for GitHub rendering (see
  the sketch below).
- Another option: a `sphinx-autodoc` style approach where
  docstrings from the actual module are pulled in.
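As a rough sketch of that pre-commit/CI extraction idea, everything
here is hypothetical: the marker-comment format, the helper name,
and the splice logic are assumptions, not existing tooling:

```python
# Hypothetical pre-commit helper: splice an example file into
# docs/README.rst between marker comments so the GitHub/PyPI
# rendering stays in sync. The marker format is an assumption;
# the markers must already exist in the README.
from pathlib import Path

START = '.. example-include-start: {name}'
END = '.. example-include-end: {name}'


def splice_example(readme: Path, example: Path) -> None:
    text = readme.read_text()
    start = START.format(name=example.name)
    end = END.format(name=example.name)
    if start not in text or end not in text:
        raise RuntimeError(f'missing markers for {example.name}')

    head, _, rest = text.partition(start)
    _, _, tail = rest.partition(end)

    # indent the example source as an rst literal block
    code = '\n'.join(
        f'    {line}' for line in example.read_text().splitlines()
    )
    block = f'{start}\n\n.. code:: python\n\n{code}\n\n{end}'
    readme.write_text(head + block + tail)


if __name__ == '__main__':
    splice_example(
        Path('docs/README.rst'),
        Path('examples/infected_asyncio_echo_server.py'),
    )
```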
@@ -0,0 +1,125 @@
# `RuntimeVars` env-var lift — design plan

Status: **draft, awaiting user edits**

## Goal

Consolidate the sprawl of pytest CLI flags + ad-hoc env vars +
hardcoded fixture defaults into a *single* env-var-encoded
runtime-vars envelope, with a typed in-memory representation
(`tractor.runtime._state.RuntimeVars`) as the sole source of
truth.

## Why now

- `--tpt-proto`, `--spawn-backend`, `--diag-on-hang`,
  `--diag-capture-delay` and (soon) `TRACTOR_REG_ADDR` etc. are
  proliferating. Each adds a parsing seam.
- `tests/devx/test_debugger.py` invokes example scripts as
  separate subprocesses; they currently can't see the
  fixture-allocated `reg_addr` at all (root cause of why
  parametrizing devx scripts on `reg_addr` is on your TODO).
- Concurrent pytest sessions on the same host collide on
  shared defaults (the `registry@1616` race we just fixed is
  one symptom; a per-session unique addr is the structural
  fix).
- `tractor.runtime._state.RuntimeVars: Struct` is already
  defined and **unused** — its docstring even says it
  "should be utilized as possible for future calls."

## Design

### Module: `tractor/_testing/_rtvars.py`

Lifted from `modden.runtime.env`, ~50 LOC, no new deps.

```python
_TRACTOR_RT_VARS_OSENV: str = '_TRACTOR_RT_VARS'

def dump_rtvars(rtvars: RuntimeVars|dict) -> tuple[str, str]:
    '''str-serialize via `str(dict)` — ast.literal_eval-able'''

def load_rtvars(env: dict) -> RuntimeVars:
    '''ast.literal_eval the env-var value, hydrate to struct'''

def get_rtvars(proc: psutil.Process|None = None) -> RuntimeVars:
    '''read the var from a target proc's env (or current)'''

def update_rtvars(
    rtvars: RuntimeVars|dict|None = None,
    update_osenv: bool|dict = True,
) -> tuple[str, str]:
    '''mutate + re-encode + (optionally) write to os.environ'''
```

### Encoding choice: `str(dict)` + `ast.literal_eval`

Pros:
- stdlib only
- handles all the types tractor's tests need: `str`, `int`,
  `float`, `bool`, `None`, `list`, `tuple`, `dict`
- human-readable in the env (greppable, inspectable via
  `cat /proc/<pid>/environ | tr '\0' '\n'`)

Cons:
- non-stdlib types (msgspec Structs, `Path`, custom classes)
  must be lowered first — fine for the test fixture set
- not stable across Python versions for esoteric repr cases
  (we don't hit any)

Alternatives considered:
- **msgpack**: adds a dep + the binary form is ungreppable
- **json**: doesn't preserve tuples (they become lists), which is
  a common type for `reg_addr`
- **toml/yaml**: heavier deps, no real benefit
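To make the encoding choice concrete, a minimal round-trip sketch
(a plain dict stands in for the struct here; struct hydration is
assumed to live in `load_rtvars()`):

```python
# Minimal round-trip demo of the chosen encoding; the env-var
# name matches the plan, the payload is illustrative.
import ast
import os

_TRACTOR_RT_VARS_OSENV = '_TRACTOR_RT_VARS'

rtvars: dict = {
    '_registry_addrs': [('127.0.0.1', 61616)],  # tuples survive!
    'tpt_proto': 'tcp',
    'debug_mode': False,
}

# dump: `str(dict)` is `ast.literal_eval()`-able and greppable
os.environ[_TRACTOR_RT_VARS_OSENV] = str(rtvars)

# load (e.g. in a subprocess that inherited the env)
loaded = ast.literal_eval(os.environ[_TRACTOR_RT_VARS_OSENV])
assert loaded == rtvars
assert isinstance(loaded['_registry_addrs'][0], tuple)
```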
### `RuntimeVars` becomes the single source of truth

The legacy `_runtime_vars: dict[str, Any]` global in
`runtime/_state.py` becomes a *cached view* of a
`RuntimeVars` singleton instance:

- `get_runtime_vars()` returns either the struct or a
  `.to_dict()` view depending on the caller's preference
- `set_runtime_vars(...)` validates against the struct schema
- the spawn-time `SpawnSpec` sends the struct (it already does
  conceptually — it just gets typed)
- the `__setattr__` `breakpoint()` debug instrumentation gets
  removed (unrelated cleanup, mentioned in conversation)

### Migration path

**Phase 0** *(prep)*: strip the stray `breakpoint()` from
`RuntimeVars.__setattr__`.

**Phase 1**: land `_rtvars.py` as a leaf module, used only by
test infra. Subprocess-spawned scripts in `tests/devx/`
read `_TRACTOR_RT_VARS` on startup → reconstruct
`RuntimeVars` → call `tractor.open_root_actor(**rtvars.as_kwargs())`
(a startup sketch follows). Concurrent runs become
deterministically isolated because each session writes a
unique `_registry_addrs` into the env.
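A minimal sketch of that Phase 1 consumer side; note that
`load_rtvars()` and `.as_kwargs()` are the *planned* APIs from
this doc, neither exists yet:

```python
# Hypothetical startup shim for a `tests/devx/` example script;
# `load_rtvars()` and `.as_kwargs()` are planned, not-yet-existing
# names from the `_rtvars.py` design above.
import os

import trio
import tractor
from tractor._testing._rtvars import load_rtvars  # planned module


async def main() -> None:
    # reconstruct the session's runtime vars from the inherited env
    rtvars = load_rtvars(os.environ)
    async with tractor.open_root_actor(**rtvars.as_kwargs()):
        ...  # the actual example body


if __name__ == '__main__':
    trio.run(main)
```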
**Phase 2**: migrate runtime callers (`_state.get_runtime_vars`,
the spawn `SpawnSpec`, `Actor.async_main`) to operate on the
struct directly, with the dict as a compat view that gets
deprecated.

**Phase 3** *(structural)*: per-session bindspace subdir
`/run/user/<uid>/tractor/<session_uuid>/` — encoded in the
rt-vars envelope, picked up by every subactor automatically.
Obsoletes the entire bindspace-leak warning class.

## Open design questions (user input wanted)

- (placeholder for your edits)
- (placeholder)
- (placeholder)

## Out-of-scope for this lift

- Anything in `modden.runtime.env` related to `Spawn`,
  `WmCtl`, `Wks` — that's a workspace orchestration layer,
  not an env-var helper. We only lift the four utility
  functions + the var name constant.
- Switching to msgpack/json — explicitly chosen against
  above.
@@ -0,0 +1,42 @@
{
  "permissions": {
    "allow": [
      "Bash(cp .claude/*)",
      "Read(.claude/**)",
      "Read(.claude/skills/run-tests/**)",
      "Write(.claude/**/*commit_msg*)",
      "Write(.claude/git_commit_msg_LATEST.md)",
      "Skill(run-tests)",
      "Skill(close-wkt)",
      "Skill(open-wkt)",
      "Skill(prompt-io)",
      "Bash(date *)",
      "Bash(git diff *)",
      "Bash(git log *)",
      "Bash(git status)",
      "Bash(git remote:*)",
      "Bash(git stash:*)",
      "Bash(git mv:*)",
      "Bash(git rev-parse:*)",
      "Bash(test:*)",
      "Bash(ls:*)",
      "Bash(grep:*)",
      "Bash(find:*)",
      "Bash(ln:*)",
      "Bash(cat:*)",
      "Bash(mkdir:*)",
      "Bash(gh pr:*)",
      "Bash(gh api:*)",
      "Bash(gh issue:*)",
      "Bash(UV_PROJECT_ENVIRONMENT=py* uv sync:*)",
      "Bash(UV_PROJECT_ENVIRONMENT=py* uv run:*)",
      "Bash(echo EXIT:$?:*)",
      "Bash(echo \"EXIT=$?\")",
      "Read(//tmp/**)"
    ],
    "deny": [],
    "ask": []
  },
  "prefersReducedMotion": false,
  "outputStyle": "default"
}
@@ -0,0 +1,225 @@
# Commit Message Style Guide for `tractor`

Analysis based on 500 recent commits from the `tractor` repository.

## Core Principles

Write commit messages that are technically precise yet casual in
tone. Use abbreviations and informal language while maintaining
clarity about what changed and why.

## Subject Line Format

### Length and Structure
- Target: ~50 chars with a hard max of 67.
- Use backticks around code elements (72.2% of commits)
- Rarely use colons (5.2%), except for file prefixes
- End with '?' for uncertain changes (rare: 0.8%)
- End with '!' for important changes (rare: 2.0%)

### Opening Verbs (Present Tense)

Most common verbs from the analysis:
- `Add` (14.4%) - wholly new features/functionality
- `Use` (4.4%) - adopt a new approach/tool
- `Drop` (3.6%) - remove code/feature
- `Fix` (2.4%) - bug fixes
- `Move`/`Mv` (3.6%) - relocate code
- `Adjust` (2.0%) - minor tweaks
- `Update` (1.6%) - enhance an existing feature
- `Bump` (1.2%) - dependency updates
- `Rename` (1.2%) - identifier changes
- `Set` (1.2%) - configuration changes
- `Handle` (1.0%) - add handling logic
- `Raise` (1.0%) - add error raising
- `Pass` (0.8%) - pass parameters/values
- `Support` (0.8%) - add support for something
- `Hide` (1.4%) - make private/internal
- `Always` (1.4%) - enforce consistent behavior
- `Mk` (1.4%) - make/create (abbreviated)
- `Start` (1.0%) - begin implementation

Other frequent verbs: `More`, `Change`, `Extend`, `Disable`, `Log`,
`Enable`, `Ensure`, `Expose`, `Allow`

### Backtick Usage

Always use backticks for:
- Module names: `trio`, `asyncio`, `msgspec`, `greenback`, `stackscope`
- Class names: `Context`, `Actor`, `Address`, `PldRx`, `SpawnSpec`
- Method names: `.pause_from_sync()`, `._pause()`, `.cancel()`
- Function names: `breakpoint()`, `collapse_eg()`, `open_root_actor()`
- Decorators: `@acm`, `@context`
- Exceptions: `Cancelled`, `TransportClosed`, `MsgTypeError`
- Keywords: `finally`, `None`, `False`
- Variable names: `tn`, `debug_mode`
- Dotted expressions: `trio.Cancelled`, `asyncio.Task`

Most backticked terms in tractor:
`trio`, `asyncio`, `Context`, `.pause_from_sync()`, `tn`,
`._pause()`, `breakpoint()`, `collapse_eg()`, `Actor`, `@acm`,
`.cancel()`, `Cancelled`, `open_root_actor()`, `greenback`

### Examples

Good subject lines:
```
Add `uds` to `._multiaddr`, tweak typing
Drop `DebugStatus.shield` attr, add `.req_finished`
Use `stackscope` for all actor-tree rendered "views"
Fix `.to_asyncio` inter-task-cancellation!
Bump `ruff.toml` to target py313
Mv `load_module_from_path()` to new `._code_load` submod
Always use `tuple`-cast for singleton parent addrs
```

## Body Format

### General Structure
- 43.2% of commits have no body (simple changes)
- Use a blank line after the subject
- Max line length: 67 chars
- Use `-` bullets for lists (28.0% of commits)
- Rarely use `*` bullets (2.4%)

### Section Markers

Use these markers to organize longer commit bodies:
- `Also,` (most common: 26 occurrences)
- `Other,` (13 occurrences)
- `Deats,` (11 occurrences) - for implementation details
- `Further,` (7 occurrences)
- `TODO,` (3 occurrences)
- `Impl details,` (2 occurrences)
- `Notes,` (1 occurrence)

### Common Abbreviations

Use these freely (sorted by frequency):
- `msg` (63) - message
- `bg` (37) - background
- `ctx` (30) - context
- `impl` (27) - implementation
- `mod` (26) - module
- `obvi` (17) - obviously
- `tn` (16) - task name
- `fn` (15) - function
- `vs` (15) - versus
- `bc` (14) - because
- `var` (14) - variable
- `prolly` (9) - probably
- `ep` (6) - entry point
- `OW` (5) - otherwise
- `rn` (4) - right now
- `sig` (4) - signal/signature
- `deps` (3) - dependencies
- `iface` (2) - interface
- `subproc` (2) - subprocess
- `tho` (2) - though
- `ofc` (2) - of course

### Tone and Style

- Casual but technical (use `XD` for humor: 23 times)
- Use `..` for trailing thoughts (108 occurrences)
- Use `Woops,` to acknowledge mistakes (4 subject lines)
- Don't be afraid to show personality while being precise

### Example Bodies

Simple with bullets:
```
Add `multiaddr` and bump up some deps

Since we're planning to use it for (discovery)
addressing, allowing replacement of the hacky (pretend)
attempt in `tractor._multiaddr` Bp

Also pin some deps,
- make us py312+
- use `pdbp` with my frame indexing fix.
- mv to latest `xonsh` for fancy cmd/suggestion injections.

Bump lock file to match obvi!
```

With section markers:
```
Use `stackscope` for all actor-tree rendered "views"

Instead of the (much more) limited and hacky `.devx._code`
impls, move to using the new `.devx._stackscope` API which
wraps the `stackscope` project.

Deats,
- make new `stackscope.extract_stack()` wrapper
- port over frame-descing to `_stackscope.pformat_stack()`
- move `PdbREPL` to use `stackscope` render approach
- update tests for new stack output format

Also,
- tweak log formatting for consistency
- add typing hints throughout
```

## Special Patterns

### WIP Commits
Rare (0.2%) - avoid committing WIP if possible

### Merge Commits
Auto-generated (4.4%), don't worry about style

### File References
- Use `module.py` or `.submodule` style
- Don't use `file.py:line` references (0 in the analysis)

### Links
- GitHub links used sparingly (3 total)
- Prefer code references over external links

## Footer

The default footer should credit `claude` (you) for helping generate
the commit msg content:

```
(this commit msg was generated in some part by [`claude-code`][claude-code-gh])
[claude-code-gh]: https://github.com/anthropics/claude-code
```

Further, if the patch was solely or in part written
by `claude`, instead add:

```
(this patch was generated in some part by [`claude-code`][claude-code-gh])
[claude-code-gh]: https://github.com/anthropics/claude-code
```

## Summary Checklist

Before committing, verify:
- [ ] Subject line uses a present-tense verb
- [ ] Subject line ~50 chars (hard max 67)
- [ ] Code elements wrapped in backticks
- [ ] Body lines ≤67 chars
- [ ] Abbreviations used where natural
- [ ] Casual yet precise tone
- [ ] Section markers if body >3 paragraphs
- [ ] Technical accuracy maintained
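The mechanical rules in that checklist are easy to lint; a minimal
self-check sketch (hypothetical helper, not part of the repo):

```python
# Hypothetical lint helper for the checklist's mechanical rules:
# subject hard max 67 chars, a blank line after the subject, and
# body lines wrapped at 67 chars. Tone/verb checks stay human.
import sys


def lint_commit_msg(msg: str) -> list[str]:
    problems: list[str] = []
    lines = msg.splitlines()
    if not lines:
        return ['empty message']
    subject = lines[0]
    if len(subject) > 67:
        problems.append(f'subject is {len(subject)} chars (max 67)')
    if len(lines) > 1 and lines[1].strip():
        problems.append('missing blank line after subject')
    for i, line in enumerate(lines[2:], start=3):
        if len(line) > 67:
            problems.append(f'body line {i} is {len(line)} chars (max 67)')
    return problems


if __name__ == '__main__':
    for p in lint_commit_msg(sys.stdin.read()):
        print(p)
```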
## Analysis Metadata

```
Source: tractor repository
Commits analyzed: 500
Date range: 2019-2025
Analysis date: 2026-02-08
```

---

(this style guide was generated by [`claude-code`][claude-code-gh]
analyzing commit history)

[claude-code-gh]: https://github.com/anthropics/claude-code
@@ -0,0 +1,297 @@
---
name: conc-anal
description: >
  Concurrency analysis for tractor's trio-based
  async primitives. Trace task scheduling across
  checkpoint boundaries, identify race windows in
  shared mutable state, and verify synchronization
  correctness. Invoke on code segments the user
  points at, OR proactively when reviewing/writing
  concurrent cache, lock, or multi-task acm code.
argument-hint: "[file:line-range or function name]"
allowed-tools:
  - Read
  - Grep
  - Glob
  - Task
---

Perform a structured concurrency analysis on the
target code. This skill should be invoked:

- **On demand**: the user points at a code segment
  (file:lines, a function name, or pastes a snippet)
- **Proactively**: when writing or reviewing code
  that touches shared mutable state across trio
  tasks — especially `_Cache`, locks, events, or
  multi-task `@acm` lifecycle management

## 0. Identify the target

If the user provides a file:line-range or function
name, read that code. If not explicitly provided,
identify the relevant concurrent code from context
(e.g. the current diff, a failing test, or the
function under discussion).

## 1. Inventory shared mutable state

List every piece of state that is accessed by
multiple tasks. For each, note:

- **What**: the variable/dict/attr (e.g.
  `_Cache.values`, `_Cache.resources`,
  `_Cache.users`)
- **Scope**: class-level, module-level, or
  closure-captured
- **Writers**: which tasks/code-paths mutate it
- **Readers**: which tasks/code-paths read it
- **Guarded by**: which lock/event/ordering
  protects it (or "UNGUARDED" if none)

Format as a table:

```
| State            | Writers       | Readers      | Guard        |
|------------------|---------------|--------------|--------------|
| _Cache.values    | run_ctx, moc¹ | moc          | ctx_key lock |
| _Cache.resources | run_ctx, moc  | moc, run_ctx | UNGUARDED    |
```

¹ `moc` = `maybe_open_context`

## 2. Map checkpoint boundaries

For each code path through the target, mark every
**checkpoint** — any `await` expression where trio
can switch to another task. Use line numbers:

```
L325: await lock.acquire()        ← CHECKPOINT
L395: await service_tn.start(...) ← CHECKPOINT
L411: lock.release()              ← (not a checkpoint, but changes lock state)
L414: yield (False, yielded)      ← SUSPEND (caller runs)
L485: no_more_users.set()         ← (wakes run_ctx, no switch yet)
```

**Key trio scheduling rules to apply:**
- `Event.set()` makes waiters *ready* but does NOT
  switch immediately
- `lock.release()` is not a checkpoint
- `await sleep(0)` IS a checkpoint
- Code in `finally` blocks CAN have checkpoints
  (unlike asyncio)
- `await` inside `except` blocks can be
  `trio.Cancelled`-masked

## 3. Trace concurrent task schedules

Write out the **interleaved execution trace** for
the problematic scenario. Number each step and tag
which task executes it:

```
[Task A]   1. acquires lock
[Task A]   2. cache miss → allocates resources
[Task A]   3. releases lock
[Task A]   4. yields to caller
[Task A]   5. caller exits → finally runs
[Task A]   6. users-- → 0, sets no_more_users
[Task A]   7. pops lock from _Cache.locks
[run_ctx]  8. wakes from no_more_users.wait()
[run_ctx]  9. values.pop(ctx_key)
[run_ctx] 10. acm __aexit__ → CHECKPOINT
[Task B]  11. creates NEW lock (old one popped)
[Task B]  12. acquires immediately
[Task B]  13. values[ctx_key] → KeyError
[Task B]  14. resources[ctx_key] → STILL EXISTS
[Task B]  15. 💥 RuntimeError
```

Identify the **race window**: the range of steps
where state is inconsistent. In the example above,
steps 9–10 are the window (values gone, resources
still alive).

## 4. Classify the bug

Categorize what kind of concurrency issue this is:

- **TOCTOU** (time-of-check-to-time-of-use): state
  changes between a check and the action based on it
- **Stale reference**: a task holds a reference to
  state that another task has invalidated
- **Lifetime mismatch**: a synchronization primitive
  (lock, event) has a shorter lifetime than the
  state it's supposed to protect
- **Missing guard**: shared state is accessed
  without any synchronization
- **Atomicity gap**: two operations that should be
  atomic have a checkpoint between them

## 5. Propose fixes

For each proposed fix, provide:

- **Sketch**: pseudocode or a diff showing the change
- **How it closes the window**: which step(s) from
  the trace it eliminates or reorders
- **Tradeoffs**: complexity, perf, new edge cases,
  impact on other code paths
- **Risk**: what could go wrong (deadlocks, new
  races, cancellation issues)

Rate each fix's impl effort: `[simple|moderate|complex]`.

## 6. Output format

Structure the full analysis as:

```markdown
## Concurrency analysis: `<target>`

### Shared state
<table from step 1>

### Checkpoints
<list from step 2>

### Race trace
<interleaved trace from step 3>

### Classification
<bug type from step 4>

### Fixes
<proposals from step 5>
```

## Tractor-specific patterns to watch

These are known problem areas in tractor's
concurrency model. Flag them when encountered:

### `_Cache` lock vs `run_ctx` lifetime

The `_Cache.locks` entry is managed by
`maybe_open_context` callers, but `run_ctx` runs
in `service_tn` — a different task tree. Lock
pop/release in the caller's `finally` does NOT
wait for `run_ctx` to finish tearing down. Any
state that `run_ctx` cleans up in its `finally`
(e.g. `resources.pop()`) is vulnerable to
re-entry races after the lock is popped.

### `values.pop()` → acm `__aexit__` → `resources.pop()` gap

In `_Cache.run_ctx`, the inner `finally` pops
`values`, then the acm's `__aexit__` runs (which
has checkpoints), then the outer `finally` pops
`resources`. This creates a window where `values`
is gone but `resources` still exists — a classic
atomicity gap.

### Global vs per-key counters

`_Cache.users` as a single `int` (pre-fix) meant
that users of different `ctx_key`s inflated each
other's counts, preventing teardown when one key's
users hit zero. Always verify that per-key state
(`users`, `locks`) is actually keyed on `ctx_key`
and not on `fid` or some broader key.

### `Event.set()` wakes but doesn't switch

`trio.Event.set()` makes waiting tasks *ready* but
the current task continues executing until its next
checkpoint. Code between `.set()` and the next
`await` runs atomically from the scheduler's
perspective. Use this to your advantage (or watch
for bugs where code assumes the woken task runs
immediately). A small demo follows.
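A minimal runnable demo of the wake-but-don't-switch rule (the
names and the `sleep(0)` choreography are illustrative):

```python
# Demo: `Event.set()` marks the waiter ready, but the setter
# keeps running until its next checkpoint, so `flag` is already
# True by the time the waiter resumes.
import trio


async def main() -> None:
    ev = trio.Event()
    flag = False

    async def waiter() -> None:
        await ev.wait()
        # only resumes after the setter's next checkpoint
        assert flag, 'waiter resumed before setter finished!'

    async with trio.open_nursery() as tn:
        tn.start_soon(waiter)
        await trio.sleep(0)  # let the waiter park on the event
        ev.set()
        flag = True          # still atomic wrt the scheduler
        await trio.sleep(0)  # first checkpoint after .set()


trio.run(main)
```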
### `except` block checkpoint masking

`await` expressions inside `except` handlers can
be masked by `trio.Cancelled`. If a `finally`
block runs from an `except` and contains
`lock.release()`, the release happens — but any
`await` after it in the same `except` may be
swallowed. This is why `maybe_open_context`'s
cache-miss path does `lock.release()` in a
`finally` inside the `except KeyError`.

### Cancellation in `finally`

Unlike asyncio, trio allows checkpoints in
`finally` blocks. This means `finally` cleanup
that does `await` can itself be cancelled (e.g.
by nursery shutdown). Watch for cleanup code that
assumes it will run to completion.
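One common mitigation, shown here as a sketch rather than a
blanket recommendation: shield must-complete cleanup inside a
*bounded* cancel scope so it can neither be cancelled
mid-teardown nor hang forever:

```python
# Sketch: cleanup that must survive cancellation gets a shielded
# scope, bounded so it can't stall teardown indefinitely.
import trio


async def do_cleanup() -> None:
    ...  # some await-ing teardown work


async def task_body() -> None:
    try:
        await trio.sleep_forever()
    finally:
        # without the shield, the first `await` inside
        # `do_cleanup()` would immediately re-raise Cancelled
        with trio.move_on_after(3) as cs:
            cs.shield = True
            await do_cleanup()
```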
### Unbounded waits in cleanup paths

Any `await <event>.wait()` in a teardown path is
a latent deadlock unless the event's setter is
GUARANTEED to fire. If the setter depends on
external state (peer disconnects, child process
exit, subsequent task completion) that itself
depends on the current task's progress, you have
a mutual wait.

Rule: **bound every `await X.wait()` in cleanup
paths with `trio.move_on_after()`** unless you
can prove the setter is unconditionally reachable
from the state at the await site (see the sketch
below). A concrete recent
example: `ipc_server.wait_for_no_more_peers()` in
`async_main`'s `finally` (see
`ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md`
"probe iteration 3") — it was unbounded, and when
one peer-handler was stuck the wait-for-no-more-
peers event never fired, deadlocking the whole
actor-tree teardown cascade.
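The bounded-wait rule as a minimal sketch (the timeout value and
the fallback print are illustrative):

```python
# Sketch: bound a teardown-path event wait so a setter that never
# fires degrades to a logged timeout instead of a tree-wide
# deadlock.
import trio


async def wait_no_more_peers(
    no_peers: trio.Event,
    timeout: float = 6,
) -> None:
    with trio.move_on_after(timeout) as cs:
        await no_peers.wait()
    if cs.cancelled_caught:
        # illustrative: surface the leak instead of hanging
        print(
            f'peers still connected after {timeout}s, '
            'tearing down anyway'
        )
```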
### The capture-pipe-fill hang pattern (grep this first)

When investigating any hang in the test suite,
**especially under fork-based backends**, first
check whether the hang reproduces under `pytest
-s` (`--capture=no`). If `-s` makes it go away
you're not looking at a trio concurrency bug —
you're looking at a Linux pipe-buffer fill.

Mechanism: pytest replaces fds 1,2 with pipe
write-ends. Fork-child subactors inherit those
fds. High-volume error-log tracebacks (cancel
cascade spew) fill the 64KB pipe buffer. The
child's `write()` blocks. The child can't exit.
The parent's `waitpid`/pidfd wait blocks. The
deadlock cascades up the tree.

Pre-existing guards in `tests/conftest.py` encode
this knowledge — grep these BEFORE blaming
concurrency:

```python
# tests/conftest.py:258
if loglevel in ('trace', 'debug'):
    # XXX: too much logging will lock up the subproc (smh)
    loglevel: str = 'info'

# tests/conftest.py:316
# can lock up on the `_io.BufferedReader` and hang..
stderr: str = proc.stderr.read().decode()
```

See the full post-mortem in
`ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md`
for the canonical reproduction. It cost several
investigation sessions before we caught it,
because the capture-pipe symptom was masked by
deeper cascade-deadlocks. Once the cascades were
fixed, the tree tore down enough to generate
pipe-filling log volume → the capture-pipe issue
finally surfaced. Grep-note for future-self: **if
a multi-subproc tractor test hangs, `pytest -s`
first, conc-anal second.**
@@ -0,0 +1,241 @@
# PR/Patch-Request Description Format Reference

Canonical structure for `tractor` patch-request
descriptions, designed to work across GitHub,
Gitea, SourceHut, and GitLab markdown renderers.

**Line length: wrap at 72 chars** for all prose
content (Summary bullets, Motivation paragraphs,
Scopes bullets, etc.). Fill lines *to* 72 — don't
stop short at 50-65. Only raw URLs in
reference-link definitions may exceed this.

## Template

```markdown
<!-- pr-msg-meta
branch: <branch-name>
base: <base-branch>
submitted:
  github: ___
  gitea: ___
  srht: ___
-->

## <Title: present-tense verb + backticked code>

### Summary
- [<hash>][<hash>] Description of change ending
  with period.
- [<hash>][<hash>] Another change description
  ending with period.
- [<hash>][<hash>] [<hash>][<hash>] Multi-commit
  change description.

### Motivation
<1-2 paragraphs: problem/limitation first,
then solution. Hard-wrap at 72 chars.>

### Scopes changed
- [<hash>][<hash>] `pkg.mod.func()` — what
  changed.
  * [<hash>][<hash>] Also adjusts
    `.related_thing()` in same module.
- [<hash>][<hash>] `tests.test_mod` — new/changed
  test coverage.

<!--
### Cross-references
Also submitted as
[github-pr][] | [gitea-pr][] | [srht-patch][].

### Links
- [relevant-issue-or-discussion](url)
- [design-doc-or-screenshot](url)
-->

(this pr content was generated in some part by
[`claude-code`][claude-code-gh])

[<hash>]: https://<service>/<owner>/<repo>/commit/<hash>
[claude-code-gh]: https://github.com/anthropics/claude-code

<!-- cross-service pr refs (fill after submit):
[github-pr]: https://github.com/<owner>/<repo>/pull/___
[gitea-pr]: https://<host>/<owner>/<repo>/pulls/___
[srht-patch]: https://git.sr.ht/~<owner>/<repo>/patches/___
-->
```

## Markdown Reference-Link Strategy

Use reference-style links for ALL commit hashes
and cross-service PR refs to ensure cross-service
compatibility:

**Inline usage** (in bullets):
```markdown
- [f3726cf9][f3726cf9] Add `reg_err_types()`
  for custom exc lookup.
```

**Definition** (bottom of document):
```markdown
[f3726cf9]: https://github.com/goodboy/tractor/commit/f3726cf9
```

### Why reference-style?
- Keeps prose readable without long inline URLs.
- All URLs live in one place — trivially swappable
  per-service.
- Most git services auto-link bare SHAs anyway,
  but explicit refs guarantee it works in *any*
  md renderer.
- The `[hash][hash]` form is self-documenting —
  the display text matches the ref ID.
- Cross-service PR refs use the same mechanism:
  `[github-pr][]` resolves via a ref-link def
  at the bottom, trivially fillable post-submit.

## Cross-Service PR Placeholder Mechanism

The generated description includes three layers
of cross-service support, all using native md
reference-links:

### 1. Metadata comment (top of file)

```markdown
<!-- pr-msg-meta
branch: remote_exc_type_registry
base: main
submitted:
  github: ___
  gitea: ___
  srht: ___
-->
```

A YAML-ish HTML comment block. The `___`
placeholders get filled with PR/patch numbers
after submission. Machine-parseable for tooling
(e.g. `gish`) but invisible in rendered md; a
parsing sketch follows.
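The parsing sketch mentioned above (a hypothetical tooling
helper; it assumes exactly the two-space-indented comment format
shown in the template):

```python
# Hypothetical helper: pull the `pr-msg-meta` comment block out
# of a pr-msg markdown file and parse its YAML-ish body into a
# nested dict. Assumes two-space indentation per nesting level.
import re

META_RE = re.compile(r'<!--\s*pr-msg-meta\n(.*?)-->', re.DOTALL)


def parse_pr_msg_meta(md: str) -> dict:
    m = META_RE.search(md)
    if not m:
        return {}
    meta: dict = {}
    stack: list[dict] = [meta]
    for line in m.group(1).splitlines():
        if not line.strip():
            continue
        key, _, val = line.partition(':')
        depth = (len(key) - len(key.lstrip())) // 2
        key, val = key.strip(), val.strip()
        if val:
            stack[depth][key] = val
        else:
            # a key with no value opens a nested mapping
            # (e.g. the `submitted:` block)
            child: dict = {}
            stack[depth][key] = child
            del stack[depth + 1:]
            stack.append(child)
    return meta
```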
### 2. Cross-references section (in body)

```markdown
<!--
### Cross-references
Also submitted as
[github-pr][] | [gitea-pr][] | [srht-patch][].
-->
```

Commented out at generation time. After submitting
to multiple services, uncomment and the ref-links
resolve via the stubs at the bottom.

### 3. Ref-link stubs (bottom of file)

```markdown
<!-- cross-service pr refs (fill after submit):
[github-pr]: https://github.com/goodboy/tractor/pull/___
[gitea-pr]: https://pikers.dev/goodboy/tractor/pulls/___
[srht-patch]: https://git.sr.ht/~goodboy/tractor/patches/___
-->
```

Commented out with `___` number placeholders.
After submission: uncomment, replace `___` with
the actual number. Each service-specific copy
fills in all services' numbers so any copy can
cross-reference the others.

### Post-submission file layout

```
pr_msg_LATEST.md    # latest draft (skill root)
msgs/
  20260325T002027Z_mybranch_pr_msg.md  # timestamped
  github/
    42_pr_msg.md    # github PR #42
  gitea/
    17_pr_msg.md    # gitea PR #17
  srht/
    5_pr_msg.md     # srht patch #5
```

Each `<service>/<num>_pr_msg.md` is a copy with:
- metadata `submitted:` fields filled in
- cross-references section uncommented
- ref-link stubs uncommented with real numbers
- all services cross-linked in each copy

This mirrors the `gish` skill's
`<backend>/<num>.md` pattern.

## Commit-Link URL Patterns by Service

| Service   | Pattern                                   |
|-----------|-------------------------------------------|
| GitHub    | `https://github.com/<o>/<r>/commit/<h>`   |
| Gitea     | `https://<host>/<o>/<r>/commit/<h>`       |
| SourceHut | `https://git.sr.ht/~<o>/<r>/commit/<h>`   |
| GitLab    | `https://gitlab.com/<o>/<r>/-/commit/<h>` |

## PR/Patch URL Patterns by Service

| Service   | Pattern                                            |
|-----------|----------------------------------------------------|
| GitHub    | `https://github.com/<o>/<r>/pull/<n>`              |
| Gitea     | `https://<host>/<o>/<r>/pulls/<n>`                 |
| SourceHut | `https://git.sr.ht/~<o>/<r>/patches/<n>`           |
| GitLab    | `https://gitlab.com/<o>/<r>/-/merge_requests/<n>`  |

## Scope Naming Convention

Use Python namespace-resolution syntax for
referencing changed code scopes:

| File path                | Scope reference              |
|--------------------------|------------------------------|
| `tractor/_exceptions.py` | `tractor._exceptions`        |
| `tractor/_state.py`      | `tractor._state`             |
| `tests/test_foo.py`      | `tests.test_foo`             |
| Function in module       | `tractor._exceptions.func()` |
| Method on class          | `.RemoteActorError.src_type` |
| Class                    | `tractor._exceptions.RAE`    |

Prefix with the package path for top-level refs;
use leading-dot shorthand (`.ClassName.method()`)
for sub-bullets where the parent module is already
established.

## Title Conventions

Same verb vocabulary as commit messages:
- `Add` — wholly new feature/API
- `Fix` — bug fix
- `Drop` — removal
- `Use` — adopt new approach
- `Move`/`Mv` — relocate code
- `Adjust` — minor tweak
- `Update` — enhance existing feature
- `Support` — add support for something

Target 50 chars, hard max 70. Always backtick
code elements.

## Tone

Casual yet technically precise — matching the
project's commit-msg style. Terse, but every bullet
carries signal. Use project abbreviations freely
(msg, bg, ctx, impl, mod, obvi, fn, bc, var,
prolly, ep, etc.).

---

(this format reference was generated by
[`claude-code`][claude-code-gh])

[claude-code-gh]: https://github.com/anthropics/claude-code
@@ -0,0 +1,625 @@
---
name: run-tests
description: >
  Run tractor test suite (or subsets). Use when the user wants
  to run tests, verify changes, or check for regressions.
argument-hint: "[test-path-or-pattern] [--opts]"
allowed-tools:
  - Bash(python -m pytest *)
  - Bash(python -c *)
  - Bash(python --version *)
  - Bash(UV_PROJECT_ENVIRONMENT=py* uv run python *)
  - Bash(UV_PROJECT_ENVIRONMENT=py* uv run pytest *)
  - Bash(UV_PROJECT_ENVIRONMENT=py* uv sync *)
  - Bash(UV_PROJECT_ENVIRONMENT=py* uv pip show *)
  - Bash(git rev-parse *)
  - Bash(ls *)
  - Bash(cat *)
  - Bash(jq * .pytest_cache/*)
  - Read
  - Grep
  - Glob
  - Task
  - AskUserQuestion
---

Run the `tractor` test suite using `pytest`. Follow this
process:

## 1. Parse user intent

From the user's message and any arguments, determine:

- **scope**: full suite, specific file(s), specific
  test(s), or a keyword pattern (`-k`).
- **transport**: which IPC transport protocol to test
  against (default: `tcp`, also: `uds`).
- **options**: any extra pytest flags the user wants
  (e.g. `--ll debug`, `--tpdb`, `-x`, `-v`).

If the user provides a bare path or pattern as an argument,
treat it as the test target. Examples:

- `/run-tests` → full suite
- `/run-tests test_local.py` → single file
- `/run-tests test_registrar -v` → file + verbose
- `/run-tests -k cancel` → keyword filter
- `/run-tests tests/ipc/ --tpt-proto uds` → subdir + UDS

## 2. Construct the pytest command

Base command:
```
python -m pytest
```

### Default flags (always include unless the user overrides):
- `-x` (stop on first failure)
- `--tb=short` (concise tracebacks)
- `--no-header` (reduce noise)

### Path resolution:
- If the user gives a bare filename like `test_local.py`,
  resolve it under `tests/`.
- If the user gives a subdirectory like `ipc/`, resolve it
  under `tests/ipc/`.
- Glob if needed: `tests/**/test_*<pattern>*.py`

### Key pytest options for this project:

| Flag | Purpose |
|---|---|
| `--ll <level>` | Set tractor log level (e.g. `debug`, `info`, `runtime`) |
| `--tpdb` / `--debug-mode` | Enable tractor's multi-proc debugger |
| `--tpt-proto <key>` | IPC transport: `tcp` (default) or `uds` |
| `--spawn-backend <be>` | Spawn method: `trio` (default), `mp_spawn`, `mp_forkserver` |
| `-k <expr>` | pytest keyword filter |
| `-v` / `-vv` | Verbosity |
| `-s` | No output capture (useful with `--tpdb`) |

### Common combos:
```sh
# quick smoke test of core modules
python -m pytest tests/test_local.py tests/test_rpc.py -x --tb=short --no-header

# full suite, stop on first failure
python -m pytest tests/ -x --tb=short --no-header

# specific test with debug
python -m pytest tests/discovery/test_registrar.py::test_reg_then_unreg -x -s --tpdb --ll debug

# run with UDS transport
python -m pytest tests/ -x --tb=short --no-header --tpt-proto uds

# keyword filter
python -m pytest tests/ -x --tb=short --no-header -k "cancel and not slow"
```

## 3. Pre-flight: venv detection (MANDATORY)

**Always verify a `uv` venv is active before running
`python` or `pytest`.** This project uses
`UV_PROJECT_ENVIRONMENT=py<MINOR>` naming (e.g.
`py313`) — never `.venv`.

### Step 1: detect active venv

Run this check first:

```sh
python -c "
import sys, os
venv = os.environ.get('VIRTUAL_ENV', '')
prefix = sys.prefix
print(f'VIRTUAL_ENV={venv}')
print(f'sys.prefix={prefix}')
print(f'executable={sys.executable}')
"
```

### Step 2: interpret results

**Case A — venv is active** (`VIRTUAL_ENV` is set
and points to a `py<MINOR>/` dir under the project
root or worktree):

Use bare `python` / `python -m pytest` for all
commands. This is the normal, fast path.

**Case B — no venv active** (`VIRTUAL_ENV` is empty
or `sys.prefix` points to a system Python):

Use `AskUserQuestion` to ask the user:

> "No uv venv is active. Should I activate one
> via `UV_PROJECT_ENVIRONMENT=py<MINOR> uv sync`,
> or would you prefer to activate your shell venv
> first?"

Options:
1. **"Create/sync venv"** — run
   `UV_PROJECT_ENVIRONMENT=py<MINOR> uv sync` where
   `<MINOR>` is detected from `python --version`
   (e.g. `313` for 3.13). Then use
   `py<MINOR>/bin/python` for all subsequent
   commands in this session.
2. **"I'll activate it myself"** — stop and let the
   user `source py<MINOR>/bin/activate` or similar.

**Case C — inside a git worktree** (`git rev-parse
--git-common-dir` differs from `--git-dir`):

Verify Python resolves from the **worktree's own
venv**, not the main repo's:

```sh
python -c "import tractor; print(tractor.__file__)"
```

If the path points outside the worktree, create a
worktree-local venv:

```sh
UV_PROJECT_ENVIRONMENT=py<MINOR> uv sync
```

Then use `py<MINOR>/bin/python` for all commands.

**Why this matters**: without the correct venv,
subprocesses spawned by tractor resolve modules
from the wrong editable install, causing spurious
`AttributeError` / `ModuleNotFoundError`.

### Fallback: `uv run`

If the user can't or won't activate a venv, all
`python` and `pytest` commands can be prefixed
with `UV_PROJECT_ENVIRONMENT=py<MINOR> uv run`:

```sh
# instead of: python -m pytest tests/ -x
UV_PROJECT_ENVIRONMENT=py313 uv run pytest tests/ -x

# instead of: python -c 'import tractor'
UV_PROJECT_ENVIRONMENT=py313 uv run python -c 'import tractor'
```

`uv run` auto-discovers the project and venv,
but is slower than a pre-activated venv due to
lock-file resolution on each invocation. Prefer
activating the venv when possible.

### Step 3: import + collection checks

After the venv is confirmed, always run these
(especially after refactors or module moves):

```sh
# 1. package import smoke check
python -c 'import tractor; print(tractor)'

# 2. verify all tests collect (no import errors)
python -m pytest tests/ -x -q --co 2>&1 | tail -5
```

If either fails, fix the import error before running
any actual tests.

### Step 4: zombie-actor / stale-registry check (MANDATORY)

The tractor runtime's default registry address is
**`127.0.0.1:1616`** (TCP) / `/tmp/registry@1616.sock`
(UDS). Whenever any prior test run — especially one
using a fork-based backend like `subint_forkserver` —
leaks a child actor process, that zombie keeps the
registry port bound and **every subsequent test
session fails to bind**, often presenting as 50+
unrelated failures ("all tests broken"!) across
backends.

**This has to be checked before the first run AND
after any cancelled/SIGINT'd run** — signal failures
in the middle of a test can leave orphan children.

```sh
# 1. TCP registry — any listener on :1616? (primary signal)
ss -tlnp 2>/dev/null | grep ':1616' || echo 'TCP :1616 free'

# 2. leftover actor/forkserver procs — scoped to THIS
#    repo's python path, so we don't false-flag legit
#    long-running tractor-using apps (e.g. `piker`,
#    downstream projects that embed tractor).
pgrep -af "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv" \
  | grep -v 'grep\|pgrep' \
  || echo 'no leaked actor procs from this repo'

# 3. stale UDS registry sockets
ls -la /tmp/registry@*.sock 2>/dev/null \
  || echo 'no leaked UDS registry sockets'
```

**Interpretation:**

- **TCP :1616 free AND no stale sockets** → clean,
  proceed. The actor-procs probe is secondary — false
  positives are common (piker, any other tractor-
  embedding app); only clean up if `:1616` is bound or
  sockets linger.
- **TCP :1616 bound OR stale sockets present** →
  surface PIDs + cmdlines to the user, offer cleanup:

```sh
# 1. GRACEFUL FIRST (tractor is structured concurrent — it
#    catches SIGINT as an OS-cancel in `_trio_main` and
#    cascades Portal.cancel_actor via IPC to every descendant.
#    So always try SIGINT first with a bounded timeout; only
#    escalate to SIGKILL if graceful cleanup doesn't complete).
pkill -INT -f "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv"

# 2. bounded wait for graceful teardown (usually sub-second).
#    Loop until the processes exit, or timeout. Keep the
#    bound tight — hung/abrupt-killed descendants usually
#    hang forever, so don't wait more than a few seconds.
for i in $(seq 1 10); do
  pgrep -f "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv" >/dev/null || break
  sleep 0.3
done

# 3. ESCALATE TO SIGKILL only if graceful didn't finish.
if pgrep -f "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv" >/dev/null; then
  echo 'graceful teardown timed out — escalating to SIGKILL'
  pkill -9 -f "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv"
fi

# 4. if a test zombie holds :1616 specifically and doesn't
#    match the above pattern, find its PID the hard way:
ss -tlnp 2>/dev/null | grep ':1616'  # prints `users:(("<name>",pid=NNNN,...))`
# then (same SIGINT-first ladder):
# kill -INT <NNNN>; sleep 1; kill -9 <NNNN> 2>/dev/null

# 5. remove stale UDS sockets
rm -f /tmp/registry@*.sock

# 6. re-verify
ss -tlnp 2>/dev/null | grep ':1616' || echo 'TCP :1616 now free'
```

**Never ignore stale registry state.** If you see the
"all tests failing" pattern — especially
`trio.TooSlowError` / connection refused / address in
use on many unrelated tests — check the registry **before**
spelunking into test code. The failure signature will
be identical across backends because they're all
fighting for the same port.

**False-positive warning for step 2:** a plain
`pgrep -af '_actor_child_main'` will also match
legit long-running tractor-embedding apps (e.g.
`piker` at `~/repos/piker/py*/bin/python3 -m
tractor._child ...`). Always scope to the current
repo's python path, or only use step 1 (`:1616`) as
the authoritative signal.

## 4. Run and report

- Run the constructed command.
- Use a timeout of **600000ms** (10min) for full-suite
  runs, **120000ms** (2min) for single-file runs.
- If the suite is large (full `tests/`), consider running
  in the background and checking output when done.
- Use `--lf` (last-failed) to re-run only previously
  failing tests when iterating on a fix.

### On failure:
- Show the failing test name(s) and a short traceback.
- If the failure looks related to recent changes, point
  out the likely cause and suggest a fix.
- **Check the known-flaky list** (section 8) before
  investigating — don't waste time on pre-existing
  timeout issues.
- **NEVER auto-commit fixes.** If you apply a code fix
  during test iteration, leave it unstaged. Tell the
  user what changed and suggest they review the
  worktree state, stage files manually, and use
  `/commit-msg` (inline or in a separate session) to
  generate the commit message. The human drives all
  `git add` and `git commit` operations.

### On success:
- Report the pass/fail/skip counts concisely.

## 5. Test directory layout (reference)

```
tests/
├── conftest.py            # root fixtures, daemon, signals
├── devx/                  # debugger/tooling tests
├── ipc/                   # transport protocol tests
├── msg/                   # messaging layer tests
├── discovery/             # discovery subsystem tests
│   ├── test_multiaddr.py  # multiaddr construction
│   └── test_registrar.py  # registry/discovery protocol
├── test_local.py          # registrar + local actor basics
├── test_rpc.py            # RPC error handling
├── test_spawning.py       # subprocess spawning
├── test_multi_program.py  # multi-process tree tests
├── test_cancellation.py   # cancellation semantics
├── test_context_stream_semantics.py  # ctx streaming
├── test_inter_peer_cancellation.py   # peer cancel
├── test_infected_asyncio.py          # trio-in-asyncio
└── ...
```

## 6. Change-type → test mapping

After modifying specific modules, run the corresponding
test subset first for fast feedback (a small resolver
sketch follows the table):

| Changed module(s) | Run these tests first |
|---|---|
| `runtime/_runtime.py`, `runtime/_state.py` | `test_local.py test_rpc.py test_spawning.py test_root_runtime.py` |
| `discovery/` (`_registry`, `_discovery`, `_addr`) | `tests/discovery/ test_multi_program.py test_local.py` |
| `_context.py`, `_streaming.py` | `test_context_stream_semantics.py test_advanced_streaming.py` |
| `ipc/` (`_chan`, `_server`, `_transport`) | `tests/ipc/ test_2way.py` |
| `runtime/_portal.py`, `runtime/_rpc.py` | `test_rpc.py test_cancellation.py` |
| `spawn/` (`_spawn`, `_entry`) | `test_spawning.py test_multi_program.py` |
| `devx/debug/` | `tests/devx/test_debugger.py` (slow!) |
| `to_asyncio.py` | `test_infected_asyncio.py test_root_infect_asyncio.py` |
| `msg/` | `tests/msg/` |
| `_exceptions.py` | `test_remote_exc_relay.py test_inter_peer_cancellation.py` |
| `runtime/_supervise.py` | `test_cancellation.py test_spawning.py` |
## 7. Quick-check shortcuts
|
||||
|
||||
### After refactors (fastest first-pass):
|
||||
```sh
|
||||
# import + collect check
|
||||
python -c 'import tractor' && python -m pytest tests/ -x -q --co 2>&1 | tail -3
|
||||
|
||||
# core subset (~10s)
|
||||
python -m pytest tests/test_local.py tests/test_rpc.py tests/test_spawning.py tests/discovery/test_registrar.py -x --tb=short --no-header
|
||||
```
|
||||
|
||||
### Inspect last failures (without re-running):
|
||||
|
||||
When the user asks "what failed?", "show failures",
|
||||
or wants to check the last-failed set before
|
||||
re-running — read the pytest cache directly. This
|
||||
is instant and avoids test collection overhead.
|
||||
|
||||
```sh
|
||||
python -c "
|
||||
import json, pathlib, sys
|
||||
p = pathlib.Path('.pytest_cache/v/cache/lastfailed')
|
||||
if not p.exists():
|
||||
print('No lastfailed cache found.'); sys.exit()
|
||||
data = json.loads(p.read_text())
|
||||
# filter to real test node IDs (ignore junk
|
||||
# entries that can accumulate from system paths)
|
||||
tests = sorted(k for k in data if k.startswith('tests/'))
|
||||
if not tests:
|
||||
print('No failures recorded.')
|
||||
else:
|
||||
print(f'{len(tests)} last-failed test(s):')
|
||||
for t in tests:
|
||||
print(f' {t}')
|
||||
"
|
||||
```

**Why not `--cache-show` or `--co --lf`?**

- `pytest --cache-show 'cache/lastfailed'` works
  but dumps raw dict repr including junk entries
  (stale system paths that leak into the cache).
- `pytest --co --lf` actually *collects* tests, which
  triggers import resolution and is slow (~0.5s+).
  Worse, when cached node IDs don't exactly match
  current parametrize IDs (e.g. param names changed
  between runs), pytest falls back to collecting
  the *entire file*, giving false positives.
- Reading the JSON directly is instant, filterable
  to `tests/`-prefixed entries, and shows exactly
  what pytest recorded — no interpretation.

**After inspecting**, re-run the failures:
```sh
python -m pytest --lf -x --tb=short --no-header
```

### Full suite in background:
When core tests pass and you want full coverage while
continuing other work, run in background:
```sh
python -m pytest tests/ -x --tb=short --no-header -q
```
(use `run_in_background=true` on the Bash tool)

## 8. Known flaky tests

These tests have **pre-existing** timing/environment
sensitivity. If they fail with `TooSlowError` or
pexpect `TIMEOUT`, they are almost certainly NOT caused
by your changes — note them and move on.

| Test | Typical error | Notes |
|---|---|---|
| `devx/test_debugger.py::test_multi_nested_subactors_error_through_nurseries` | pexpect TIMEOUT | Debugger pexpect timing |
| `test_cancellation.py::test_cancel_via_SIGINT_other_task` | TooSlowError | Signal handling race |
| `test_inter_peer_cancellation.py::test_peer_spawns_and_cancels_service_subactor` | TooSlowError | Async timing (both param variants) |
| `test_docs_examples.py::test_example[we_are_processes.py]` | `assert None == 0` | `__main__` missing `__file__` in subproc |

**Rule of thumb**: if a test fails with `TooSlowError`,
`trio.TooSlowError`, or `pexpect.TIMEOUT` and you didn't
touch the relevant code path, it's flaky — skip it.

## 9. The pytest-capture hang pattern (CHECK THIS FIRST)

**Symptom:** a tractor test hangs indefinitely under
default `pytest` but passes instantly when you add
`-s` (`--capture=no`).

**Cause:** tractor subactors (especially under fork-
based backends) inherit pytest's stdout/stderr
capture pipes via fds 1,2. Under high-volume error
logging (e.g. multi-level cancel cascade, nested
`run_in_actor` failures, anything triggering
`RemoteActorError` + `ExceptionGroup` traceback
spew), the **64KB Linux pipe buffer fills** faster
than pytest drains it. Subactor writes block → can't
finish exit → parent's `waitpid`/pidfd wait blocks →
deadlock cascades up the tree.

**Pre-existing guards in the tractor harness** that
encode this same knowledge — grep these FIRST
before spelunking:

- `tests/conftest.py:258-260` (in the `daemon`
  fixture): `# XXX: too much logging will lock up
  the subproc (smh)` — downgrades `trace`/`debug`
  loglevel to `info` to prevent the hang.
- `tests/conftest.py:316`: `# can lock up on the
  _io.BufferedReader and hang..` — noted on the
  `proc.stderr.read()` post-SIGINT.

**Debug recipe (in priority order):**

1. **Try `-s` first.** If the hang disappears with
   `pytest -s`, you've confirmed it's capture-pipe
   fill. Skip spelunking.
2. **Lower the loglevel.** Default `--ll=error` on
   this project; if you've bumped it to `debug` /
   `info`, try dropping back. Each log level
   multiplies pipe-pressure under fault cascades.
3. **If you MUST use default capture + high log
   volume**, redirect subactor stdout/stderr in the
   child prelude (e.g.
   `tractor.spawn._subint_forkserver._child_target`
   post-`_close_inherited_fds`) to `/dev/null` or a
   file (see the sketch below).
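
A minimal sketch of the step-3 redirect, assuming it runs
early in the child prelude; the helper itself is
illustrative, not the actual hook:

```python
# illustrative child-prelude redirect: point raw fds 1/2 at
# /dev/null so high-volume subactor logs can't fill the
# inherited pytest capture pipe
import os
import sys

def redirect_child_stdio(target: str = os.devnull) -> None:
    sink = os.open(target, os.O_WRONLY | os.O_APPEND | os.O_CREAT)
    os.dup2(sink, 1)  # raw stdout fd
    os.dup2(sink, 2)  # raw stderr fd
    os.close(sink)
    # rebind the Python-level streams onto the new fds
    sys.stdout = os.fdopen(1, 'w', buffering=1, closefd=False)
    sys.stderr = os.fdopen(2, 'w', buffering=1, closefd=False)
```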

**Signature tells you it's THIS bug (vs. a real
code hang):**

- Multi-actor test under fork-based backend
  (`subint_forkserver`, eventually `trio_proc` too
  under enough log volume).
- Multiple `RemoteActorError` / `ExceptionGroup`
  tracebacks in the error path.
- Test passes with `-s` in the 5-10s range, hangs
  past pytest-timeout (usually 30+s) without `-s`.
- Subactor processes visible via `pgrep -af
  subint-forkserv` or similar after the hang —
  they're alive but blocked on `write()` to an
  inherited stdout fd.

**Historical reference:** this deadlock cost a
multi-session investigation (4 genuine cascade
fixes landed along the way) that only surfaced the
capture-pipe issue AFTER the deeper fixes let the
tree actually tear down enough to produce pipe-
filling log volume. Full post-mortem in
`ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md`.
Lesson codified here so future-me grep-finds the
workaround before digging.

## 10. Reaping zombie subactors (`tractor-reap`)

**Symptom:** after a `pytest` run crashes, times out,
or is `Ctrl+C`'d, subactor forks (esp. under
`subint_forkserver`) can be reparented to `init`
(PPid==1) and linger. They hold onto ports, inherit
pytest's capture-pipe fds, and flakify later
sessions.

**Two layers of defense:**

### a) Session-scoped auto-fixture (always on)

`tractor/_testing/pytest.py::_reap_orphaned_subactors`
runs at pytest session teardown. It walks `/proc` for
direct descendants of the pytest pid, SIGINTs them,
waits up to 3s, then SIGKILLs survivors. SC-polite:
gives the subactor runtime a chance to run its trio
cancel shield + IPC teardown before escalation.

This is *autouse* and session-scoped — you don't need
to do anything. It just runs.
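
A hedged sketch of the SIGINT-then-SIGKILL ladder the
fixture implements; function name and polling interval
are assumptions, not the real fixture code:

```python
# illustrative reap ladder: polite SIGINT, bounded grace
# wait, then SIGKILL escalation for survivors
import os
import signal
import time

def reap_pids(pids: list[int], grace_s: float = 3.0) -> None:
    for pid in pids:
        try:
            os.kill(pid, signal.SIGINT)  # let trio teardown run first
        except ProcessLookupError:
            pass  # already gone

    deadline = time.monotonic() + grace_s
    alive = list(pids)
    while alive and time.monotonic() < deadline:
        alive = [p for p in alive if os.path.exists(f'/proc/{p}')]
        time.sleep(0.05)

    for pid in alive:  # escalate on anything that ignored SIGINT
        try:
            os.kill(pid, signal.SIGKILL)
        except ProcessLookupError:
            pass
```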

### b) `scripts/tractor-reap` CLI (manual reap)

For the **pytest-died-mid-session** case (Ctrl+C, OOM
kill, hung process you had to `kill -9`), the fixture
never ran. Reach for the CLI:

```sh
# default: orphans (PPid==1, cwd==repo, cmd contains python)
scripts/tractor-reap

# descendant-mode: from a still-live supervisor
scripts/tractor-reap --parent <pytest-pid>

# see what would be reaped, don't signal
scripts/tractor-reap -n

# tune the SIGINT → SIGKILL grace window
scripts/tractor-reap --grace 5
```

Exit code: `0` if everyone exited on SIGINT, `1` if
SIGKILL had to escalate — so you can chain it in CI
health-checks (`scripts/tractor-reap || <alert>`).

**What it matches** (orphan-mode):
- `PPid == 1` (reparented to init → definitely
  orphaned, not just a currently-running child)
- `cwd == <repo-root>` (keeps the sweep scoped; won't
  touch unrelated init-children elsewhere)
- `python` in cmdline

**What it does not do:** kill anything whose PPid is
still a live tractor parent. If the parent is alive
it's not an orphan; use `--parent <pid>` if you need
to force-reap under a still-live supervisor.

**When NOT to run it:** while a pytest session is
active in another terminal. It's safe (won't touch
that session's live children in orphan-mode) but can
race if the target session is mid-teardown.

### c) `--shm` / `--shm-only`: orphan-segment sweep

Because `tractor.ipc._mp_bs.disable_mantracker()`
turns off `mp.resource_tracker` (see
`ai/conc-anal/subint_forkserver_mp_shared_memory_issue.md`),
a hard-crashing actor can leave `/dev/shm/<key>`
segments behind that nothing else GCs.

```sh
# process reap THEN shm sweep
scripts/tractor-reap --shm

# shm sweep only (skip process phase)
scripts/tractor-reap --shm-only

# dry-run: list candidates, don't unlink
scripts/tractor-reap --shm -n
```

**Match criteria** (very conservative — this is a
shared-system path, can't be wrong):
- segment is a regular file under `/dev/shm`,
- owned by the **current uid** (`stat.st_uid`),
- AND **no live process holds it open** —
  enumerated by walking every readable
  `/proc/<pid>/maps` (post-mmap mappings) AND
  `/proc/<pid>/fd/*` (pre-mmap shm-opened fds).

The "nobody has it open" check is the
kernel-canonical "is this leaked?" test — same
answer `lsof /dev/shm/<key>` would give. No
reliance on tractor-specific naming, so it works
for any tractor app. Critically, it WILL NOT touch
segments held by other apps you have running
(e.g. `piker`, `lttng-ust-*`, `aja-shm-*` —
verified locally with 81 in-use segments correctly
preserved).
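
A hedged sketch of that in-use check; the walk details are
assumptions and the real CLI may differ:

```python
# illustrative "does any live proc hold this shm segment
# open?" check: scan both maps (post-mmap) and fd tables
# (pre-mmap) of every readable /proc/<pid>
import os

def shm_in_use(seg_path: str) -> bool:
    target = os.path.realpath(seg_path)
    for pid in filter(str.isdigit, os.listdir('/proc')):
        try:
            # post-mmap: the segment shows up as a mapped path
            with open(f'/proc/{pid}/maps') as maps:
                if target in maps.read():
                    return True
            # pre-mmap: the segment may only be an open fd so far
            fd_dir = f'/proc/{pid}/fd'
            for fd in os.listdir(fd_dir):
                if os.path.realpath(f'{fd_dir}/{fd}') == target:
                    return True
        except OSError:
            continue  # pid exited or tables unreadable
    return False
```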

@@ -1,10 +1,18 @@
name: CI

# NOTE distilled from,
# https://github.com/orgs/community/discussions/26276
on:
  # any time someone pushes a new branch to origin
  # any time a new update to 'main'
  push:
    branches:
      - main

  # Allows you to run this workflow manually from the Actions tab
  # for on all (forked) PRs to repo
  # NOTE, use a draft PR if you just want CI triggered..
  pull_request:

  # to run workflow manually from the "Actions" tab
  workflow_dispatch:

jobs:

@@ -74,24 +82,81 @@ jobs:
    # run: mypy tractor/ --ignore-missing-imports --show-traceback


  testing-linux:
    name: '${{ matrix.os }} Python ${{ matrix.python }} - ${{ matrix.spawn_backend }}'
    timeout-minutes: 10
  testing:
    name: '${{ matrix.os }} Python${{ matrix.python-version }} spawn_backend=${{ matrix.spawn_backend }} tpt_proto=${{ matrix.tpt_proto }} capture=${{ matrix.capture }}'
    timeout-minutes: 20
    runs-on: ${{ matrix.os }}

    # NOTE on the matrix shape — the `capture=` mode follows
    # `spawn_backend`:
    #
    # - `trio` / `mp_*` backends use `--capture=fd` (default)
    #   for per-test attribution of subactor *raw-fd* output
    #   in failure reports.
    # - Fork-based backends (`main_thread_forkserver`,
    #   `subint_forkserver`) REQUIRE `--capture=sys` because
    #   fork-child × `--capture=fd` is a known deadlock
    #   pattern. See the long NOTE in `tractor._testing.pytest`'s
    #   `pytest_load_initial_conftests` for the mechanism +
    #   tradeoff write-up.
    #
    # If a future matrix row adds a fork-spawn backend
    # WITHOUT setting `capture: 'sys'`, the
    # `pytest_load_initial_conftests` hook fail-fasts on `CI=1`
    # with a clear error msg. So the matrix is self-policing.
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
        python-version: ['3.13']
        os: [
          ubuntu-latest,
          macos-latest,
        ]
        python-version: [
          '3.13',
          # '3.14',
        ]
        spawn_backend: [
          'trio',
          # 'mp_spawn',
          # 'mp_forkserver',
          # ?TODO^ is it worth it to get these running again?
          #
          # - [ ] next-gen backends, on 3.13+
          #   https://github.com/goodboy/tractor/issues/379
          # 'subinterpreter',
          # 'subint',
        ]
        tpt_proto: [
          'tcp',
          'uds',
        ]
        capture: [
          'fd',  # default for non-fork backends
        ]

    steps:
        # Fork-based backends — added via `include:` so each
        # cell carries its REQUIRED `capture: 'sys'` mode.
        # Linux-only for now; macOS coverage TBD pending
        # local validation.
        include:
          - os: ubuntu-latest
            python-version: '3.13'
            spawn_backend: 'main_thread_forkserver'
            tpt_proto: 'tcp'
            capture: 'sys'
          - os: ubuntu-latest
            python-version: '3.13'
            spawn_backend: 'main_thread_forkserver'
            tpt_proto: 'uds'
            capture: 'sys'

        # https://github.com/orgs/community/discussions/26253#discussioncomment-3250989
        exclude:
          # don't do UDS run on macOS (for now)
          - os: macos-latest
            tpt_proto: 'uds'

    steps:
      - uses: actions/checkout@v4

      - name: 'Install uv + py-${{ matrix.python-version }}'

@@ -118,7 +183,18 @@ jobs:
        run: uv tree

      - name: Run tests
        run: uv run pytest tests/ --spawn-backend=${{ matrix.spawn_backend }} -rsx
        run: >
          uv run
          pytest
          tests/
          -rsx
          --spawn-backend=${{ matrix.spawn_backend }}
          --tpt-proto=${{ matrix.tpt_proto }}
          --capture=${{ matrix.capture }}
        # NOTE: capture mode is matrix-driven — `fd` for
        # non-fork backends (per-test fd attribution),
        # `sys` for fork-based (avoids fork-child x
        # capture-fd deadlock). See matrix-NOTE above.

      # XXX legacy NOTE XXX
      #

@@ -102,3 +102,69 @@ venv.bak/

# mypy
.mypy_cache/

# all files under
.git/

# require very explicit staging for anything we **really**
# want put/kept in repo.
notes_to_self/
snippets/

# ------- AI shiz -------
# `ai.skillz` symlinks,
# (machine-local, deploy via deploy-skill.sh)
.claude/skills/py-codestyle
.claude/skills/close-wkt
.claude/skills/plan-io
.claude/skills/prompt-io
.claude/skills/resolve-conflicts
.claude/skills/inter-skill-review

# /open-wkt specifics
.claude/skills/open-wkt
.claude/wkts/
claude_wkts

# /code-review-changes specifics
.claude/skills/code-review-changes
# review-skill ephemeral ctx (per-PR, single-use)
.claude/review_context.md
.claude/review_regression.md

# /pr-msg specifics
.claude/skills/pr-msg/*
# repo-specific
!.claude/skills/pr-msg/format-reference.md
# XXX, so u can nvim-telescope this file.
# !.claude/skills/pr-msg/pr_msg_LATEST.md

# /commit-msg specifics
# - any commit-msg gen tmp files
.claude/*_commit_*.md
.claude/*_commit*.txt
.claude/skills/commit-msg/*
!.claude/skills/commit-msg/style-duie-reference.md

# use prompt-io instead?
.claude/plans

# nix develop --profile .nixdev
.nixdev*

# :Obsession .
Session.vim

# `gish` local `.md`-files
# TODO? better all around automation!
# -[ ] it'd be handy to also commit and sync with wtv git service?
# -[ ] everything should be put under a `.gish/` no?
gitea/
gh/

# ------ macOS ------
# Finder metadata
**/.DS_Store

# LLM conversations that should remain private
docs/conversations/
@@ -0,0 +1,202 @@
# Cancel-cascade `trio.TooSlowError` flakes under `main_thread_forkserver`

## Symptom

Running the full test suite under

```bash
./py313/bin/python -m pytest tests/ \
    --tpt-proto=tcp \
    --spawn-backend=main_thread_forkserver
```

surfaces a single, **rotating** `trio.TooSlowError`
failure each run. The failure isn't deterministic on
test identity — different test each run — but it
ALWAYS looks like:

```
FAILED tests/<file>::test_<name> - trio.TooSlowError
==== 1 failed, 373 passed, 17 skipped, 11–12 xfailed,
0–1 xpassed, ~550 warnings in ~6min ====
```

Pass rate: **~99.7%** (373 of 374 non-skip tests).
Wall-clock per full run: 5–6 min.

## Tests observed flaking so far

Each row was the SOLE failure in a separate run:

| run # | test |
|---|---|
| 1 | `tests/test_advanced_streaming.py::test_dynamic_pub_sub[KeyboardInterrupt]` |
| 2 | `tests/test_infected_asyncio.py::test_context_spawns_aio_task_that_errors[parent_actor_cancels_child=False]` |

Both share the same shape:

- **Cancel cascade** of N subactors back to a parent root actor.
- N ≥ `multiprocessing.cpu_count()` for `test_dynamic_pub_sub`
  (it spawns `cpus - 1` consumers + publisher + dynamic-consumer).
- N ≈ 2 for `test_context_spawns_aio_task_that_errors` —
  but each subactor is `infect_asyncio=True`, so each
  cancel involves the trio↔asyncio guest-run unwind
  which is structurally heavier than pure-trio.
- Test wraps the cascade in `trio.fail_after(N seconds)`
  and the cap fires before the cascade completes.

The exact failing test rotates because each test is
independently close to the cap; whichever happens to
be unlucky in scheduling/CPU-contention on a given run
is the one that times out.

## Root-cause family

`hard_kill` (`tractor/spawn/_spawn.py:hard_kill`) runs
the SC-graceful teardown ladder per subactor:

1. `Portal.cancel_actor()` — graceful IPC cancel-req.
2. Wait `terminate_after=1.6s` for sub to exit.
3. If still alive: `proc.kill()` (SIGKILL).
4. (NEW) `_unlink_uds_bind_addrs()` — post-mortem
   sock-file cleanup for UDS leaks (issue #452 fix).

For a cascade of N subactors, each pays steps 1–4. If
graceful-cancel doesn't complete within 1.6s for ANY
sub, that sub eats a full 1.6s of `move_on_after` plus
the `proc.wait()` post-SIGKILL.
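
Schematically, the per-subactor ladder looks something
like this (a hedged trio sketch; the real `hard_kill`
differs in detail):

```python
# schematic of the graceful-then-hard ladder described
# above; names/signatures are illustrative
import trio

async def cancel_then_kill(portal, proc, terminate_after: float = 1.6):
    with trio.move_on_after(terminate_after) as cs:
        await portal.cancel_actor()  # step 1: graceful IPC cancel-req
        await proc.wait()            # step 2: wait for clean exit
    if cs.cancelled_caught:
        proc.kill()                  # step 3: SIGKILL the holdout
        await proc.wait()            # reap so no zombie lingers
```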

Worst case under fork backend with N=cpus subs:
- N × 1.6s = 16s+ on a 10-core box just for the
  graceful timeout phase
- Plus per-spawn fork-IPC handshake cost compounds
  during teardown (each sub's IPC cleanup goes through
  the same forkserver coordinator)
- Plus the new autouse fixtures
  (`_track_orphaned_uds_per_test`,
  `_detect_runaway_subactors_per_test`,
  `_reap_orphaned_subactors`) all run at test
  teardown, adding small (10s of ms) but cumulative
  overhead

Current cap: 30s (`fail_after_s = 30 if
is_forking_spawner else 12`). Empirically fits the
median run but the tail breaks ~0.3% of the time.

## NOT regressing

To confirm this is a flake and not a regression:

- Pre-`WakeupSocketpair`-patch baseline: tests
  HUNG INDEFINITELY (busy-loop never released).
- Post-patch: pass-or-fail-fast, ~99.7% pass, the
  occasional cap-hit fails in bounded time (<60s for
  the offending test).
- Same test PASSES under `--spawn-backend=trio`
  (no fork, no hard-kill compounding).

So the suite is dramatically better than before; the
remaining flake is a known-tolerable steady-state.

## Possible mitigations (ranked)

### A. Bump the cap further

Cheapest. Change the per-test `fail_after_s` from 30
to e.g. 60 for fork backends. Pros: trivial. Cons:
masks any genuine slowness regression we'd want to
catch.

### B. CPU-count-aware cap

For tests whose N scales with `cpu_count()`, scale
the cap too:

```python
fail_after_s = (
    max(30, cpu_count() * 3)  # 3s/actor floor
    if is_forking_spawner
    else 12
)
```

Pros: scales with the actual cancel-cascade work.
Cons: still an arbitrary multiplier.

### C. `pytest-rerunfailures` for these tests only

Mark the known-flaky tests with
`@pytest.mark.flaky(reruns=1)` (needs
`pytest-rerunfailures` dep). Single retry hides
genuine ~0.3% transient flakes.

Pros: no cap change, surfaces persistent failures
loudly. Cons: adds a dep, retries can mask real bugs
if used widely.
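
A hedged usage sketch for (C); `reruns_delay` is a real
`pytest-rerunfailures` option and the target test name is
from the table above, but the exact marking is illustrative:

```python
# mark ONLY the known cascade-flaky tests for a single
# retry; requires the `pytest-rerunfailures` plugin
import pytest

@pytest.mark.flaky(reruns=1, reruns_delay=1)
def test_dynamic_pub_sub():
    ...
```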

### D. Reduce `hard_kill`'s `terminate_after`

Drop from 1.6s → 0.8s. Cuts the worst-case cascade
time roughly in half. Risks: fewer subs get a chance
to run their cleanup before SIGKILL → more orphaned
state for the autouse reapers to handle (ironically,
adds back overhead elsewhere).

### E. Profile + targeted fix

Add `log.devx()` markers in `hard_kill` to time each
phase. Identify if any subactor is consistently
hitting the 1.6s cap (vs. exiting in <0.1s). If so,
that sub has a teardown bug worth fixing at source.
Pros: actually fixes the underlying slowness. Cons:
real investigation work, deferred from this round.

## Recommendation

Land this issue-doc as the tracker. Apply **(B)** as
a small follow-up — cheap and proportional. If it
still flakes, escalate to **(E)** with a `log.devx()`
profile-pass.

**(C)** is a backstop if **(B)** doesn't quite get there
and we need green CI faster than **(E)** can deliver.

## Verification protocol

After applying any mitigation:

```bash
# Run the suite N times back-to-back, count failures.
# A persistent failure on the SAME test == real bug.
# Failures rotating across tests == still cap-related.

for i in $(seq 1 5); do
    ./py313/bin/python -m pytest tests/ \
        --tpt-proto=tcp \
        --spawn-backend=main_thread_forkserver \
        -q 2>&1 | tail -2
done
```

Target: 0 failures across 5 runs ⇒ ship. 1–2 failures
still rotating ⇒ apply (C). Same test failing twice
⇒ escalate to (E).

## See also

- [#452](https://github.com/goodboy/tractor/issues/452) —
  UDS sock-file leak (related — `hard_kill`'s
  cleanup phase contributes to cascade time)
- `ai/conc-anal/trio_wakeup_socketpair_busy_loop_under_fork_issue.md`
  — the upstream-trio fix that turned this from a
  100% hang into a 0.3% flake
- `ai/conc-anal/infected_asyncio_under_main_thread_forkserver_hang_issue.md`
  — the asyncio variant which contributes to one of
  the rotating failures
- `tractor/spawn/_spawn.py::hard_kill` — the SIGKILL
  cascade source
- `tractor/_testing/_reap.py::_track_orphaned_uds_per_test`,
  `_detect_runaway_subactors_per_test`,
  `_reap_orphaned_subactors` — autouse cleanup
  fixtures whose cumulative teardown overhead
  contributes to the cascade time
@@ -0,0 +1,281 @@
# `fork()` in a multi-threaded program — execution-side vs. memory-side of the same coin

A reference doc for readers who've encountered one of two
opposite-sounding framings of POSIX `fork()` semantics in a
multi-threaded program and are confused by the other.

This is a sibling to
`subint_fork_blocked_by_cpython_post_fork_issue.md` — that
doc covers a CPython-level refusal of fork-from-subint;
this one covers the more general POSIX layer, since
tractor's main-thread forkserver design rests on it.

## TL;DR

POSIX `fork()` only preserves the *calling* thread as a
runnable thread in the child — every other thread in the
parent simply never executes another instruction in the
child. trio's docs call this "leaked"; tractor's
`_main_thread_forkserver.py` docstring calls it "gone".
Both are correct: "gone" is the *execution* side (no
scheduler entry, no instructions retired), "leaked" is the
*memory* side (the dead threads' stacks and per-thread
heap structures still ride into the child's address space
as orphaned COW pages with no owner and no cleanup hook).
Same POSIX reality, two halves of the same coin.

## The two framings

[python-trio/trio#1614][trio-1614] (the canonical "trio +
fork" hazards thread) puts it this way:

> If you use `fork()` in a process with multiple threads,
> all the other thread stacks are just leaked: there's
> nothing else you can reasonably do with them.

`tractor.spawn._main_thread_forkserver`'s module docstring
(specifically the "What survives the fork? — POSIX
semantics" section) puts it this way:

> POSIX `fork()` only preserves the *calling* thread as a
> runnable thread in the child. Every other thread in the
> parent — trio's runner thread, any `to_thread` cache
> threads, anything else — never executes another
> instruction post-fork.

A reader bouncing between the two can be forgiven for
asking: well, *which* is it — leaked or gone?

The answer is "yes". They're describing the same POSIX
behavior from two different angles:

- trio is talking about the **bytes** the dead threads
  leave behind — stacks, TLS slots, per-thread arena
  metadata — and the fact that nothing in the child can
  drive them forward, free them, or even safely walk
  them. That's a memory leak in the strict sense: held
  but unreachable.
- tractor is talking about the **execution** side
  relevant to the forkserver design: which threads
  retire instructions in the child? Exactly one — the
  one that called `fork()`. Everything else, regardless
  of the bytes left behind, is dead in a scheduler
  sense.

Neither framing is wrong; they're just answering
different questions.

## POSIX `fork()` in a multi-threaded program — what actually happens

Per POSIX (and concretely on Linux glibc), the contract
of `fork()` in a multi-threaded process is:

1. The kernel creates a new process whose virtual
   address space is a COW copy of the parent's. *All*
   pages map across — code, heap, every thread's stack,
   every malloc arena, every mmap region.
2. Of the parent's N threads, exactly **one** is
   reified in the child as a runnable kernel task: the
   thread that called `fork()`. The other N-1 threads
   have *no* corresponding task in the child kernel. They
   were never scheduled, never `clone()`d for the child,
   never exist as runnable entities.
3. Their **memory artifacts** — pthread stacks, TLS,
   `pthread_t` structures, glibc per-thread arena
   bookkeeping — are still mapped in the child's address
   space, because (1) duplicates *everything* page-wise.
   They sit there as inert COW bytes.
4. The kernel does not clean those bytes up. There is no
   "phantom-thread cleanup" pass post-fork. The kernel
   doesn't know which mapped pages "belonged to" which
   thread — at the kernel level mappings are
   process-scoped, not thread-scoped.
5. The surviving thread (the caller of `fork()`) cannot
   safely access those leaked bytes either. Any state
   they encoded — held mutexes, in-flight syscalls,
   half-updated invariants — is frozen at whatever
   instant the parent's fork-syscall observed it. Some
   of those mutexes may even still be locked from the
   child's POV (the canonical "fork-in-multithreaded-
   program-deadlocks" hazard; see `man pthread_atfork`).

So: from the kernel's PoV, the child has one thread.
From the address-space's PoV, the child has all the
parent's bytes — including the corpses of the N-1 dead
threads' stacks. Both true simultaneously.
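
The execution-side claim is easy to demonstrate in a tiny
script (a sketch, not tractor code; recent CPython also
emits a fork-with-threads `DeprecationWarning` here): the
worker keeps printing in the parent but never resumes in
the child.

```python
# demo: only the fork()-calling thread survives in the child
import os
import threading
import time

def worker() -> None:
    while True:
        print(f'worker ticking in pid={os.getpid()}')
        time.sleep(0.3)

threading.Thread(target=worker, daemon=True).start()
time.sleep(0.5)  # let the worker start ticking

pid = os.fork()
if pid == 0:
    # child: the worker was never cloned; its stack bytes
    # exist here as COW pages but it never prints again
    time.sleep(1)
    print(f'child pid={os.getpid()}: '
          f'threads={threading.active_count()}')  # reports 1
    os._exit(0)

os.waitpid(pid, 0)
```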

## Why trio says "leaked"

trio's framing makes sense from the parent's
PoV, looking at *what those threads were doing*. In a
running `trio.run()` process you typically have:

- The trio runner thread itself — owns the `selectors`
  epoll fd, the signal-wakeup-fd, the run-queue.
- Threadpool worker threads (`trio.to_thread`'s cache)
  — blocked in `wait()` on the threadpool's work
  condvar.
- Whatever other ad-hoc threads the application
  started.

Each of those threads owns *real work-state*: epoll
registrations, file descriptors held in
soon-to-be-completed reads, half-released locks, posted
but unconsumed wakeups. After fork, that state is still
encoded in the child's memory. None of it is invalid in
a well-formed-bytes sense. It's just that:

- The thread that was driving it is gone.
- Nothing else in the child knows the layout well
  enough to take over.
- Even if it did, the kernel objects backing the work
  (epoll fd, signalfd) have separate post-fork
  semantics that don't compose with userland trio
  state.

So the bytes are *held* (they're in the child's
address space, they count against RSS, they survive
until something clobbers them), and they're
*unreachable* in any meaningful sense — no thread can
safely drive them forward. That is the textbook
definition of a leak.

trio's quote is reminding the user that `fork()` from a
multi-threaded process is a one-way memory hazard:
whatever those threads were doing, that work-state is
now garbage you happen to still be carrying.

## Why tractor says "gone"

tractor's `_main_thread_forkserver` framing is concerned
with a different question: *which thread executes in the
child, and is it safe?*

The forkserver design rests on POSIX's "calling thread
is the sole survivor" guarantee. We pick that calling
thread very deliberately: a dedicated worker that has
provably never entered trio. So the thread that *does*
run in the child is one whose locals, TLS, and stack
contain nothing trio-related. Trio's runner thread —
the one that owned the epoll fd and the run-queue — is
*gone* from the child in the execution sense. It will
never run another instruction. The fact that its stack
bytes still exist in the child's address space (the
"leaked" view) is irrelevant to the forkserver, because
nothing in the child reads or writes those pages.

So when the docstring says "Every other thread … is
gone the instant `fork()` returns in the child", it's
being precise about the surface that matters for the
backend: scheduler-level liveness. Nothing schedules
those threads ever again. Whether their bytes are
hanging around is a separate (and, for the design,
non-load-bearing) fact.

## Cross-table

The same tabular layout the `_main_thread_forkserver`
docstring uses, expanded with a fourth "what handles
it" column:

| thread | parent | child (executing) | child (memory) | what handles it |
|---|---|---|---|---|
| forkserver worker | continues | sole survivor | live stack | runs the child's bootstrap |
| `trio.run()` thread | continues | not running | leaked stack (zombie bytes) | overwritten by child's fresh `trio.run()` |
| any other thread | continues | not running | leaked stack (zombie bytes) | overwritten / GC'd / clobbered by `exec()` if used |

The "child (executing)" column is the *execution* side
of the coin — what tractor cares about. The "child
(memory)" column is the *memory* side — what trio
cares about.

The "what handles it" column is the deliberate punchline
of the design: nothing has to handle the leaked bytes
*explicitly*. They get clobbered by ordinary forward
progress in the child:

- The fresh `trio.run()` the child boots up allocates
  its own stack, scheduler, and run-queue, which over
  time overlaps and overwrites the inherited zombie
  pages.
- Python's GC walks live objects only; the dead-thread
  Python frames aren't reachable from any
  `PyThreadState`, so they get freed at the next
  collection cycle.
- If the child eventually `exec()`s, the entire address
  space is replaced and the leak vanishes.

## What this means for the forkserver design

The crucial point is that **the design doesn't and
*can't* prevent the leak**. There is no userland fix
for COW thread stacks. The kernel hands the child a
duplicated address space; that's what `fork()` *is*. No
amount of pre-fork hookery, `pthread_atfork()`
gymnastics, or post-fork cleanup can un-COW the dead
threads' pages without unmapping them, and unmapping
arbitrary regions of a duplicated address space is
neither portable nor safe.

What the design *does* ensure is the orthogonal
property: the survivor thread is one that doesn't need
any of that leaked state to function. Concretely:

- Survivor is the forkserver worker thread.
- That worker has provably never imported, called into,
  or held any reference to `trio`. (Enforced by keeping
  the worker's lifecycle entirely in
  `_main_thread_forkserver.py` and never letting trio
  task-state cross into it.)
- So the leaked pages — trio runner stack, threadpool
  caches, etc. — are inert relative to the survivor.
  No code path in the child references them.
- The child then boots its own fresh `trio.run()`,
  which allocates new state in new pages. Over the
  child's lifetime the COW'd zombie pages get
  overwritten, GC'd, or (if the child eventually
  `exec()`s) discarded wholesale.

The "leak" is real but inert. It costs RSS until
clobbered; it doesn't cost correctness. That's exactly
the property the forkserver pattern is built on, and
it's also why the design needs the "calling thread is
trio-free" precondition to be airtight: if the survivor
were a trio thread, it *would* try to drive the leaked
trio state, and the leak would no longer be inert.

## See also

- `tractor/spawn/_main_thread_forkserver.py` — module
  docstring's "What survives the fork? — POSIX
  semantics" section is the in-tree, code-adjacent
  prose this doc expands on. The cross-table here is a
  fourth-column expansion of the table there.

- [python-trio/trio#1614][trio-1614] — the trio issue
  with the "leaked" framing, and the canonical thread
  for trio + `fork()` hazards more broadly.

- [`subint_fork_blocked_by_cpython_post_fork_issue.md`](./subint_fork_blocked_by_cpython_post_fork_issue.md)
  — sibling analysis covering CPython's *post-fork*
  hooks (`PyOS_AfterFork_Child`,
  `_PyInterpreterState_DeleteExceptMain`) and why
  fork-from-non-main-subint is a CPython-level hard
  refusal. Complementary axis: this doc is about POSIX
  semantics; that doc is about the CPython runtime
  layer that runs *after* POSIX `fork()` returns in
  the child.

- `man pthread_atfork(3)` — canonical "fork in a
  multithreaded process is dangerous" reference.
  Especially the rationale section, which is the
  closest thing to a normative statement of "the
  surviving thread cannot safely use anything the dead
  threads were touching."

- `man fork(2)` (Linux) — "Other than [the calling
  thread], … no other threads are replicated …"
  paragraph is the kernel-side statement of the
  execution-side framing this doc opens with.

[trio-1614]: https://github.com/python-trio/trio/issues/1614
@@ -0,0 +1,378 @@
# `infect_asyncio` × `main_thread_forkserver` Mode-A deadlock

## Reproducer

```bash
./py313/bin/python -m pytest \
    tests/test_infected_asyncio.py::test_aio_simple_error \
    --tpt-proto=tcp \
    --spawn-backend=main_thread_forkserver \
    -v --capture=sys
```

Hangs indefinitely. Mode-A signature — both processes
parked in `epoll_wait`, **neither burning CPU**.

## Empirical observations (caught alive)

### Outer pytest (parent)

`py-spy dump` on the test runner pid shows the trio
event loop parked at the bottom of `trio.run()`:

```
Thread <pid> (idle): "MainThread"
    get_events (trio/_core/_io_epoll.py:245)
        self: <EpollIOManager at 0x...>
        timeout: 86400
    run (trio/_core/_run.py:2415)
        next_send: []
        timeout: 86400
    test_aio_simple_error (tests/test_infected_asyncio.py:175)
```

`timeout: 86400` is trio's "no scheduled work, just wait
for I/O forever" sentinel. `next_send: []` confirms
nothing is queued. The parent is stuck inside
`tractor.open_nursery(...).run_in_actor(...)` waiting
for `ipc_server.wait_for_peer(uid)` to fire — i.e.
waiting for the spawned subactor to connect back.

### Subactor (forked child)

`/proc/<pid>/stack`:

```
do_epoll_wait+0x4c0/0x500
__x64_sys_epoll_wait+0x70/0x120
do_syscall_64+0xef/0x1540
entry_SYSCALL_64_after_hwframe+0x77/0x7f
```

`strace -p <pid> -f`:

```
[pid <child-A>] epoll_wait(6 <unfinished ...>
[pid <child-B>] epoll_wait(3
```

**Two threads**, both parked in `epoll_wait` on
distinct epoll fds. Both blocked, neither making
progress.

### Subactor file-descriptor table

```
fd=0,1,2      stdio
fd=3          eventpoll [watches fd 4]
fd=4 ↔ fd=5   unix STREAM (CONNECTED) — internal pair
fd=6          eventpoll [watches fds 7, 9]
fd=7 ↔ fd=8   unix STREAM (CONNECTED) — internal pair
fd=9 ↔ fd=10  unix STREAM (CONNECTED) — internal pair
```

Confirmed via `ss -xp` peer-inode lookup: **all 6 unix
sockets are internal socketpairs** (peer in same pid).

**Critical**: zero TCP/IPv4/IPv6 sockets, despite
`--tpt-proto=tcp`:

```
$ sudo lsof -p <subactor> | grep -iE 'TCP|IPv'
(empty)
$ sudo ss -tnp | grep <subactor>
(empty)
```

**The subactor never opened a TCP connection back to
the parent.**

## Diagnosis

The subactor reaches `_actor_child_main` →
`_trio_main(actor)` →
`run_as_asyncio_guest(trio_main)`. Code path
(`tractor.spawn._entry`):

```python
if infect_asyncio:
    actor._infected_aio = True
    run_as_asyncio_guest(trio_main)  # ← this branch
else:
    trio.run(trio_main)
```

`run_as_asyncio_guest` (`tractor.to_asyncio`):

```python
def run_as_asyncio_guest(trio_main, ...):
    async def aio_main(trio_main):
        loop = asyncio.get_running_loop()
        trio_done_fute = asyncio.Future()
        ...
        trio.lowlevel.start_guest_run(
            trio_main,
            run_sync_soon_threadsafe=loop.call_soon_threadsafe,
            done_callback=trio_done_callback,
        )
        out = await asyncio.shield(trio_done_fute)
        return out.unwrap()
    ...
    return asyncio.run(aio_main(trio_main))
```

Expected flow:
1. `asyncio.run(aio_main(...))` — boots fresh asyncio
   loop in calling thread.
2. `aio_main` calls `trio.lowlevel.start_guest_run(...)`
   — initializes trio's I/O manager, schedules first
   trio slice via `loop.call_soon_threadsafe`.
3. asyncio loop dispatches the callback → trio runs a
   slice → yields back via `call_soon_threadsafe`.
4. Trio's `async_main` (the user function) runs →
   `Channel.from_addr(parent_addr)` → TCP connect to
   parent.

What we observe instead:
- 2 threads in `epoll_wait` (one trio epoll, one
  asyncio epoll, both inactive)
- 6 unix-socket fds (3 socketpairs: trio
  wakeup-fd-pair, asyncio wakeup-fd-pair, trio kicker
  socketpair)
- ZERO TCP — `Channel.from_addr` never ran

Most likely cause: **trio's guest-run scheduling
callback didn't get dispatched by asyncio's loop in
the forked child**, so trio's `async_main` never
executes past trio bootstrap, and the
parent-IPC-connect step is never reached.

## Fork-survival risk surface (hypothesis)

`trio.lowlevel.start_guest_run` builds Python-level
closures + signal handlers + wakeup-fd registrations
that depend on:

- The asyncio event loop's `call_soon_threadsafe`
  thread-id matching the loop owner thread.
- Process-wide signal-wakeup-fd state
  (`signal.set_wakeup_fd`).
- Trio's `KIManager` SIGINT handler.

Under `main_thread_forkserver`, the fork happens from
a worker thread that has **never entered trio**
(intentional — trio-free launchpad). But the FORKED
child then tries to bring up BOTH asyncio AND
trio-as-guest fresh from this trio-free thread. The
asyncio loop boots fine; trio's `start_guest_run`
initializes BUT the cross-loop dispatch (asyncio
queue → trio slice) appears to silently fail to wire
up.

Two more hypotheses worth probing:

1. **Wakeup-fd contention**: asyncio installs
   `signal.set_wakeup_fd(<own_pair>)`. trio's
   guest-run also wants a wakeup-fd. Whoever installs
   second wins; the loser's `epoll_wait` no longer
   wakes on signals. Combined with the
   `asyncio.shield(trio_done_fute)` +
   `asyncio.CancelledError` handling in
   `run_as_asyncio_guest`, a missed signal
   delivery could explain the indefinite park.

2. **Trio kicker socketpair race**: trio's I/O manager
   uses an internal `socket.socketpair()` to "kick"
   itself out of `epoll_wait` when a non-IO task needs
   scheduling. In guest mode, the kicker is still
   present but is supposed to be triggered via the
   asyncio dispatch. If the kicker write never gets
   issued by asyncio's callback, trio's epoll never
   wakes.
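
Hypothesis 1's "second installer wins" behavior is easy to
show in isolation (a sketch: must run on the main thread,
and the fds must be non-blocking):

```python
# demo: signal.set_wakeup_fd is process-global and
# last-writer-wins; the displaced loop silently stops
# receiving signal wakeups
import signal
import socket

a, _ = socket.socketpair()
b, _ = socket.socketpair()
a.setblocking(False)
b.setblocking(False)

prev = signal.set_wakeup_fd(a.fileno())
assert prev == -1             # nothing was installed before
prev = signal.set_wakeup_fd(b.fileno())
assert prev == a.fileno()     # 'a' just lost its signal wakeups

signal.set_wakeup_fd(-1)      # restore default
```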

## Confirmed via py-spy (live capture)

After detaching `strace` (ptrace is exclusive — that's
why `py-spy` returns EPERM if strace is attached):

```
Thread <pid> (idle): "main-thread-forkserver[asyncio_actor]"
    select (selectors.py:452)  # asyncio epoll
    _run_once (asyncio/base_events.py:2012)
    run_forever (asyncio/base_events.py:683)
    run_until_complete (asyncio/base_events.py:712)
    run (asyncio/runners.py:118)
    run (asyncio/runners.py:195)
    run_as_asyncio_guest (tractor/to_asyncio.py:1770)
    _trio_main (tractor/spawn/_entry.py:160)
    _actor_child_main (tractor/_child.py:72)
    _child_target (tractor/spawn/_main_thread_forkserver.py:910)
    _worker (tractor/spawn/_main_thread_forkserver.py:605)
    [thread bootstrap]

Thread <pid+1> (idle): "Trio thread 14"
    get_events (trio/_core/_io_epoll.py:245)  # trio epoll
    get_events (trio/_core/_run.py:1678)
    capture (outcome/_impl.py:67)
    _handle_job (trio/_core/_thread_cache.py:173)
    _work (trio/_core/_thread_cache.py:196)
    [thread bootstrap]
```

This data **rewrites the diagnosis**: trio guest-run
isn't broken across the fork — it's working as designed.
The two threads ARE the canonical guest-run architecture:

1. **Asyncio main loop** runs in the lead thread. Parked
   in `selectors.EpollSelector.select(timeout=-1)` —
   waiting indefinitely for ANY callback to be queued.
2. **Trio's I/O manager** offloads `get_events`
   (`epoll_wait`) onto a `trio._core._thread_cache`
   worker thread. The worker calls
   `outcome.capture(get_events)` and parks in
   `epoll_wait(timeout=86400)`.
3. When trio I/O fires (or its kicker socketpair gets a
   write), the worker returns from `epoll_wait`,
   delivers the result via `_handle_job`'s `deliver`
   callback, which schedules the next trio slice on
   asyncio via `loop.call_soon_threadsafe`.

The fact that the trio thread is *already* in
`_thread_cache._handle_job` doing `capture(get_events)`
means **trio's scheduler HAS started** — the bridge
asyncio↔trio is wired correctly post-fork.

So `async_main` DID run far enough to register some
trio task that's now awaiting I/O. The question
becomes: **what is `async_main` waiting on?**

Process state confirms it's NOT waiting on the TCP
connect to parent:

```
$ sudo lsof -p <subactor> | grep -iE 'TCP|IPv'
(empty)
$ sudo ss -tnp | grep <subactor>
(empty)
```

`Channel.from_addr(parent_addr)` — the very first
thing `async_main` does — was never reached, OR was
reached but errored before `socket()` was called. The
parent (running `ipc_server.wait_for_peer`) waits
forever for the connection; it never comes.

## Refined hypothesis

`async_main` is stalled in some PRE-`Channel.from_addr`
checkpoint. Candidates:

1. **`get_console_log` / logger init** — called early in
   `_trio_main` if `actor.loglevel is not None`. Logging
   setup involves file/handler init that could block on
   something fork-inherited (e.g. a stale lock).
2. **`debug.maybe_init_greenback`** — `start_guest_run`
   includes a check (`if debug_mode(): assert 0` —
   currently asserts unsupported). For non-debug mode
   this is bypassed but related machinery may run.
3. **Stackscope SIGUSR1 handler install** — gated on
   `_debug_mode` OR `TRACTOR_ENABLE_STACKSCOPE` env-var.
   The `enable_stack_on_sig()` path captures a trio
   token via `trio.lowlevel.current_trio_token()` —
   could block under guest mode.
4. **Initial `await trio.sleep(0)` / first checkpoint**
   in `async_main` before reaching the
   `Channel.from_addr` line. Under guest mode, if the
   FIRST `call_soon_threadsafe` callback never gets
   processed by asyncio, trio's first slice never
   completes — but the worker thread WOULD still be in
   `epoll_wait` having been started by trio's I/O
   manager init.

## Confirming `async_main`'s parked location

Add temporary logging at the top of `Actor.async_main`:

```python
# tractor/runtime/_runtime.py around line 855
async def async_main(self, parent_addr=None):
    log.devx('async_main: ENTERED')  # marker A
    try:
        log.devx('async_main: pre-Channel.from_addr')  # marker B
        chan = await Channel.from_addr(
            addr=wrap_address(parent_addr)
        )
        log.devx('async_main: post-Channel.from_addr')  # marker C
        ...
```

Re-run the test with `--ll=devx`. The last marker logged
tells us exactly where `async_main` parked. If only A
fires, the issue is between A and B (logger init,
stackscope, etc.). If A and B fire but not C, it's in
`Channel.from_addr` (DNS, socket creation, connect).

## Related sibling bug

`tests/test_multi_program.py::test_register_duplicate_name`
hangs under the same backend with a DIFFERENT
fingerprint:

- Subactor at 100% CPU (busy-loop), not parked
- `recvfrom(6, "", 65536, 0, NULL, NULL) = 0` repeating
  with no `epoll_wait` in between
- fd=6 is one of trio's internal AF_UNIX
  socketpair fds (the kicker mechanism)

Distinct root cause — possibly trio's kicker socketpair
inheriting a half-closed state across the fork — but
shares the broader theme: **trio internal-state
initialization isn't fully fork-safe under
`main_thread_forkserver`** for the more exotic
dispatch paths.

## Workarounds (until fix lands)

1. **Skip-mark on the fork backend** — temporarily mark
   `tests/test_infected_asyncio.py` with
   `pytest.mark.skipon_spawn_backend('main_thread_forkserver',
   reason='infect_asyncio + fork interaction broken,
   see ai/conc-anal/infected_asyncio_under_main_thread_forkserver_hang_issue.md')`.
   Lets the rest of the test suite run green while
   this is being fixed properly.

2. **Run infected-asyncio tests under the `trio`
   backend only** — they don't exercise fork
   semantics, so they won't hit this bug.

## Investigation next steps

In rough priority:

1. Catch the hang alive again, **detach strace**,
   `py-spy --locals` the subactor — confirm trio
   thread is NOT yet at `async_main`.
2. Diff `start_guest_run` setup pre-fork vs post-fork
   by adding `log.devx()` markers in
   `tractor.to_asyncio.run_as_asyncio_guest::aio_main`
   at:
   - asyncio loop bringup
   - immediately before `start_guest_run`
   - immediately after `start_guest_run`
   - inside the `trio_done_callback` registration
3. Check whether the asyncio loop dispatches ANY
   callbacks in the forked child — instrument
   `loop.call_soon_threadsafe` (e.g. monkey-patch
   `loop._call_soon` to log); see the sketch below.
4. If steps 1–3 confirm that asyncio's queue is
   stuck, look at whether the asyncio event-loop
   policy or selector is being inherited from a
   pre-fork (parent-process) state in a way that
   breaks the new loop.
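
For step 3, a hedged instrumentation sketch; it wraps the
public `call_soon_threadsafe` rather than the private
`_call_soon`, and instance-attribute shadowing assumes the
pure-Python default loop:

```python
# log every cross-thread dispatch the forked child's loop
# receives; zero logged calls would confirm the "stuck
# queue" hypothesis
import asyncio
import functools
import logging

def instrument_loop(loop: asyncio.AbstractEventLoop) -> None:
    orig = loop.call_soon_threadsafe

    @functools.wraps(orig)
    def logged(callback, *args, **kwargs):
        logging.getLogger('aio-probe').info(
            'call_soon_threadsafe: %r', callback,
        )
        return orig(callback, *args, **kwargs)

    # shadow the bound method on this loop instance only
    loop.call_soon_threadsafe = logged  # type: ignore[method-assign]
```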

## See also

- [#379](https://github.com/goodboy/tractor/issues/379) — subint umbrella
- [#451](https://github.com/goodboy/tractor/issues/451) — Mode-A cancel-cascade hang
- `ai/conc-anal/fork_thread_semantics_execution_vs_memory.md`
- `ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md`
- python-trio/trio#1614 — trio + fork hazards

@@ -0,0 +1,142 @@
# Spawn-time boot-death (`rc=2`) under rapid same-name spawn against a registrar

## Symptom

Spawning N (≥4) sub-actors with the **same name** in tight
succession against a daemon registrar surfaces as
`ActorFailure: Sub-actor (...) died during boot (rc=2)
before completing parent-handshake`.

```
tests/discovery/test_multi_program.py
  ::test_dup_name_cancel_cascade_escalates_to_hard_kill[n_dups=4]
```

```
tractor._exceptions.ActorFailure:
  Sub-actor ('doggy', '<uuid>') died during boot (rc=2)
  before completing parent-handshake.
  proc: <_ForkedProc pid=<n> returncode=None>
```

The `proc` repr shows `returncode=None` because the repr is
captured before `proc.wait()` returns; the actual
`os.WEXITSTATUS == 2` is reported via `result['died']` in the
race-helper.

## When it surfaces

- N=2 (`n_dups=2`): **always passes**.
- N=4 (`n_dups=4`): **consistent fail** under both `tpt-proto=tcp`
  and `tpt-proto=uds`, MTF backend.
- N=8 (`n_dups=8`): **passes** (counter-intuitive — see "racing
  windows").
- Non-MTF backends: not yet exercised systematically.

## What previously masked it

Before the spawn-time `wait_for_peer_or_proc_death`
race-helper (in `tractor.spawn._spawn`) landed, the parent's
`start_actor` flow ended with a bare:

```python
event, chan = await ipc_server.wait_for_peer(uid)
```

That awaits an unsignalled `trio.Event` on `_peer_connected[uid]`.
If the sub-actor process **dies during boot** (before its
runtime executes the parent-callback handshake that sets the
event), the wait parks forever. The dead proc becomes a zombie
because no one ever calls `proc.wait()` to reap it.

In test contexts the failure presented as a hang or a much
later `trio.TooSlowError` from an outer `fail_after`. In
production it'd present as a parent that never makes progress
past `start_actor`. The death itself was silently masked.

## What surfaces it now

`tractor.spawn._spawn.wait_for_peer_or_proc_death` (used by
`_main_thread_forkserver_proc`) races the handshake-wait
against `proc.wait()`. The race-helper raises `ActorFailure`
on death-first instead of parking, exposing the rc=2.

## Hypothesis: registrar-side same-name contention

The test spawns N actors with name `doggy` sequentially:

```python
for i in range(n_dups):
    p: Portal = await an.start_actor('doggy')
    portals.append(p)
```

Each spawned doggy:

1. Forks via the forkserver.
2. Boots its runtime in `_actor_child_main`.
3. Connects back to the parent for handshake.
4. Connects to the daemon registrar to call `register_actor`.
5. Enters its RPC msg-loop.

Step (4) is where the same-name contention lives. The
registrar's `register_actor` (in
`tractor.discovery._registry`) accepts duplicate names
(stores `(name, uuid) -> addr`), but its internal bookkeeping
may have a non-trivial check (e.g. `wait_for_actor` resolution,
`_addrs2aids` map updates) that errors out under specific
ordering between the existing entry and the incoming one.

`rc=2` (`os.WEXITSTATUS == 2`) corresponds to an explicit
`sys.exit(2)`-style exit in the doggy process. An unhandled
Python exception exits with code 1, not 2, so code 2 points
at an explicit exit path (e.g. `argparse` errors use 2;
`SystemExit(2)`; CPython usage errors). So the doggy is
hitting an explicit exit path during
`register_actor` or just-after.
|
||||
|
||||
The non-monotonic shape (N=2 OK, N=4 BAD, N=8 OK) suggests a
specific timing window — likely "the 3rd register-RPC arrives
while the 1st-or-2nd is in some intermediate state". With
N=8, the additional procs widen the registration spread
enough that no two land in the conflicting window.

## Where to dig next

- Add per-actor logging in `_actor_child_main` and
  `register_actor` to surface the actual exception that
  triggers the rc=2 exit. Currently the doggy dies before
  the parent ever sees its stderr (forkserver doesn't
  marshal child stdio back).
- Race-test the registrar's `register_actor` /
  `unregister_actor` / `wait_for_actor` against same-name
  concurrent calls in isolation (no spawn) — a sketch of
  such a test follows this list.
- Consider whether `register_actor` should be idempotent
  under same-name re-register or should explicitly reject
  same-name (ideally with a clear `RemoteActorError`,
  not `sys.exit(2)`).
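
A sketch of such an isolation test, assuming the registrar
exposes a directly-awaitable `register_actor` (argument names
assumed for illustration — adapt to
`tractor.discovery._registry`'s actual API):

```python
import trio


async def test_same_name_register_race(registry) -> None:
    '''
    Hammer `register_actor` with N same-name concurrent
    calls; no process spawning involved, so any failure is
    purely registrar-side bookkeeping.

    '''
    async def _register(i: int) -> None:
        # signature assumed for illustration
        await registry.register_actor(
            name='doggy',
            uid=('doggy', f'fake-uuid-{i}'),
            addr=('127.0.0.1', 9000 + i),
        )

    async with trio.open_nursery() as tn:
        for i in range(4):
            tn.start_soon(_register, i)
```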

## Test-suite handling

Currently:

- `tests/discovery/test_multi_program.py
  ::test_dup_name_cancel_cascade_escalates_to_hard_kill[n_dups=4]`
  is `pytest.mark.xfail(strict=False, reason=...)` to keep
  the suite green while this issue is investigated.
- `n_dups=2` and `n_dups=8` continue to validate the
  cancel-cascade hard-kill escalation.

Once the underlying race is understood + fixed, drop the
xfail.

## Related work

- The cancel-cascade fix that introduced this regression
  test:
  `tractor/_exceptions.py:ActorTooSlowError`,
  `tractor/runtime/_supervise.py:_try_cancel_then_kill`,
  `tractor/runtime/_portal.py:Portal.cancel_actor(
  raise_on_timeout=...)`.
- The spawn-time death-detection that exposed this:
  `tractor/spawn/_spawn.py:wait_for_peer_or_proc_death`,
  used by `tractor/spawn/_main_thread_forkserver.py`.
@ -0,0 +1,161 @@

# `subint` backend: parent trio loop parks after subint teardown (Ctrl-C works; not a CPython-level issue)

Follow-up to the Phase B subint spawn-backend PR (see
`tractor.spawn._subint`, issue #379). Distinct from
`subint_sigint_starvation_issue.md` (the SIGINT-unresponsive
starvation hang): this one is **Ctrl-C-able**, which means
it's *not* the shared-GIL-hostage class and is ours to fix
from inside tractor rather than waiting on upstream CPython
/ msgspec progress.

## TL;DR

After a stuck-subint subactor is torn down via the
hard-kill path, a parent-side trio task parks on an
*orphaned resource* (most likely a `chan.recv()` /
`process_messages` loop on the now-dead subint's IPC
channel) and waits forever for bytes that can't arrive —
because the channel was torn down without emitting a clean
EOF/`BrokenResourceError` to the waiting receiver.

Unlike `subint_sigint_starvation_issue.md`, the main trio
loop **is** iterating normally — SIGINT delivers cleanly
and the test unhangs. But absent Ctrl-C, the test suite
wedges indefinitely.

## Symptom

Running `test_subint_non_checkpointing_child` under
`--spawn-backend=subint` (in
`tests/test_subint_cancellation.py`):

1. Test spawns a subactor whose main task runs
   `threading.Event.wait(1.0)` in a loop — releases the
   GIL but never inserts a trio checkpoint.
2. Parent does `an.cancel_scope.cancel()`. Our
   `subint_proc` cancel path fires: soft-kill sends
   `Portal.cancel_actor()` over the live IPC channel →
   subint's trio loop *should* process the cancel msg on
   its IPC dispatcher task (since the GIL releases are
   happening).
3. Expected: subint's `trio.run()` unwinds, driver thread
   exits naturally, parent returns.
4. Actual: parent `trio.run()` never completes. Test
   hangs past its `trio.fail_after()` deadline.

## Evidence

### `strace` on the hung pytest process during SIGINT

```
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
write(17, "\2", 1) = 1
```

Contrast with the SIGINT-starvation hang (see
`subint_sigint_starvation_issue.md`) where that same
`write()` returned `EAGAIN`. Here the SIGINT byte is
written successfully → Python's signal handler pipe is
being drained → main trio loop **is** iterating → SIGINT
gets turned into `trio.Cancelled` → the test unhangs (if
the operator happens to be there to hit Ctrl-C).

### Stack dump (via `tractor.devx.dump_on_hang`)

Single main thread visible, parked in
`trio._core._io_epoll.get_events` inside `trio.run` at the
test's `trio.run(...)` call site. No subint driver thread
(subint was destroyed successfully — this is *after* the
hard-kill path, not during it).

## Root cause hypothesis

Most consistent with the evidence: a parent-side trio
task is awaiting a `chan.recv()` / `process_messages` loop
on the dead subint's IPC channel. The sequence:

1. Soft-kill in `subint_proc` sends `Portal.cancel_actor()`
   over the channel. The subint's trio dispatcher *may* or
   may not have processed the cancel msg before the subint
   was destroyed — timing-dependent.
2. Hard-kill timeout fires (because the subint's main
   task was in `threading.Event.wait()` with no trio
   checkpoint — cancel-msg processing couldn't race the
   timeout).
3. Driver thread abandoned, `_interpreters.destroy()`
   runs. Subint is gone.
4. But the parent-side trio task holding a
   `chan.recv()` / `process_messages` loop against that
   channel was **not** explicitly cancelled. The channel's
   underlying socket got torn down, but without a clean
   EOF delivered to the waiting recv, the task parks
   forever on `trio.lowlevel.wait_readable` (or similar).

This matches the "main loop fine, task parked on
orphaned I/O" signature.

## Why this is ours to fix (not CPython's)

- Main trio loop iterates normally → GIL isn't starved.
- SIGINT is deliverable → not a signal-pipe-full /
  wakeup-fd contention scenario.
- The hang is in *our* supervision code, specifically in
  how `subint_proc` tears down its side of the IPC when
  the subint is abandoned/destroyed.

## Possible fix directions

1. **Explicit parent-side channel abort on subint
   abandon.** In `subint_proc`'s teardown block, after the
   hard-kill timeout fires, explicitly close the parent's
   end of the IPC channel to the subint. Any waiting
   `chan.recv()` / `process_messages` task sees
   `BrokenResourceError` (or `ClosedResourceError`) and
   unwinds (see the demo below).
2. **Cancel parent-side RPC tasks tied to the dead
   subint's channel.** The `Actor._rpc_tasks` / nursery
   machinery should have a handle on any
   `process_messages` loops bound to a specific peer
   channel. Iterate those and cancel explicitly.
3. **Bound the top-level `await actor_nursery
   ._join_procs.wait()` shield in `subint_proc`** (same
   pattern as the other bounded shields the hard-kill
   patch added). If the nursery never sets `_join_procs`
   because a child task is parked, the bound would at
   least let the teardown proceed.

Of these, (1) is the most surgical and directly addresses
the root cause. (2) is a defense-in-depth companion. (3)
is a band-aid but cheap to add.
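
A self-contained demo of the unwind that fix (1) relies on —
closing a stream end from another task wakes a parked
receiver with `trio.ClosedResourceError` (`trio.testing`
memory streams stand in for the real IPC channel here):

```python
import trio
from trio.testing import memory_stream_pair


async def main() -> None:
    parent_end, child_end = memory_stream_pair()

    async def parked_recv() -> None:
        try:
            # parks: no bytes and no EOF will ever arrive
            await parent_end.receive_some()
        except trio.ClosedResourceError:
            print('parked recv unwound cleanly')

    async with trio.open_nursery() as tn:
        tn.start_soon(parked_recv)
        await trio.sleep(0.1)  # let the recv park
        # fix direction (1): explicitly close our own end
        await parent_end.aclose()


trio.run(main)
```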

## Current workaround

None in-tree. The test's `trio.fail_after()` bound
currently fires and raises `TooSlowError`, so the test
visibly **fails** rather than hangs — which is
intentional (an unbounded cancellation-audit test would
defeat itself). But in interactive test runs the operator
has to hit Ctrl-C to move past the parked state before
pytest reports the failure.

## Reproducer

```
./py314/bin/python -m pytest \
  tests/test_subint_cancellation.py::test_subint_non_checkpointing_child \
  --spawn-backend=subint --tb=short --no-header -v
```

Expected: hangs until `trio.fail_after(15)` fires, or
Ctrl-C unwedges it manually.

## References

- `tractor.spawn._subint.subint_proc` — current subint
  teardown code; see the `_HARD_KILL_TIMEOUT` bounded
  shields + `daemon=True` driver-thread abandonment
  (commit `b025c982`).
- `ai/conc-anal/subint_sigint_starvation_issue.md` — the
  sibling CPython-level hang (GIL-starvation,
  SIGINT-unresponsive) which is **not** this issue.
- Phase B tracking: issue #379.
@ -0,0 +1,337 @@

# `os.fork()` from a non-main sub-interpreter aborts the child (CPython refuses post-fork cleanup)

Third `subint`-class analysis in this project. Unlike its
two siblings (`subint_sigint_starvation_issue.md`,
`subint_cancel_delivery_hang_issue.md`), this one is not a
hang — it's a **hard CPython-level refusal** of an
experimental spawn strategy we wanted to try.

## TL;DR

An in-process sub-interpreter cannot be used as a
"launchpad" for `os.fork()` on current CPython. The fork
syscall succeeds in the parent, but the forked CHILD
process is aborted immediately by CPython's post-fork
cleanup with:

```
Fatal Python error: _PyInterpreterState_DeleteExceptMain: not main interpreter
```

This is enforced by a hard `PyStatus_ERR` gate in
`Python/pystate.c`. The CPython devs acknowledge the
fragility with an in-source comment (`// Ideally we could
guarantee tstate is running main.`) but provide no
mechanism to satisfy the precondition from user code.

**Implication for tractor**: the `subint_fork` backend
sketched in `tractor.spawn._subint_fork` is structurally
dead on current CPython. The submodule is kept as
documentation of the attempt; `--spawn-backend=subint_fork`
raises `NotImplementedError` pointing here.

## Context — why we tried this

The motivation is issue #379's "Our own thoughts, ideas
for `fork()`-workaround/hacks..." section. The existing
trio-backend (`tractor.spawn._trio.trio_proc`) spawns
subactors via `trio.lowlevel.open_process()` → ultimately
`posix_spawn()` or `fork+exec`, from the parent's main
interpreter that is currently running `trio.run()`. This
brushes against a known-fragile interaction between
`trio` and `fork()` tracked in
[python-trio/trio#1614](https://github.com/python-trio/trio/issues/1614)
and siblings — mitigated in `tractor`'s case only
incidentally (we `exec()` immediately post-fork).

The idea was:

1. Create a subint that has *never* imported `trio`.
2. From a worker thread in that subint, call `os.fork()`.
3. In the child, `execv()` back into
   `python -m tractor._child` — same as `trio_proc` does.
4. The fork is from a trio-free context → trio+fork
   hazards avoided regardless of downstream behavior
   (sketched below).
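
A conceptual sketch of that launchpad (simplified — the real
prototype's `tractor._child` argv handling is elided and the
function name is illustrative):

```python
import threading
import _interpreters  # private stdlib API, py3.13+


def spawn_via_subint_launchpad(child_argv: list[str]) -> None:
    # (1) a subint that has never imported trio
    interp_id = _interpreters.create('legacy')

    # (2) + (3): fork from inside the subint, exec in the child
    code: str = (
        'import os, sys\n'
        'pid = os.fork()\n'
        'if pid == 0:\n'
        f'    os.execv(sys.executable, {child_argv!r})\n'
    )
    t = threading.Thread(
        target=_interpreters.exec,
        args=(interp_id, code),
        name='subint-fork-launchpad',
        daemon=True,
    )
    t.start()
    t.join()
```

On current CPython the child aborts inside
`PyOS_AfterFork_Child()` before the `execv()` line ever runs —
the source walkthrough below shows exactly where.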

The parent-side orchestration (`ipc_server.wait_for_peer`,
`SpawnSpec`, `Portal` yield) would reuse
`trio_proc`'s flow verbatim, with only the subproc-spawn
mechanics swapped.

## Symptom

Running the prototype (`tractor.spawn._subint_fork.subint_fork_proc`,
see git history prior to the stub revert) on py3.14:

```
Fatal Python error: _PyInterpreterState_DeleteExceptMain: not main interpreter
Python runtime state: initialized

Current thread 0x00007f6b71a456c0 [subint-fork-lau] (most recent call first):
  File "<script>", line 2 in <module>
<script>:2: DeprecationWarning: This process (pid=802985) is multi-threaded, use of fork() may lead to deadlocks in the child.
```

Key clues:

- The **`DeprecationWarning`** fires in the parent (before
  fork completes) — fork *is* executing, we get that far.
- The **`Fatal Python error`** comes from the child — it
  aborts during CPython's post-fork C initialization
  before any user Python runs in the child.
- The thread name `subint-fork-lau[nchpad]` is ours —
  confirms the fork is being called from the launchpad
  subint's driver thread.

## CPython source walkthrough

### Call site — `Modules/posixmodule.c:728-793`

The post-fork-child hook CPython runs in the child process:

```c
void
PyOS_AfterFork_Child(void)
{
    PyStatus status;
    _PyRuntimeState *runtime = &_PyRuntime;

    // re-creates runtime->interpreters.mutex (HEAD_UNLOCK)
    status = _PyRuntimeState_ReInitThreads(runtime);
    ...

    PyThreadState *tstate = _PyThreadState_GET();
    _Py_EnsureTstateNotNULL(tstate);

    ...

    // Ideally we could guarantee tstate is running main.  ← !!!
    _PyInterpreterState_ReinitRunningMain(tstate);

    status = _PyEval_ReInitThreads(tstate);
    ...

    status = _PyInterpreterState_DeleteExceptMain(runtime);
    if (_PyStatus_EXCEPTION(status)) {
        goto fatal_error;
    }
    ...

fatal_error:
    Py_ExitStatusException(status);
}
```

The `// Ideally we could guarantee tstate is running
main.` comment is a flashing warning sign — the CPython
devs *know* this path is fragile when fork is called from
a non-main subint, but they've chosen to abort rather than
silently corrupt state. Arguably the right call.

### The refusal — `Python/pystate.c:1035-1075`

```c
/*
 * Delete all interpreter states except the main interpreter.  If there
 * is a current interpreter state, it *must* be the main interpreter.
 */
PyStatus
_PyInterpreterState_DeleteExceptMain(_PyRuntimeState *runtime)
{
    struct pyinterpreters *interpreters = &runtime->interpreters;

    PyThreadState *tstate = _PyThreadState_Swap(runtime, NULL);
    if (tstate != NULL && tstate->interp != interpreters->main) {
        return _PyStatus_ERR("not main interpreter");  // ← our error
    }

    HEAD_LOCK(runtime);
    PyInterpreterState *interp = interpreters->head;
    interpreters->head = NULL;
    while (interp != NULL) {
        if (interp == interpreters->main) {
            interpreters->main->next = NULL;
            interpreters->head = interp;
            interp = interp->next;
            continue;
        }

        // XXX Won't this fail since PyInterpreterState_Clear() requires
        // the "current" tstate to be set?
        PyInterpreterState_Clear(interp);  // XXX must activate?
        zapthreads(interp);
        ...
    }
    ...
}
```

The comment atop the function (`If there is a current
interpreter state, it *must* be the main interpreter.`) is
the formal API contract. The `XXX` comments further in
suggest the CPython team is already aware this function
has latent issues even in the happy path.

## Chain summary

1. Our launchpad subint's driver OS-thread calls
   `os.fork()`.
2. `fork()` succeeds. Child wakes up with:
   - The parent's full memory image (including all
     subints).
   - Only the *calling* thread alive (the driver thread).
   - `_PyThreadState_GET()` on that thread returns the
     **launchpad subint's tstate**, *not* main's.
3. CPython runs `PyOS_AfterFork_Child()`.
4. It reaches `_PyInterpreterState_DeleteExceptMain()`.
5. Gate check fails: `tstate->interp != interpreters->main`.
6. `PyStatus_ERR("not main interpreter")` → `fatal_error`
   goto → `Py_ExitStatusException()` → child aborts.

Parent-side consequence: `os.fork()` in the subint
bootstrap returned successfully with the child's PID, but
the child died before connecting back. Our parent's
`ipc_server.wait_for_peer(uid)` would hang forever — the
child never gets to `_actor_child_main`.

## Definitive answer to "Open Question 1"

From the (now-stub) `subint_fork_proc` docstring:

> Does CPython allow `os.fork()` from a non-main
> sub-interpreter under the legacy config?

**No.** Not in a usable-by-user-code sense. The fork
syscall is not blocked, but the child cannot survive
CPython's post-fork initialization. This is enforced, not
accidental, and the CPython devs have acknowledged the
fragility in-source.

## What we'd need from CPython to unblock

Any one of these, from least-to-most invasive:

1. **A pre-fork hook mechanism** that lets user code (or
   tractor itself via `os.register_at_fork(before=...)`)
   swap the current tstate to main before fork runs. The
   swap would need to work across the subint→main
   boundary, which is the actual hard part —
   `_PyThreadState_Swap()` exists but is internal. (See
   the `register_at_fork` note after this list.)

2. **A `_PyInterpreterState_DeleteExceptFor(tstate->interp)`
   variant** that cleans up all *other* subints while
   preserving the calling subint's state. Lets the child
   continue executing in the subint after fork; a
   subsequent `execv()` clears everything at the OS
   level anyway.

3. **A cleaner error** than `Fatal Python error` aborting
   the child. Even without fixing the underlying
   capability, a raised Python-level exception in the
   parent's `fork()` call (rather than a silent child
   abort) would at least make the failure mode
   debuggable.
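
For concreteness on option (1): the hook mechanism that
exists today, `os.register_at_fork`, runs Python-level
callbacks inside the *current* interpreter and has no way to
swap the C-level tstate to main — which is the precondition
`PyOS_AfterFork_Child` actually needs:

```python
import os

# what exists today: Python-level fork hooks only
os.register_at_fork(
    before=lambda: print('pre-fork, parent, current interp'),
    # when fork is called from a subint, the child aborts in
    # C-level cleanup before any `after_in_child` callback
    # (or any other Python) gets to run:
    after_in_child=lambda: print('never reached from a subint fork'),
)
```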

## Upstream-report draft (for CPython issue tracker)

### Title

> `os.fork()` from a non-main sub-interpreter aborts the
> child with a fatal error in `PyOS_AfterFork_Child`; can
> we at least make it a clean `RuntimeError` in the
> parent?

### Body

> **Version**: Python 3.14.x
>
> **Summary**: Calling `os.fork()` from a thread currently
> executing inside a sub-interpreter causes the forked
> child process to abort during CPython's post-fork
> cleanup, with the following output in the child:
>
> ```
> Fatal Python error: _PyInterpreterState_DeleteExceptMain: not main interpreter
> ```
>
> From the **parent's** point of view the fork succeeded
> (returned a valid child PID). The failure is completely
> opaque to parent-side Python code — unless the parent
> does `os.waitpid()` it won't even notice the child
> died.
>
> **Root cause** (as I understand it from reading sources):
> `Modules/posixmodule.c::PyOS_AfterFork_Child()` calls
> `_PyInterpreterState_DeleteExceptMain()` with a
> precondition that `_PyThreadState_GET()->interp` be the
> main interpreter. When `fork()` is called from a thread
> executing inside a subinterpreter, the child wakes up
> with its tstate still pointing at the subint, and the
> gate in `Python/pystate.c:1044-1047` fails.
>
> A comment in the source
> (`Modules/posixmodule.c:753` — `// Ideally we could
> guarantee tstate is running main.`) suggests this is a
> known-fragile path rather than an intentional
> invariant.
>
> **Use case**: I was experimenting with using a
> sub-interpreter as a "fork launchpad" — have a subint
> that has never imported `trio`, call `os.fork()` from
> that subint's thread, and in the child `execv()` back
> into a fresh Python interpreter process. The goal was
> to sidestep known issues with `trio` + `fork()`
> interaction (see
> [python-trio/trio#1614](https://github.com/python-trio/trio/issues/1614))
> by guaranteeing the forking context had never been
> "contaminated" by trio's imports or globals. This
> approach would allow `trio`-using applications to
> combine `fork`-based subprocess spawning with
> per-worker `trio.run()` runtimes — a fairly common
> pattern that currently requires workarounds.
>
> **Request**:
>
> Ideally: make fork-from-subint work (e.g., by swapping
> the caller's tstate to main in the pre-fork hook), or
> provide a `_PyInterpreterState_DeleteExceptFor(interp)`
> variant that permits the caller's subint to survive
> post-fork so user code can subsequently `execv()`.
>
> Minimally: convert the fatal child-side abort into a
> clean `RuntimeError` (or similar) raised in the
> parent's `fork()` call. Even if the capability isn't
> expanded, the failure mode should be debuggable by
> user-code in the parent — right now it's a silent
> child death with an error message buried in the
> child's stderr that parent code can't programmatically
> see.
>
> **Related**: PEP 684 (per-interpreter GIL), PEP 734
> (`concurrent.interpreters` public API). The private
> `_interpreters` module is what I used to create the
> launchpad — behavior is the same whether using
> `_interpreters.create('legacy')` or
> `concurrent.interpreters.create()` (the latter was not
> tested but the gate is identical).
>
> Happy to contribute a minimal reproducer + test case if
> this is something the team wants to pursue.

## References

- `Modules/posixmodule.c:728` —
  [`PyOS_AfterFork_Child`](https://github.com/python/cpython/blob/main/Modules/posixmodule.c#L728)
- `Python/pystate.c:1040` —
  [`_PyInterpreterState_DeleteExceptMain`](https://github.com/python/cpython/blob/main/Python/pystate.c#L1040)
- PEP 684 (per-interpreter GIL):
  <https://peps.python.org/pep-0684/>
- PEP 734 (`concurrent.interpreters` public API):
  <https://peps.python.org/pep-0734/>
- [python-trio/trio#1614](https://github.com/python-trio/trio/issues/1614)
  — the original motivation for the launchpad idea.
- tractor issue #379 — "Our own thoughts, ideas for
  `fork()`-workaround/hacks..." section where this was
  first sketched.
- `tractor.spawn._subint_fork` — in-tree stub preserving
  the attempted impl's shape in git history.
@ -0,0 +1,375 @@

#!/usr/bin/env python3
'''
Standalone CPython-level feasibility check for the "main-interp
worker-thread forkserver + subint-hosted trio" architecture
proposed as a workaround to the CPython-level refusal
documented in
`ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md`.

Purpose
-------
Deliberately NOT a `tractor` test. Originally zero `tractor`
imports; the validated primitives have since moved in-tree
(see the import note below), but pass/fail here remains a
property of CPython alone — we use `_interpreters` (private
stdlib) + `os.fork()` directly so the signal is unambiguous.

Run each scenario in isolation; the child's fate is observable
only via the parent's `os.waitpid()` and the scenario's own
status prints.

Scenarios (pick one with `--scenario <name>`)
---------------------------------------------

- `control_subint_thread_fork` — the KNOWN-BROKEN case we
  documented in `subint_fork_blocked_by_cpython_post_fork_issue.md`:
  drive a subint from a thread, call `os.fork()` inside its
  `_interpreters.exec()`, watch the child abort. **Included as
  a control** — if this scenario DOESN'T abort the child, our
  analysis is wrong and we should re-check everything.

- `main_thread_fork` — baseline sanity. Call `os.fork()` from
  the process's main thread. Must always succeed; if this
  fails something much bigger is broken.

- `worker_thread_fork` — the architectural assertion. Spawn a
  regular `threading.Thread` (attached to main interp, NOT a
  subint), have IT call `os.fork()`. Child should survive
  post-fork cleanup.

- `full_architecture` — end-to-end: main-interp worker thread
  forks. In the child, the fork-thread (still main-interp)
  creates a subint, drives a second worker thread inside it
  that runs a trivial `trio.run()`. Validates the "root
  runtime lives in a subint in the child" piece of the
  proposed arch.

All scenarios print a self-contained pass/fail banner. Exit
code 0 on expected outcome (which for `control_*` means "child
aborted", not "child succeeded"!).

Requires Python 3.14+.

Usage
-----
::

    python subint_fork_from_main_thread_smoketest.py \\
        --scenario main_thread_fork

    python subint_fork_from_main_thread_smoketest.py \\
        --scenario full_architecture

'''
from __future__ import annotations

import argparse
from collections.abc import Callable  # for the SCENARIOS table
import os
import sys
import threading
import time


# Hard-require py3.14 for the public `concurrent.interpreters`
# API (we still drop to `_interpreters` internally, same as
# `tractor.spawn._subint`).
try:
    from concurrent import interpreters as _public_interpreters  # noqa: F401
    import _interpreters  # type: ignore
except ImportError:
    print(
        'FAIL (setup): requires Python 3.14+ '
        '(missing `concurrent.interpreters`)',
        file=sys.stderr,
    )
    sys.exit(2)


# The actual primitives this script exercises live in
# `tractor.spawn._main_thread_forkserver` /
# `tractor.spawn._subint_forkserver` — we re-import them here
# rather than inlining so the modules and the validation stay
# in sync. (Early versions of this file had them inline for
# the "zero tractor imports" isolation guarantee; now that
# CPython-level feasibility is confirmed, the validated
# primitives have moved into tractor proper.)
from tractor.spawn._main_thread_forkserver import (
    fork_from_worker_thread,
    wait_child,
)
from tractor.spawn._subint_forkserver import (
    run_subint_in_worker_thread,
)


# ----------------------------------------------------------------
# small observability helpers (test-harness only)
# ----------------------------------------------------------------


def _banner(title: str) -> None:
    line = '=' * 60
    print(f'\n{line}\n{title}\n{line}', flush=True)


def _report(
    label: str,
    *,
    ok: bool,
    status_str: str,
    expect_exit_ok: bool,
) -> None:
    verdict: str = 'PASS' if ok else 'FAIL'
    expected_str: str = (
        'normal exit (rc=0)'
        if expect_exit_ok
        else 'abnormal death (signal or nonzero exit)'
    )
    print(
        f'[{verdict}] {label}: '
        f'expected {expected_str}; observed {status_str}',
        flush=True,
    )


# ----------------------------------------------------------------
# scenario: `control_subint_thread_fork` (known-broken)
# ----------------------------------------------------------------


def scenario_control_subint_thread_fork() -> int:
    _banner(
        '[control] fork from INSIDE a subint (expected: child aborts)'
    )
    interp_id = _interpreters.create('legacy')
    print(f'  created subint {interp_id}', flush=True)

    # Shared flag: child writes a sentinel file we can detect from
    # the parent. If the child manages to write this, CPython's
    # post-fork refusal is NOT happening → analysis is wrong.
    sentinel = '/tmp/subint_fork_smoketest_control_child_ran'
    try:
        os.unlink(sentinel)
    except FileNotFoundError:
        pass

    bootstrap = (
        'import os\n'
        'pid = os.fork()\n'
        'if pid == 0:\n'
        # child — if CPython's refusal fires this code never runs
        f'    with open({sentinel!r}, "w") as f:\n'
        '        f.write("ran")\n'
        '    os._exit(0)\n'
        'else:\n'
        # parent side (inside the launchpad subint) — stash the
        # forked PID on a shareable dict so we can waitpid()
        # from the outer main interp. We can't just return it;
        # _interpreters.exec() returns nothing useful.
        '    import builtins\n'
        '    builtins._forked_child_pid = pid\n'
    )

    # NOTE, we can't easily pull state back from the subint.
    # For the CONTROL scenario we just time-bound the fork +
    # check the sentinel. If sentinel exists → child ran →
    # analysis wrong. If not → child aborted → analysis
    # confirmed.
    done = threading.Event()

    def _drive() -> None:
        try:
            _interpreters.exec(interp_id, bootstrap)
        except Exception as err:
            print(
                f'  subint bootstrap raised (expected on some '
                f'CPython versions): {type(err).__name__}: {err}',
                flush=True,
            )
        finally:
            done.set()

    t = threading.Thread(
        target=_drive,
        name='control-subint-fork-launchpad',
        daemon=True,
    )
    t.start()
    done.wait(timeout=5.0)
    t.join(timeout=2.0)

    # Give the (possibly-aborted) child a moment to die.
    time.sleep(0.5)

    sentinel_present = os.path.exists(sentinel)
    verdict = (
        # "PASS" for our analysis means sentinel NOT present.
        'PASS' if not sentinel_present else 'FAIL (UNEXPECTED)'
    )
    print(
        f'[{verdict}] control: sentinel present={sentinel_present} '
        f'(analysis predicts False — child should abort before '
        f'writing)',
        flush=True,
    )
    if sentinel_present:
        os.unlink(sentinel)

    try:
        _interpreters.destroy(interp_id)
    except _interpreters.InterpreterError:
        pass

    return 0 if not sentinel_present else 1


# ----------------------------------------------------------------
# scenario: `main_thread_fork` (baseline sanity)
# ----------------------------------------------------------------


def scenario_main_thread_fork() -> int:
    _banner(
        '[baseline] fork from MAIN thread (expected: child exits normally)'
    )

    pid = os.fork()
    if pid == 0:
        os._exit(0)

    # use the same `wait_child` + `_report` convention as the
    # worker-thread scenarios (the original `_wait_child(...)`
    # call here referenced an undefined name)
    ok, status_str = wait_child(pid, expect_exit_ok=True)
    _report(
        'main_thread_fork',
        ok=ok,
        status_str=status_str,
        expect_exit_ok=True,
    )
    return 0 if ok else 1


# ----------------------------------------------------------------
# scenario: `worker_thread_fork` (architectural assertion)
# ----------------------------------------------------------------


def _run_worker_thread_fork_scenario(
    label: str,
    *,
    child_target=None,
) -> int:
    '''
    Thin wrapper: delegate the actual fork to the
    `tractor.spawn._main_thread_forkserver` primitive, then wait
    on the child and render a pass/fail banner.

    '''
    try:
        pid: int = fork_from_worker_thread(
            child_target=child_target,
            thread_name=f'worker-fork-thread[{label}]',
        )
    except RuntimeError as err:
        print(f'[FAIL] {label}: {err}', flush=True)
        return 1
    print(f'  forked child pid={pid}', flush=True)
    ok, status_str = wait_child(pid, expect_exit_ok=True)
    _report(
        label,
        ok=ok,
        status_str=status_str,
        expect_exit_ok=True,
    )
    return 0 if ok else 1


def scenario_worker_thread_fork() -> int:
    _banner(
        '[arch] fork from MAIN-INTERP WORKER thread '
        '(expected: child exits normally — this is the one '
        'that matters)'
    )
    return _run_worker_thread_fork_scenario(
        'worker_thread_fork',
    )


# ----------------------------------------------------------------
# scenario: `full_architecture`
# ----------------------------------------------------------------


_CHILD_TRIO_BOOTSTRAP: str = (
    'import trio\n'
    'async def _main():\n'
    '    await trio.sleep(0.05)\n'
    '    return 42\n'
    'result = trio.run(_main)\n'
    'assert result == 42, f"trio.run returned {result}"\n'
    'print("  CHILD subint: trio.run OK, result=42", '
    'flush=True)\n'
)


def _child_trio_in_subint() -> int:
    '''
    CHILD-side `child_target`: drive a trivial `trio.run()`
    inside a fresh legacy-config subint on a worker thread,
    using the `tractor.spawn._subint_forkserver.run_subint_in_worker_thread`
    primitive. Returns 0 on success.

    '''
    try:
        run_subint_in_worker_thread(
            _CHILD_TRIO_BOOTSTRAP,
            thread_name='child-subint-trio-thread',
        )
    except RuntimeError as err:
        print(
            f'  CHILD: run_subint_in_worker_thread timed out / thread '
            f'never returned: {err}',
            flush=True,
        )
        return 3
    except BaseException as err:
        print(
            f'  CHILD: subint bootstrap raised: '
            f'{type(err).__name__}: {err}',
            flush=True,
        )
        return 4
    return 0


def scenario_full_architecture() -> int:
    _banner(
        '[arch-full] worker-thread fork + child runs trio in a '
        'subint (end-to-end proposed arch)'
    )
    return _run_worker_thread_fork_scenario(
        'full_architecture',
        child_target=_child_trio_in_subint,
    )


# ----------------------------------------------------------------
# main
# ----------------------------------------------------------------


SCENARIOS: dict[str, Callable[[], int]] = {
    'control_subint_thread_fork': scenario_control_subint_thread_fork,
    'main_thread_fork': scenario_main_thread_fork,
    'worker_thread_fork': scenario_worker_thread_fork,
    'full_architecture': scenario_full_architecture,
}


def main() -> int:
    ap = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument(
        '--scenario',
        choices=sorted(SCENARIOS.keys()),
        required=True,
    )
    args = ap.parse_args()
    return SCENARIOS[args.scenario]()


if __name__ == '__main__':
    sys.exit(main())
@ -0,0 +1,187 @@

# `subint_forkserver` × `multiprocessing.SharedMemory`: fork-inherited `resource_tracker` fd

Surfaced by `tests/test_shm.py` under
`--spawn-backend=subint_forkserver`. Two distinct
failure modes, one root cause:
**`multiprocessing.resource_tracker` is fork-without-exec
unsafe** (canonical CPython class — bpo-38119, bpo-45209).

**Status: resolved by `tractor/ipc/_mp_bs.py` +
`tractor/ipc/_shm.py` changes (see "Resolution" below).
This doc kept as the post-mortem / decision record.**

## TL;DR

`mp.shared_memory.SharedMemory` registers each shm
allocation with the per-process
`multiprocessing.resource_tracker` singleton. The
tracker is a daemon process started lazily; the
parent owns a unix-pipe-fd to it. When the parent
forks-without-execing into a `subint_forkserver`
child, the child inherits that fd — but it refers to
the *parent's* tracker, which the child has no
business writing to.

Two manifestations under the original (pre-fix) code:

1. **`test_child_attaches_alot`** — child loops 1000×
   `attach_shm_list()`. First `mp.SharedMemory` call
   in the child triggers
   `resource_tracker._ensure_running_and_write` →
   `_teardown_dead_process` → `os.close(self._fd)` on
   an fd the child should never have touched. Surfaces
   as `OSError: [Errno 9] Bad file descriptor`
   wrapped in `tractor.RemoteActorError`.

2. **`test_parent_writer_child_reader[*]`** — first
   parametrize variant "passes" (with a
   `resource_tracker: leaked shared_memory` warning)
   because nobody ever cleans up `/shm_list`.
   Subsequent variants then fail with
   `FileExistsError: '/shm_list'` because the leak
   persists across the parametrize loop and forkserver
   children can't `shm_open(create=True)` an existing
   key.

The trio backend (`mp_spawn`-style) doesn't surface this:
each subactor `exec`s a fresh interpreter →
independent resource tracker per subactor → no
inherited-fd issue, and the test's pre-existing leak
gets masked by the per-process tracker reset.

Under `subint_forkserver`, the child is `os.fork()`'d
from a worker thread (no `exec`) → inherits parent's
`mp.resource_tracker._resource_tracker._fd` → EBADF
/ cross-talk on first `mp.SharedMemory` op.

## Resolution

We side-step the broken upstream machinery entirely
rather than try to make it fork-safe. Two-part fix
landed (commits to follow this doc):

### 1. `tractor/ipc/_mp_bs.py::disable_mantracker()` — unconditional disable

The previous "3.13+ short-circuit" path used
`partial(SharedMemory, track=False)` to opt out of
registration on 3.13+. The `track=False` switch is
necessary but not sufficient under fork: the
inherited tracker fd can still be touched indirectly
(e.g. through `_ensure_running_and_write`'s
self-check path).

The fix takes both belts AND suspenders:

- **Always** monkey-patch
  `mp.resource_tracker._resource_tracker` to a
  no-op `ManTracker` subclass whose
  `register`/`unregister`/`ensure_running` are all
  empty.
- **Always** wrap `SharedMemory` with
  `track=False`.

Result: the inherited tracker fd in the fork child
is still inherited (an fd is a kernel object; we can't
un-inherit it across fork) but **nothing in the
shm code path will ever try to use it** — both the
tracker singleton and the per-allocation registration
are short-circuited.
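
A sketch of the shape (assumes the in-tree helper's general
structure — the actual `_mp_bs.py` code may differ in
detail):

```python
from functools import partial
from multiprocessing import resource_tracker as mantracker
from multiprocessing.shared_memory import SharedMemory


class ManTracker(mantracker.ResourceTracker):
    '''
    No-op every tracker entrypoint so the fork-inherited
    pipe fd is never touched from the shm code path.

    '''
    def register(self, name, rtype):
        pass

    def unregister(self, name, rtype):
        pass

    def ensure_running(self):
        pass


def disable_mantracker():
    tracker = ManTracker()
    # belt: swap the singleton AND the module-level aliases
    mantracker._resource_tracker = tracker
    mantracker.register = tracker.register
    mantracker.unregister = tracker.unregister
    mantracker.ensure_running = tracker.ensure_running
    # suspenders: opt out per-allocation too (3.13+)
    return partial(SharedMemory, track=False)
```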

### 2. `tractor/ipc/_shm.py::open_shm_list()` — own the cleanup

Without `mp.resource_tracker`, nobody else will
unlink leaked segments at process exit. tractor
already controls actor lifecycle, so we register
unlink on the actor's lifetime stack:

```python
def try_unlink():
    try:
        shml.shm.unlink()
    except FileNotFoundError:
        log.exception(...)  # benign sibling-already-cleaned race

actor.lifetime_stack.callback(try_unlink)
```

The `FileNotFoundError` swallow handles the case
where a sibling actor already unlinked the same
segment (a legitimate race in shared-key setups).

## Why this is the right call

- **mp's tracker is widely criticized.** The
  in-tree comment "non-SC madness" predates this
  fix and matches CPython upstream's own discomfort
  (e.g. the per-context tracker design rework
  discussions in bpo-43475).
- **tractor already owns process lifecycle.** We
  have `actor.lifetime_stack`, `Portal.cancel_actor`,
  and the IPC cancel cascade. Adding mp's tracker
  on top buys nothing we can't do better ourselves.
- **Backend-uniform.** No special-casing per spawn
  backend. trio (`mp_spawn`-style), `subint_forkserver`,
  and the future `subint` all behave identically
  — register-time no-op, exit-time unlink-via-
  lifetime-stack.

## Trade-offs / known gaps

- **Crash-leaked segments.** If an actor segfaults
  or is `SIGKILL`'d before its lifetime stack runs,
  `/dev/shm/<key>` will leak. Mitigation:
  `scripts/tractor-reap --shm` walks `/dev/shm`,
  filters to segments owned by the current uid that
  no live process is mapping or holding open (via
  `/proc/*/maps` + `/proc/*/fd/*`), and unlinks
  them (a sketch of that filter follows this list).
  The "nobody-has-it-open" filter is
  kernel-canonical so it never touches in-flight
  segments held by sibling apps (verified locally
  against 81 piker/lttng/aja-held segments — all
  preserved).
- Higher-level apps using shm should still pin a
  UUID into the key (the `'shml_<uuid>'` pattern
  in `test_child_attaches_alot`) so concurrent
  sessions don't collide on the same key.
- **Cross-actor unlink races.** Two actors holding
  the same shm key racing on `unlink()` — handled
  by the `FileNotFoundError` swallow.
- **Crashes won't show up in mp's leak warning.**
  We've turned off `resource_tracker`, so the usual
  `resource_tracker: There appear to be N leaked
  shared_memory objects to clean up at shutdown`
  warning is gone too. If we ever want it back as
  a crash-detection signal, we'd need our own
  equivalent (walk the actor's `_shm_list_keys` set
  at root teardown, log any unfreed).
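
A sketch of that nobody-has-it-open filter, assuming
Linux's `/proc` layout (the in-tree `tractor-reap` logic
may differ):

```python
import glob
import os


def shm_segment_in_use(name: str) -> bool:
    '''
    True if any live process maps, or holds an open fd to,
    `/dev/shm/<name>`.

    '''
    path = f'/dev/shm/{name}'

    # mapped into any live process's address space?
    for maps in glob.glob('/proc/[0-9]*/maps'):
        try:
            with open(maps) as f:
                if path in f.read():
                    return True
        except OSError:
            continue  # proc exited, or not ours to read

    # held open as a raw fd?
    for fd in glob.glob('/proc/[0-9]*/fd/*'):
        try:
            if os.readlink(fd) == path:
                return True
        except OSError:
            continue

    return False
```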

## Verification

```sh
# fixed under both backends:
./py314/bin/python -m pytest tests/test_shm.py \
  --spawn-backend=subint_forkserver
# 7 passed

./py314/bin/python -m pytest tests/test_shm.py \
  --spawn-backend=trio
# 7 passed (regression check)
```

## References

- CPython upstream issues:
  - https://bugs.python.org/issue38119 (fork
    + resource_tracker fd inheritance)
  - https://bugs.python.org/issue45209
    (SharedMemory + resource_tracker)
  - https://bugs.python.org/issue43475
    (per-context tracker rework discussion)
- Long-term alternative: migrate off
  `multiprocessing.shared_memory` entirely to
  `posix_ipc` (no tracker) or finish the
  `hotbaud`-based ringbuf transport. Not blocked on
  this fix — both are independently tracked.
@ -0,0 +1,385 @@

# `subint_forkserver` backend: orphaned-subactor SIGINT wedged in `epoll_wait`

Follow-up to the Phase C `subint_forkserver` spawn-backend
PR (see `tractor.spawn._subint_forkserver`, issue #379).
Surfaced by the xfail'd
`tests/spawn/test_subint_forkserver.py::test_orphaned_subactor_sigint_cleanup_DRAFT`.

Related-but-distinct from
`subint_cancel_delivery_hang_issue.md` (orphaned-channel
park AFTER subint teardown) and
`subint_sigint_starvation_issue.md` (GIL-starvation,
SIGINT never delivered): here the SIGINT IS delivered,
trio's handler IS installed, but trio's event loop never
wakes — so the KBI-at-checkpoint → `_trio_main` catch path
(which is the runtime's *intentional* OS-cancel design)
never fires.

## TL;DR

When a `subint_forkserver`-spawned subactor is orphaned
(parent `SIGKILL`'d, no IPC cancel path available) and then
externally `SIGINT`'d, the subactor hangs in
`trio/_core/_io_epoll.py::get_events` (epoll_wait)
indefinitely — even though:

1. `threading.current_thread() is threading.main_thread()`
   post-fork (CPython 3.14 re-designates correctly).
2. Trio's SIGINT handler IS installed in the subactor
   (`signal.getsignal(SIGINT)` returns
   `<function KIManager.install.<locals>.handler at 0x...>`).
3. The kernel does deliver SIGINT — the signal arrives at
   the only thread in the process (the fork-inherited
   worker which IS now "main" per Python).

Yet `epoll_wait` does not return. Trio's wakeup-fd mechanism
— the machinery that turns SIGINT into an epoll-wake — is
somehow not firing the wakeup. Until that's fixed, the
intentional "KBI-as-OS-cancel" path in
`tractor/spawn/_entry.py::_trio_main:164` is unreachable
for forkserver-spawned subactors whose parent dies.

## Symptom

Test: `tests/spawn/test_subint_forkserver.py::test_orphaned_subactor_sigint_cleanup_DRAFT`
(currently marked `@pytest.mark.xfail(strict=True)`).

1. Harness subprocess brings up a tractor root actor +
   one `run_in_actor(_sleep_forever)` subactor via
   `try_set_start_method('subint_forkserver')`.
2. Harness prints `CHILD_PID` (subactor) and
   `PARENT_READY` (root actor) markers to stdout.
3. Test `os.kill(parent_pid, SIGKILL)` + `proc.wait()`
   to fully reap the root-actor harness.
4. Child (now reparented to pid 1) is still alive.
5. Test `os.kill(child_pid, SIGINT)` and polls
   `os.kill(child_pid, 0)` for up to 10s.
6. **Observed**: the child is still alive at deadline —
   SIGINT did not unwedge the trio loop.

## What the "intentional" cancel path IS

`tractor/spawn/_entry.py::_trio_main:157-186` —

```python
try:
    if infect_asyncio:
        actor._infected_aio = True
        run_as_asyncio_guest(trio_main)
    else:
        trio.run(trio_main)

except KeyboardInterrupt:
    logmeth = log.cancel
    exit_status: str = (
        'Actor received KBI (aka an OS-cancel)\n'
        ...
    )
```

The "KBI == OS-cancel" mapping IS the runtime's
deliberate, documented design. An OS-level SIGINT should
flow as: kernel → trio handler → KBI at trio checkpoint
→ unwinds `async_main` → surfaces at `_trio_main`'s
`except KeyboardInterrupt:` → `log.cancel` + clean `rc=0`.

**So fixing this hang is not "add a new SIGINT behavior" —
it's "make the existing designed behavior actually fire in
this backend config".** That's why option (B) ("fix root
cause") is aligned with existing design intent, not a
scope expansion.

## Evidence

### Positive control: standalone fork-from-worker + `trio.run(sleep_forever)` + SIGINT WORKS

```python
import os, signal, time, trio
from tractor.spawn._subint_forkserver import (
    fork_from_worker_thread, wait_child,
)

def child_target() -> int:
    async def _main():
        try:
            await trio.sleep_forever()
        except KeyboardInterrupt:
            print('CHILD: caught KBI — trio SIGINT works!')
            return
    trio.run(_main)
    return 0

pid = fork_from_worker_thread(child_target, thread_name='trio-sigint-test')
time.sleep(1.0)
os.kill(pid, signal.SIGINT)
wait_child(pid)
```

Result: `CHILD: caught KBI — trio SIGINT works!` + clean
exit. So the fork-child + trio signal plumbing IS healthy
in isolation. The hang appears only with the full tractor
subactor runtime on top.

### Negative test: full tractor subactor + orphan-SIGINT

Equivalent to the xfail test. Traceback dump via
`faulthandler.register(SIGUSR1, all_threads=True)` at the
stuck moment:

```
Current thread 0x00007... [subint-forkserv] (most recent call first):
  File ".../trio/_core/_io_epoll.py", line 245 in get_events
  File ".../trio/_core/_run.py", line 2415 in run
  File "tractor/spawn/_entry.py", line 162 in _trio_main
  File "tractor/_child.py", line 72 in _actor_child_main
  File "tractor/spawn/_subint_forkserver.py", line 650 in _child_target
  File "tractor/spawn/_subint_forkserver.py", line 308 in _worker
  File ".../threading.py", line 1024 in run
```

### Thread + signal-mask inventory of the stuck subactor

Single thread (`tid == pid`, comm `'subint-forkserv'`,
which IS `threading.main_thread()` post-fork):

```
SigBlk: 0000000000000000  # nothing blocked
SigIgn: 0000000001001000  # SIGPIPE etc (Python defaults)
SigCgt: 0000000108000202  # bit 1 = SIGINT caught
```

Bit 1 set in `SigCgt` → a SIGINT handler IS installed. So
trio's handler IS in place at the kernel level — not a
"handler missing" situation.

### Handler identity

Inside the subactor's RPC body, `signal.getsignal(SIGINT)`
returns `<function KIManager.install.<locals>.handler at
0x...>` — trio's own `KIManager` handler. tractor's only
SIGINT touches are `signal.getsignal()` *reads* (to stash
into `debug.DebugStatus._trio_handler`); nothing writes
over trio's handler outside the debug-REPL shielding path
(`devx/debug/_tty_lock.py::shield_sigint`) which isn't
engaged here (no debug_mode).

## Ruled out

- **GIL starvation / signal-pipe-full** (class A,
  `subint_sigint_starvation_issue.md`): the subactor runs
  on its own GIL (separate OS process), not sharing with
  the parent → no cross-process GIL contention. And the
  signal-mask evidence above shows SIGINT IS caught, not
  queued.
- **Orphaned channel park** (`subint_cancel_delivery_hang_issue.md`):
  different failure mode — that one has trio iterating
  normally and getting wedged on an orphaned
  `chan.recv()` AFTER teardown. Here trio's event loop
  itself never wakes.
- **Tractor explicitly catching + swallowing KBI**:
  greppable — the one `except KeyboardInterrupt:` in the
  runtime is the INTENTIONAL cancel-path catch at
  `_trio_main:164`. `async_main` uses `except Exception`
  (not `BaseException`), so KBI should propagate through
  cleanly if it ever fires.
- **Missing `signal.set_wakeup_fd` (main-thread
  restriction)**: post-fork, the fork-worker thread IS
  `threading.main_thread()`, so trio's main-thread check
  passes and its wakeup-fd install should succeed.

## Root cause hypothesis (unverified)

The SIGINT handler fires but trio's wakeup-fd write does
not wake `epoll_wait`. Candidate causes, ranked by
plausibility:

1. **Wakeup-fd lifecycle race around tractor IPC setup.**
   `async_main` spins up an IPC server + `process_messages`
   loops early. Somewhere in that path the wakeup-fd that
   trio registered with its epoll instance may be
   closed/replaced/clobbered, so subsequent SIGINT writes
   land on an fd that's no longer in the epoll set.
   Evidence needed: compare
   `signal.set_wakeup_fd(-1)` return value inside a
   post-tractor-bringup RPC body vs. a pre-bringup
   equivalent (probe sketch after this list). If they
   differ, that's it.
2. **Shielded cancel scope around `process_messages`.**
   The RPC message loop is likely wrapped in a trio cancel
   scope; if that scope is `shield=True` at any outer
   layer, KBI scheduled at a checkpoint could be absorbed
   by the shield and never bubble out to `_trio_main`.
3. **Pre-fork wakeup-fd inheritance.** trio in the PARENT
   process registered a wakeup-fd with its own epoll. The
   child inherits the fd number but not the parent's
   epoll instance — if tractor/trio re-uses the parent's
   stale fd number anywhere, writes would go to a no-op
   fd. (This is the least likely — `trio.run()` in the
   child calls `KIManager.install` which should install a
   fresh wakeup-fd from scratch.)
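
A probe sketch for candidate (1) — read back (and
immediately restore) the currently-installed wakeup fd.
Must run on the main thread; note the naive restore drops
any `warn_on_full_buffer` flag trio may have passed:

```python
import signal


def read_wakeup_fd() -> int:
    '''
    Return the currently-installed wakeup fd (-1 if none),
    restoring it before returning.

    '''
    fd: int = signal.set_wakeup_fd(-1)
    if fd != -1:
        # naive restore: does not preserve warn_on_full_buffer
        signal.set_wakeup_fd(fd)
    return fd
```

Call it once at `_trio_main` entry and again inside a
post-bringup RPC body; if the two values differ, candidate
(1) is confirmed.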

## Cross-backend scope question

**Untested**: does the same orphan-SIGINT hang reproduce
against the `trio_proc` backend (stock subprocess + exec)?
If yes → pre-existing tractor bug, independent of
`subint_forkserver`. If no → something specific to the
fork-from-worker path (e.g. inherited fds, mid-epoll-setup
interference).

**Quick repro for trio_proc**:

```python
# save as /tmp/trio_proc_orphan_sigint_repro.py
import os, sys, signal, time, glob
import subprocess as sp

SCRIPT = '''
import os, sys, trio, tractor

async def _sleep_forever():
    print(f"CHILD_PID={os.getpid()}", flush=True)
    await trio.sleep_forever()

async def _main():
    async with (
        tractor.open_root_actor(registry_addrs=[("127.0.0.1", 12350)]),
        tractor.open_nursery() as an,
    ):
        await an.run_in_actor(_sleep_forever, name="sf-child")
        print(f"PARENT_READY={os.getpid()}", flush=True)
        await trio.sleep_forever()

trio.run(_main)
'''

proc = sp.Popen(
    [sys.executable, '-c', SCRIPT],
    stdout=sp.PIPE, stderr=sp.STDOUT,
)
# parse CHILD_PID + PARENT_READY off proc.stdout ...
# SIGKILL parent, SIGINT child, poll.
```

If that hangs too, open a broader issue; if not, this is
`subint_forkserver`-specific (likely fd-inheritance-related).

## Why this is ours to fix (not CPython's)

- Signal IS delivered (`SigCgt` bitmask confirms).
- Handler IS installed (trio's `KIManager`).
- Thread identity is correct post-fork.
- `_trio_main` already has the intentional KBI→clean-exit
  path waiting to fire.

Every CPython-level precondition is met. Something in
tractor's runtime or trio's integration with it is
breaking the SIGINT→wakeup→event-loop-wake pipeline.

## Possible fix directions

1. **Audit the wakeup-fd across tractor's IPC bringup.**
   Add a trio startup hook that captures
   `signal.set_wakeup_fd(-1)` at `_trio_main` entry,
   after `async_main` enters, and periodically — assert
   it's unchanged. If it moves, track down the writer.
2. **Explicit `signal.set_wakeup_fd` reset after IPC
   setup.** Brute force: re-install a fresh wakeup-fd
   mid-bringup. Band-aid, but fast to try.
3. **Ensure no `shield=True` cancel scope envelopes the
   RPC-message-loop / IPC-server task.** If one does,
   KBI-at-checkpoint never escapes.
4. **Once fixed, the `child_sigint='trio'` mode on
   `subint_forkserver_proc`** becomes effectively a no-op
   or a doc-only mode — trio's natural handler already
   does the right thing. Might end up removing the flag
   entirely if there's no behavioral difference between
   modes.

## Current workaround
|
||||
|
||||
None; `child_sigint` defaults to `'ipc'` (IPC cancel is
|
||||
the only reliable cancel path today), and the xfail test
|
||||
documents the gap. Operators hitting orphan-SIGINT get a
|
||||
hung process that needs `SIGKILL`.
|
||||
|
||||
## Reproducer
|
||||
|
||||
Inline, standalone (no pytest):
|
||||
|
||||
```python
|
||||
# save as /tmp/orphan_sigint_repro.py (py3.14+)
|
||||
import os, sys, signal, time, glob, trio
|
||||
import tractor
|
||||
from tractor.spawn._subint_forkserver import (
|
||||
fork_from_worker_thread,
|
||||
)
|
||||
|
||||
async def _sleep_forever():
|
||||
print(f'SUBACTOR[{os.getpid()}]', flush=True)
|
||||
await trio.sleep_forever()
|
||||
|
||||
async def _main():
|
||||
async with (
|
||||
tractor.open_root_actor(
|
||||
registry_addrs=[('127.0.0.1', 12349)],
|
||||
),
|
||||
tractor.open_nursery() as an,
|
||||
):
|
||||
await an.run_in_actor(_sleep_forever, name='sf-child')
|
||||
await trio.sleep_forever()
|
||||
|
||||
def child_target() -> int:
|
||||
from tractor.spawn._spawn import try_set_start_method
|
||||
try_set_start_method('subint_forkserver')
|
||||
trio.run(_main)
|
||||
return 0
|
||||
|
||||
pid = fork_from_worker_thread(child_target, thread_name='repro')
|
||||
time.sleep(3.0)
|
||||
|
||||
# find the subactor pid via /proc
|
||||
children = []
|
||||
for path in glob.glob(f'/proc/{pid}/task/*/children'):
|
||||
with open(path) as f:
|
||||
children.extend(int(x) for x in f.read().split() if x)
|
||||
subactor_pid = children[0]
|
||||
|
||||
# SIGKILL root → orphan the subactor
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
os.waitpid(pid, 0)
|
||||
time.sleep(0.3)
|
||||
|
||||
# SIGINT the orphan — should cause clean trio exit
|
||||
os.kill(subactor_pid, signal.SIGINT)
|
||||
|
||||
# poll for exit
|
||||
for _ in range(100):
|
||||
try:
|
||||
os.kill(subactor_pid, 0)
|
||||
time.sleep(0.1)
|
||||
except ProcessLookupError:
|
||||
print('HARNESS: subactor exited cleanly ✔')
|
||||
sys.exit(0)
|
||||
os.kill(subactor_pid, signal.SIGKILL)
|
||||
print('HARNESS: subactor hung — reproduced')
|
||||
sys.exit(1)
|
||||
```
|
||||
|
||||
Expected (current): `HARNESS: subactor hung — reproduced`.
|
||||
|
||||
After fix: `HARNESS: subactor exited cleanly ✔`.
|
||||
|
||||
## References
|
||||
|
||||
- `tractor/spawn/_entry.py::_trio_main:157-186` — the
|
||||
intentional KBI→clean-exit path this bug makes
|
||||
unreachable.
|
||||
- `tractor/spawn/_subint_forkserver` — the backend whose
|
||||
orphan cancel-robustness this blocks.
|
||||
- `tests/spawn/test_subint_forkserver.py::test_orphaned_subactor_sigint_cleanup_DRAFT`
|
||||
— the xfail'd reproducer in the test suite.
|
||||
- `ai/conc-anal/subint_cancel_delivery_hang_issue.md` —
|
||||
sibling "orphaned channel park" hang (different class).
|
||||
- `ai/conc-anal/subint_sigint_starvation_issue.md` —
|
||||
sibling "GIL starvation SIGINT drop" hang (different
|
||||
class).
|
||||
- tractor issue #379 — subint backend tracking.
# `subint_forkserver` backend: `test_cancellation.py` multi-level cancel cascade hang

> **Tracked at:** [#449](https://github.com/goodboy/tractor/issues/449)

Follow-up tracker: surfaced while wiring the new
`subint_forkserver` spawn backend into the full tractor
test matrix (step 2 of the post-backend-lands plan).
See also
`ai/conc-anal/subint_forkserver_orphan_sigint_hang_issue.md`
— sibling tracker for a different forkserver-teardown
class which probably shares the same fundamental root
cause (fork-FD-inheritance across nested spawns).

## TL;DR

`tests/test_cancellation.py::test_nested_multierrors[subint_forkserver]`
hangs indefinitely under our new backend. The hang is
**inside the graceful IPC cancel cascade** — every actor
in the multi-level tree parks in `epoll_wait` waiting
for IPC messages that never arrive. Not a hard-kill /
tree-reap issue (we don't reach the hard-kill fallback
path at all).

Working hypothesis (unverified): **`os.fork()` from a
subactor inherits the root parent's IPC listener socket
FDs**. When a first-level subactor forkserver-spawns a
grandchild, that grandchild inherits both its direct
spawner's FDs AND the root's FDs — IPC message routing
becomes ambiguous (or silently sends to the wrong
channel), so the cancel cascade can't reach its target.

## Corrected diagnosis vs. earlier draft

An earlier version of this doc claimed the root cause
was **"forkserver teardown doesn't tree-kill
descendants"** (SIGKILL only reaches the direct child,
grandchildren survive and hold TCP `:1616`). That
diagnosis was **wrong**, caused by conflating two
observations:

1. *5-zombie leak holding :1616* — happened in my own
   workflow when I aborted a bg pytest task with
   `pkill` (SIGTERM/SIGKILL, not SIGINT). The abrupt
   kill skipped the graceful `ActorNursery.__aexit__`
   cancel cascade entirely, orphaning descendants to
   init. **This was my cleanup bug, not a forkserver
   teardown bug.** Codified the fix (SIGINT-first +
   bounded wait before SIGKILL) in
   `feedback_sc_graceful_cancel_first.md` +
   `.claude/skills/run-tests/SKILL.md`.
2. *`test_nested_multierrors` hangs indefinitely* —
   the real, separate, forkserver-specific bug
   captured by this doc.

The two symptoms are unrelated. The tree-kill / setpgrp
fix direction proposed earlier would not help (1) (SC-
graceful-cleanup is the right answer there) and would
not help (2) (the hang is in the cancel cascade, not
in the hard-kill fallback).

## Symptom

Reproducer (py3.14, clean env):

```sh
# preflight: ensure clean env
ss -tlnp 2>/dev/null | grep ':1616' && echo 'FOUL — cleanup first!' || echo 'clean'

./py314/bin/python -m pytest --spawn-backend=subint_forkserver \
    'tests/test_cancellation.py::test_nested_multierrors[subint_forkserver]' \
    --timeout=30 --timeout-method=thread --tb=short -v
```

Expected: `pytest-timeout` fires at 30s with a thread-
dump banner. Actual: the process **remains alive after
the timeout** and doesn't unwedge on subsequent SIGINT.
It requires SIGKILL to reap.

## Evidence (tree structure at hang point)

All 5 processes are kernel-level `S` (sleeping) in
`do_epoll_wait` (trio's event loop waiting on I/O):

```
PID     PPID    THREADS  NAME             ROLE
333986  1       2        subint-forkserv  pytest main (the test body)
333993  333986  3        subint-forkserv  "child 1" spawner subactor
334003  333993  1        subint-forkserv  grandchild errorer under child-1
334014  333993  1        subint-forkserv  grandchild errorer under child-1
333999  333986  1        subint-forkserv  "child 2" spawner subactor (NO grandchildren!)
```

### Asymmetric tree depth

The test's `spawn_and_error(breadth=2, depth=3)` should
have BOTH direct children spawning 2 grandchildren
each, going 3 levels deep. Reality:

- Child 1 (333993, 3 threads) DID spawn its two
  grandchildren as expected — fully booted trio
  runtime.
- Child 2 (333999, 1 thread) did NOT spawn any
  grandchildren — it clearly never completed its
  nursery's first `run_in_actor`. Its 1-thread state
  suggests the runtime never fully booted (no trio
  worker threads for `waitpid`/IPC).

This asymmetry is the key clue: the two direct
children started identically but diverged. Probably a
race around fork-inherited state (listener FDs,
subactor-nursery channel state) that happens to land
differently depending on spawn ordering.

### Parent-side state

Thread-dump of pytest main (333986) at the hang:

- Main trio thread — parked in
  `trio._core._io_epoll.get_events` (epoll_wait on
  its event loop). Waiting for IPC from children.
- Two trio-cache worker threads — each parked in
  `outcome.capture(sync_fn)` calling
  `os.waitpid(child_pid, 0)`. These are our
  `_ForkedProc.wait()` off-loads. They're waiting for
  the direct children to exit — but the children are
  stuck in their own epoll_wait waiting for IPC from
  the parent.

**It's a deadlock, not a leak:** the parent is
correctly running `soft_kill(proc, _ForkedProc.wait,
portal)` (graceful IPC cancel via
`Portal.cancel_actor()`), but the children never
acknowledge the cancel message (or the message never
reaches them through the tangled post-fork IPC).

## What's NOT the cause (ruled out)

- **`_ForkedProc.kill()` only SIGKILLs the direct pid /
  missing tree-kill**: doesn't apply — we never reach
  the hard-kill path. The deadlock is in the graceful
  cancel cascade.
- **Port `:1616` contention**: ruled out after the
  `reg_addr` fixture-wiring fix; each test session
  gets a unique port now.
- **GIL starvation / SIGINT pipe filling** (class-A,
  `subint_sigint_starvation_issue.md`): doesn't apply
  — each subactor is its own OS process with its own
  GIL (not legacy-config subint).
- **Child-side `_trio_main` absorbing KBI**: grep
  confirmed; `_trio_main` only catches KBI at the
  `trio.run()` callsite, which is reached only if the
  trio loop exits normally. The children here never
  exit `trio.run()` — they're wedged inside it.

## Hypothesis: FD inheritance across nested forks

`subint_forkserver_proc` calls
`fork_from_worker_thread()`, which ultimately does
`os.fork()` from a dedicated worker thread. Standard
Linux/POSIX fork semantics: **the child inherits ALL
open FDs from the parent**, including listener
sockets, epoll fds, trio wakeup pipes, and the
parent's IPC channel sockets.

At root-actor fork-spawn time, the root's IPC server
listener FDs are open in the parent. Those get
inherited by child 1. Child 1 then forkserver-spawns
its OWN subactor (grandchild). The grandchild
inherits FDs from child 1 — but child 1's address
space still contains **the root's IPC listener FDs
too** (inherited at first fork). So the grandchild
has THREE sets of FDs:

1. Its own (created after becoming a subactor).
2. Its direct parent child-1's.
3. The ROOT's (grandparent's) — inherited transitively.

IPC message routing may be ambiguous in this tangled
state. Or a listener socket that the root thinks it
owns is actually open in multiple processes, and
messages sent to it go to an arbitrary one. That
would exactly match the observed "graceful cancel
never propagates".

This hypothesis predicts the bug **scales with fork
depth**: single-level forkserver spawn
(`test_subint_forkserver_spawn_basic`) works
perfectly, but any test that spawns a second level
deadlocks. Matches observations so far.
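
One cheap way to test this empirically: compare socket
inodes across the tree via procfs. The same
`socket:[inode]` link target appearing in two processes
means the same underlying socket is open in both. A
diagnostic-only sketch:

```python
import os

def socket_inodes(pid: int) -> set[str]:
    # each /proc/<pid>/fd entry is a symlink; sockets resolve to
    # 'socket:[<inode>]', and identical targets mean a shared socket
    inodes = set()
    for name in os.listdir(f'/proc/{pid}/fd'):
        try:
            target = os.readlink(f'/proc/{pid}/fd/{name}')
        except OSError:
            continue  # fd closed while we were iterating
        if target.startswith('socket:'):
            inodes.add(target)
    return inodes

# a non-empty intersection containing the root's listener inode
# would confirm transitive inheritance:
# socket_inodes(root_pid) & socket_inodes(grandchild_pid)
```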

## Fix directions (to validate)

### 1. `close_fds=True` equivalent in `fork_from_worker_thread()`

`subprocess.Popen` / `trio.lowlevel.open_process` have
`close_fds=True` by default on POSIX — they
enumerate open FDs in the child post-fork and close
everything except stdio + any explicitly-passed FDs.
Our raw `os.fork()` doesn't. Adding the equivalent to
our `_worker` prelude would isolate each fork
generation's FD set.

Implementation sketch in
`tractor.spawn._subint_forkserver.fork_from_worker_thread._worker`:

```python
def _worker() -> None:
    pid: int = os.fork()
    if pid == 0:
        # CHILD: close inherited FDs except stdio + the
        # pid-pipe we just opened.
        keep: set[int] = {0, 1, 2, rfd, wfd}
        import resource
        soft, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
        for fd in range(3, soft):  # blunt; or enumerate /proc/self/fd
            if fd not in keep:
                try:
                    os.close(fd)
                except OSError:
                    pass
        # ... then child_target() as before
```

Problem: overly aggressive — this closes FDs the
grandchild might legitimately need (e.g. its parent's
IPC channel for the spawn-spec handshake, if we rely
on that). Needs thought about which FDs are
"inheritable and safe" vs. "inherited by accident".

### 2. Cloexec on tractor's own FDs

Set `FD_CLOEXEC` on tractor-created sockets (listener
sockets, IPC channel sockets, pipes). This flag
causes automatic close on `execve`, but since we
`fork()` without `exec()`, it alone doesn't help.
BUT — combined with a child-side explicit sweep keyed
off the flag, it gives us a way to mark "my
private FDs" vs. "safe to inherit". Most robust, but
requires a tractor-wide audit.
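
A hedged sketch of one workable convention. Note that
PEP 446 already makes Python-created fds non-inheritable
(cloexec) by default, so in practice the marking goes the
other way: tag the few fds that are safe to inherit, then
sweep everything unmarked in the child. Helper names are
illustrative:

```python
import os

def mark_inheritable(fd: int) -> None:
    # PEP 446: Python-created fds default to non-inheritable
    # (FD_CLOEXEC set); explicitly tag the ones that should
    # survive a fork generation
    os.set_inheritable(fd, True)

def close_unmarked_fds_in_child() -> None:
    # fork ignores FD_CLOEXEC, so the child sweeps manually:
    # anything not explicitly marked inheritable gets closed
    for name in os.listdir('/proc/self/fd'):
        fd = int(name)
        if fd in (0, 1, 2):
            continue
        try:
            if not os.get_inheritable(fd):
                os.close(fd)
        except OSError:
            pass  # fd raced closed (incl. the listdir handle)
```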

### 3. Explicit FD cleanup in `_ForkedProc`/`_child_target`

Have `subint_forkserver_proc`'s `_child_target`
closure explicitly close the parent-side IPC listener
FDs before calling `_actor_child_main`. Requires
being able to enumerate "the parent's listener FDs
that the child shouldn't keep" — plausible via
`Actor.ipc_server`'s socket objects.
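
A rough sketch of that shape; the attribute names
(`listeners`, `.socket`) are assumptions about
`Actor.ipc_server`'s internals, not its actual API:

```python
def _child_target_prelude(actor) -> None:
    # close the parent-side listener sockets inherited by fork so
    # the child's fd set contains only its own IPC resources;
    # `listeners`/`socket` attribute names are hypothetical
    for listener in getattr(actor.ipc_server, 'listeners', ()):
        sock = getattr(listener, 'socket', None)
        if sock is not None:
            try:
                sock.close()
            except OSError:
                pass
```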

### 4. Use `os.posix_spawn` with explicit `file_actions`

Instead of raw `os.fork()`, use `os.posix_spawn()`,
which supports explicit file-action specifications
(close this FD, dup2 that FD). Cleaner semantics, but
probably incompatible with our "no exec" requirement
(subint_forkserver is a fork-without-exec design).

**Likely correct answer: (3) — targeted FD cleanup
via the `actor.ipc_server` handle.** (1) is too blunt,
(2) is too wide-ranging, (4) changes the spawn
mechanism.

## Reproducer (standalone, no pytest)

```python
# save as /tmp/forkserver_nested_hang_repro.py (py3.14+)
import trio, tractor

async def assert_err():
    assert 0

async def spawn_and_error(breadth: int = 2, depth: int = 1):
    async with tractor.open_nursery() as n:
        for i in range(breadth):
            if depth > 0:
                await n.run_in_actor(
                    spawn_and_error,
                    breadth=breadth,
                    depth=depth - 1,
                    name=f'spawner_{i}_{depth}',
                )
            else:
                await n.run_in_actor(
                    assert_err,
                    name=f'errorer_{i}',
                )

async def _main():
    # NOTE: fail_after needs a running trio clock/task, so it
    # must live inside the run, not wrap trio.run() itself.
    with trio.fail_after(20):
        async with tractor.open_nursery() as n:
            for i in range(2):
                await n.run_in_actor(
                    spawn_and_error,
                    name=f'top_{i}',
                    breadth=2,
                    depth=1,
                )

if __name__ == '__main__':
    from tractor.spawn._spawn import try_set_start_method
    try_set_start_method('subint_forkserver')
    trio.run(_main)
```

Expected (current): hangs until `trio.fail_after(20)`
fires — the children never ack the error-propagation
cancel cascade. Pattern: top 2 direct children, 4
grandchildren; 1 errorer deadlocks while trying to
unwind through its parent chain.

After fix: `trio.TooSlowError`-free completion; the
root's `open_nursery` receives the
`BaseExceptionGroup` containing the `AssertionError`
from the errorer and unwinds cleanly.

## Update — 2026-04-23: partial fix landed, deeper layer surfaced

Three improvements landed as separate commits in the
`subint_forkserver_backend` branch (see `git log`):

1. **`_close_inherited_fds()` in the fork-child prelude**
   (`tractor/spawn/_subint_forkserver.py`). POSIX
   close-fds-equivalent enumeration via
   `/proc/self/fd` (or an `RLIMIT_NOFILE` fallback),
   keeping only stdio. This is fix direction (1) from
   the list above — we went with the blunt form rather
   than the targeted enum-via-`actor.ipc_server` form;
   the aggressive close turns out to be safe because
   every inheritable resource the fresh child needs
   (IPC-channel socket, etc.) is opened AFTER the
   fork anyway.
2. **`_ForkedProc.wait()` via `os.pidfd_open()` +
   `trio.lowlevel.wait_readable()`** — matches the
   `trio.Process.wait` / `mp.Process.sentinel` pattern
   used by `trio_proc` and `proc_waiter`, and gives us
   a fully trio-cancellable child-wait (the prior impl
   blocked a cache thread on a sync `os.waitpid` that
   was NOT trio-cancellable due to
   `abandon_on_cancel=False`). See the sketch after
   this list.
3. **`_parent_chan_cs` wiring** in
   `tractor/runtime/_runtime.py`: capture the shielded
   `loop_cs` for the parent-channel `process_messages`
   task in `async_main`; explicitly cancel it in
   `Actor.cancel()` teardown. This breaks the shield
   during teardown so the parent-chan loop exits when
   cancel is issued, instead of parking on a parent-
   socket EOF that might never arrive under fork
   semantics.
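
A minimal sketch of the pidfd wait pattern from (2), with
illustrative names rather than the in-tree implementation
(`os.pidfd_open` needs Linux 5.3+/py3.9+):

```python
import os
import trio

async def wait_via_pidfd(pid: int) -> int:
    pidfd = os.pidfd_open(pid)
    try:
        # the pidfd polls readable when the process exits, so
        # this is an ordinary trio checkpoint: cancellable, no
        # blocked cache thread
        await trio.lowlevel.wait_readable(pidfd)
    finally:
        os.close(pidfd)
    # the child is a zombie now, so this reaps without blocking
    _, status = os.waitpid(pid, 0)
    return os.waitstatus_to_exitcode(status)
```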

**Concrete wins from (1):** the sibling
`subint_forkserver_orphan_sigint_hang_issue.md` class
is **now fixed** — `test_orphaned_subactor_sigint_cleanup_DRAFT`
went from strict-xfail to pass. The xfail mark was
removed; the test remains as a regression guard.

**`test_nested_multierrors` STILL hangs**, though.

### Updated diagnosis (narrowed)

DIAGDEBUG instrumentation of `process_messages` ENTER/
EXIT pairs + `_parent_chan_cs.cancel()` call sites
showed (captured during a 20s-timeout repro):

- 80 `process_messages` ENTERs, 75 EXITs → 5 stuck.
- **All 40 `shield=True` ENTERs matched an EXIT** — every
  shielded parent-chan loop exits cleanly. The
  `_parent_chan_cs` wiring works as intended.
- **The 5 stuck loops are all `shield=False`** — peer-
  channel handlers (inbound connections handled by
  `handle_stream_from_peer` in stream_handler_tn).
- After our `_parent_chan_cs.cancel()` fires, NEW
  shielded process_messages loops start (on the
  session reg_addr port — probably discovery-layer
  reconnection attempts). These don't block teardown
  (they all exit) but indicate the cancel cascade has
  more moving parts than expected.

### Remaining unknown

Why don't the 5 peer-channel loops exit when
`service_tn.cancel_scope.cancel()` fires? They're in
`stream_handler_tn`, which IS `service_tn` in the
current configuration (`open_ipc_server(parent_tn=
service_tn, stream_handler_tn=service_tn)`). A
standard nursery-scope-cancel should propagate through
them — no shield, no special handler. Something
specific to the fork-spawned configuration keeps them
alive.

Candidate follow-up experiments:

- Dump the trio task tree at the hang point (via
  `stackscope` or direct trio introspection) to see
  what each stuck loop is awaiting. `chan.__anext__`
  on a socket recv? An inner lock? A shielded sub-task?
- Compare the peer-channel handler lifecycle under
  `trio_proc` vs `subint_forkserver` with equivalent
  logging to spot the divergence.
- Investigate whether the peer handler is caught in
  the `except trio.Cancelled:` path at
  `tractor/ipc/_server.py:448` that re-raises — but
  re-raise means it should still exit. Unless
  something higher up swallows it.

### Attempted fix (DID NOT work) — hypothesis (3)

Tried: in `_serve_ipc_eps`'s finally, after closing
listeners, also iterate `server._peers` and
sync-close each peer channel's underlying stream
socket fd:

```python
for _uid, _chans in list(server._peers.items()):
    for _chan in _chans:
        try:
            _stream = _chan._transport.stream if _chan._transport else None
            if _stream is not None:
                _stream.socket.close()  # sync fd close
        except (AttributeError, OSError):
            pass
```

Theory: closing the socket fd from outside the stuck
recv task would make the recv see EBADF /
`ClosedResourceError` and unblock.

Result: `test_nested_multierrors[subint_forkserver]`
still hangs identically. Either:

- The sync `socket.close()` doesn't propagate into
  trio's in-flight `recv_some()` the way I expected
  (trio may hold an internal reference that keeps the
  fd open even after an external close), or
- The stuck recv isn't even the root blocker and the
  peer handlers never reach the finally for some
  reason I haven't understood yet.

Either way, the sync-close hypothesis is **ruled
out**. Reverted the experiment; restored the skip-
mark on the test.

### Aside: `-s` flag does NOT change `test_nested_multierrors` behavior

Tested explicitly: both with and without `-s`, the
test hangs identically. So the capture-pipe-fill
hypothesis is **ruled out** for this test.

The earlier `test_context_stream_semantics.py` `-s`
observation was most likely caused by a competing
pytest run in my session (confirmed via the process
list — my leftover pytest was alive at that time and
could have been holding state on the default
registry port).

## Update — 2026-04-23 (late): cancel delivery ruled in, nursery-wait ruled BLOCKER

**New diagnostic run:** instrumented
`handle_stream_from_peer` at ENTER / `except
trio.Cancelled:` / finally, plus `Actor.cancel()`
just before `self._parent_chan_cs.cancel()`. Result:

- **40 `handle_stream_from_peer` ENTERs**.
- **0 `except trio.Cancelled:` hits** — cancel
  never fires on any peer-handler.
- **35 finally hits** — those handlers exit via
  peer-initiated EOF (normal return), NOT cancel.
- **5 handlers never reach finally** — stuck forever.
- **`Actor.cancel()` fired in 12 PIDs** — but the
  PIDs with peer handlers that DIDN'T fire
  `Actor.cancel()` are exactly **root + the 2 direct
  spawners**. These 3 actors have peer handlers
  (for their own subactors) that stay stuck because
  **`Actor.cancel()` at these levels never runs**.

### The actual deadlock shape

`Actor.cancel()` lives in
`open_root_actor.__aexit__` / `async_main` teardown.
That only runs when the enclosing `async with
tractor.open_nursery()` exits. The nursery's
`__aexit__` calls the backend `*_proc` spawn target's
teardown, which does `soft_kill() →
_ForkedProc.wait()` on its child PID. That wait is
trio-cancellable via pidfd now (good) — but nothing
CANCELS it, because the outer scope only cancels when
`Actor.cancel()` runs, which only runs when the
nursery completes, which waits on the child.

It's a **multi-level mutual wait**:

```
root blocks on spawner.wait()
  spawner blocks on grandchild.wait()
    grandchild blocks on errorer.wait()
      errorer Actor.cancel() ran, but the process
      may not have fully exited yet
      (something in root_tn holding on?)
```

Each level waits for the level below. The bottom
level (errorer) reaches `Actor.cancel()`, but its
process may not fully exit — meaning its pidfd
doesn't go readable, meaning the grandchild's
wait doesn't return, meaning the grandchild's
nursery doesn't unwind, and so on all the way up.

### Refined question

**Why does an errorer process not exit after its
`Actor.cancel()` completes?**

Possibilities:

1. `_parent_chan_cs.cancel()` fires (shielded
   parent-chan loop unshielded), but the task is
   stuck INSIDE the shielded loop's recv in a way
   that cancel still can't break.
2. After `Actor.cancel()` returns, `async_main`
   still has other tasks in `root_tn` waiting for
   something that never arrives (e.g. outbound
   IPC reply delivery).
3. The `os._exit(rc)` in `_worker` (at
   `_subint_forkserver.py`) doesn't run because
   `_child_target` never returns.

Next-session candidate probes (in priority order):

1. **Instrument `_worker`'s fork-child branch** to
   confirm whether `child_target()` returns (and
   thus `os._exit(rc)` is reached) for errorer
   PIDs. If yes → the process should die; if no →
   trace back into `_actor_child_main` /
   `_trio_main` / `async_main` to find the stuck
   spot.
2. **Instrument `async_main`'s final unwind** to
   see which await in the teardown doesn't
   complete.
3. **Compare under the `trio_proc` backend** at the
   same `_worker`-equivalent level to see where
   the flows diverge.

### Rule-out: NOT a stuck peer-chan recv

The earlier hypothesis was that the 5 stuck peer-chan
loops were blocked on a socket recv that cancel
couldn't interrupt. This pass revealed the real
cause: cancel **never reaches those tasks** because
their owning actor's `Actor.cancel()` never runs.
The recvs are fine — they're just parked because
nothing is telling them to stop.

## Update — 2026-04-23 (very late): leaves exit, middle actors stuck in `trio.run`

Yet another instrumentation pass — this time
printing at:

- `_worker` child branch: `pre child_target()` /
  `child_target RETURNED rc=N` / `about to
  os._exit(rc)`
- `_trio_main`: `about to trio.run` /
  `trio.run RETURNED NORMALLY` / `FINALLY`

**Fresh-run results** (`test_nested_multierrors[
subint_forkserver]`, depth=1/breadth=2, 1 root + 14
forked = 15 actors total):

- **9 processes completed the full flow** —
  `trio.run RETURNED NORMALLY` → `child_target
  RETURNED rc=0` → `about to os._exit(0)`. These
  are the LEAVES of the tree (errorer actors) plus
  their direct parents (depth-0 spawners). They
  actually exit their processes.
- **5 processes are stuck INSIDE `trio.run(trio_main)`**
  — they hit "about to trio.run" but NEVER see
  "trio.run RETURNED NORMALLY". These are the root +
  the top-level spawners + one intermediate.

**What this means:** `async_main` itself is the
deadlock holder, not the peer-channel loops.
Specifically, the outer `async with root_tn:` in
`async_main` never exits for the 5 stuck actors.
Their `trio.run` never returns → `_trio_main`'s
catch/finally never runs → `_worker` never reaches
`os._exit(rc)` → the PROCESS never dies → its
parent's `_ForkedProc.wait()` blocks → the parent's
nursery hangs → the parent's `async_main` hangs → ...

### The new precise question

**What task in the 5 stuck actors' `async_main`
never completes?** Candidates:

1. The shielded parent-chan `process_messages`
   task in `root_tn` — but we explicitly cancel it
   via `_parent_chan_cs.cancel()` in `Actor.cancel()`.
   However, `Actor.cancel()` only runs during
   `open_root_actor.__aexit__`, which itself runs
   only after `async_main`'s outer unwind — which
   doesn't happen. So the shield isn't broken.

2. `await actor_nursery._join_procs.wait()` or
   similar in the inline backend `*_proc` flow.

3. `_ForkedProc.wait()` on a grandchild that
   actually DID exit — but the pidfd_open watch
   didn't fire for some reason (a race between
   pidfd_open and the child exiting?).

The most specific next probe: **add DIAG around
`_ForkedProc.wait()` enter/exit** to see whether
the pidfd-based wait returns for every grandchild
exit. If a stuck parent's `_ForkedProc.wait()`
NEVER returns despite its child exiting, the
pidfd mechanism has a race bug under nested
forkserver.

An alternative probe: instrument `async_main`'s outer
nursery exits to find which nursery's `__aexit__`
is stuck, drilling down from `trio.run` to the
specific `async with` that never completes.

### Cascade summary (updated tree view)

```
ROOT (pytest)                    STUCK in trio.run
├── top_0 (spawner, d=1)         STUCK in trio.run
│   ├── spawner_0_d1_0 (d=0)     exited (os._exit 0)
│   │   ├── errorer_0_0          exited (os._exit 0)
│   │   └── errorer_0_1          exited (os._exit 0)
│   └── spawner_0_d1_1 (d=0)     exited (os._exit 0)
│       ├── errorer_0_2          exited (os._exit 0)
│       └── errorer_0_3          exited (os._exit 0)
└── top_1 (spawner, d=1)         STUCK in trio.run
    ├── spawner_1_d1_0 (d=0)     STUCK in trio.run (sibling race?)
    │   ├── errorer_1_0          exited
    │   └── errorer_1_1          exited
    └── spawner_1_d1_1 (d=0)     STUCK in trio.run
        ├── errorer_1_2          exited
        └── errorer_1_3          exited
```

Grandchildren (d=0 spawners) exit OR stick —
asymmetric, not purely depth-determined. Some race
condition in nursery teardown when multiple
siblings error simultaneously.

## Update — 2026-04-23 (late, probe iteration 3): hang pinpointed to `wait_for_no_more_peers()`

Further DIAGDEBUG at every milestone in `async_main`
(runtime UP / EXITED service_tn / EXITED root_tn /
FINALLY ENTER / RETURNING) plus `_ForkedProc.wait`
ENTER/RETURNED per-pidfd. Result:

**Every stuck actor reaches `async_main: FINALLY
ENTER` but NOT `async_main: RETURNING`.**

That isolates the hang to a specific await in
`async_main`'s finally block at
`tractor/runtime/_runtime.py:1837+`. The suspect:

```python
# Ensure all peers (actors connected to us as clients) are finished
if (ipc_server := actor.ipc_server) and ipc_server.has_peers(check_chans=True):
    ...
    await ipc_server.wait_for_no_more_peers()  # ← UNBOUNDED, blocks forever
```

`_no_more_peers` is an `Event` set only when
`server._peers` empties (see
`ipc/_server.py:526-530`). If ANY peer-handler is
stuck (the 5 unclosed loops from the earlier pass),
it keeps its channel in `server._peers`, so the
event never fires, so the wait hangs.

### Applied fix (partial, landed as defense-in-depth)

`tractor/runtime/_runtime.py:1981` — the
`wait_for_no_more_peers()` call is now wrapped in
`trio.move_on_after(3.0)` plus a warning log when the
timeout fires, commented with the full rationale.
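
The shape of that wrapper as a hedged sketch (the real
call site lives in `tractor/runtime/_runtime.py`; the
logger name here is illustrative):

```python
import logging
import trio

log = logging.getLogger('tractor')  # illustrative logger name

async def bounded_peer_wait(ipc_server) -> None:
    with trio.move_on_after(3.0) as cs:
        await ipc_server.wait_for_no_more_peers()
    if cs.cancelled_caught:
        # a stuck peer handler kept its channel in server._peers;
        # proceed with teardown instead of hanging forever
        log.warning(
            'Timed out waiting for peer channels to disconnect; '
            'continuing teardown anyway!'
        )
```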

**Verified:** with this fix, ALL 15 actors reach
`async_main: RETURNING` cleanly (up from 10/15
reaching the end before).

**Unfortunately:** the test still hangs past 45s
total — meaning there's YET ANOTHER unbounded wait
downstream of `async_main`. The bounded
`wait_for_no_more_peers` unblocks one level, but
the cascade has another level above it.

### Candidates for the remaining hang

1. `open_root_actor`'s own finally / post-
   `async_main` flow in `_root.py` — specifically
   `await actor.cancel(None)`, which has its own
   internal waits.
2. `trio.run()` itself doesn't return even
   after the root task completes because trio's
   nursery still has background tasks running.
3. Maybe `_serve_ipc_eps`'s finally has an await
   that blocks when peers aren't clearing.

### Current stance

- The defensive `wait_for_no_more_peers` bound landed
  (good hygiene regardless); it reveals a real
  deadlock-avoidance gap in tractor's cleanup.
- The test still hangs → skip-mark restored on
  `test_nested_multierrors[subint_forkserver]`.
- The full chain of unbounded waits needs another
  session of drilling, probably at the
  `open_root_actor` / `actor.cancel` level.

### Summary of this investigation's wins

1. **FD hygiene fix** (`_close_inherited_fds`) —
   correct; closed the orphan-SIGINT sibling issue.
2. **pidfd-based `_ForkedProc.wait`** — cancellable,
   matches the trio_proc pattern.
3. **`_parent_chan_cs` wiring** —
   `Actor.cancel()` now breaks the shielded parent-
   chan `process_messages` loop.
4. **`wait_for_no_more_peers` bounded** —
   prevents the actor-level finally hang.
5. **Ruled-out hypotheses:** missing tree-kill
   (wrong), stuck socket recv (wrong).
6. **Pinpointed remaining unknown:** at least one
   more unbounded wait in the teardown cascade
   above `async_main`. Concrete candidates are
   enumerated above.

## Update — 2026-04-23 (VERY late): pytest capture pipe IS the final gate

After landing fixes 1-4 and instrumenting every
layer down to `tractor_test`'s `trio.run(_main)`:

**Empirical result: with `pytest -s` the test PASSES
in 6.20s.** Without `-s` (default `--capture=fd`) it
hangs forever.

DIAG timeline for the root pytest PID (with `-s`
implied from later verification):

```
tractor_test: about to trio.run(_main)
open_root_actor: async_main task started, yielding to test body
_main: about to await wrapped test fn
_main: wrapped RETURNED cleanly                      ← test body completed!
open_root_actor: about to actor.cancel(None)
Actor.cancel ENTER req_chan=False
Actor.cancel RETURN
open_root_actor: actor.cancel RETURNED
open_root_actor: outer FINALLY
open_root_actor: finally END (returning from ctxmgr)
tractor_test: trio.run FINALLY (returned or raised)  ← trio.run fully returned!
```

`trio.run()` fully returns. The test body itself
completes successfully (`pytest.raises` absorbed the
expected `BaseExceptionGroup`). What blocks is
**pytest's own stdout/stderr capture** — under the
default `--capture=fd`, pytest replaces the parent
process's fds 1,2 with pipe write-ends it's reading
from. Fork children inherit those pipe fds
(because `_close_inherited_fds` correctly preserves
stdio). High-volume subactor error-log tracebacks
(7+ actors each logging multiple
`RemoteActorError`/`ExceptionGroup` tracebacks on
the error-propagation cascade) fill the 64KiB Linux
pipe buffer. Subactor writes block. The subactor
can't progress. The process doesn't exit. The
parent's `_ForkedProc.wait` (now pidfd-based and
cancellable, but nothing's cancelling here since
the test body already completed) keeps the pipe
reader alive... but pytest isn't draining its end
fast enough because test-teardown/fixture-cleanup
is in progress.

The exact mechanism may differ slightly: pytest's
capture fixture MIGHT be actively reading, with the
subactors simply writing faster than it drains, or
pytest might itself be blocked on the finalization
step.

Either way, `-s` conclusively fixes it.

### Why I ruled this out earlier (and shouldn't have)

Earlier in this investigation I tested
`test_nested_multierrors` with and without `-s`, and
both hung. That's because AT THAT TIME fixes 1-4
weren't all in place yet. The test was hanging at
multiple deeper levels long before reaching the
"generate lots of error-log output" phase. Once
the cascade actually tore down cleanly, enough
output was produced to hit the capture-pipe limit.

**A classic order-of-operations mistake in
debugging:** ruling something out too early based
on a test that was actually failing for a
different reason.

### Fix direction (next session)

Redirect subactor stdout/stderr to `/dev/null` (or
a session-scoped log file) in the fork-child
prelude, right after `_close_inherited_fds()`. This
severs the inherited pytest-capture pipes and lets
subactor output flow elsewhere. Under normal
production use (non-pytest), stdout/stderr would
be the TTY — we'd want to keep that. So the
redirect should be conditional or opt-in via the
`child_sigint`/proc_kwargs flag family.
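
A minimal sketch of the redirect; the placement and any
opt-in flag are assumptions, only the `dup2` dance is
standard:

```python
import os

def _redirect_child_stdio(target: str = '/dev/null') -> None:
    # point fds 1/2 at `target` so the inherited pytest-capture
    # pipe write-ends are dropped; fd 0 is left alone
    sink = os.open(target, os.O_WRONLY | os.O_APPEND | os.O_CREAT, 0o644)
    try:
        os.dup2(sink, 1)  # stdout
        os.dup2(sink, 2)  # stderr
    finally:
        os.close(sink)
```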

Alternative: document this as a gotcha and recommend
`pytest -s` for any tests using the
`subint_forkserver` backend with multi-level actor
trees. Simpler, user-visible, no code change.

### Current state

- The skip-mark on `test_nested_multierrors[subint_forkserver]`
  is restored, with the reason pointing here.
- The test is confirmed passing with `-s` after all 4
  cascade fixes were applied.
- The 4 cascade fixes are NOT wasted — they're
  correct hardening regardless of the capture-pipe
  issue, AND without them we'd never reach the
  "actually produces enough output to fill the
  pipe" state.

## Stopgap (landed)

`test_nested_multierrors` is skip-marked under
`subint_forkserver` via
`@pytest.mark.skipon_spawn_backend('subint_forkserver',
reason='...')`, cross-referenced to this doc. The mark
should be dropped once the peer-channel-loop exit
issue is fixed.

## References

- `tractor/spawn/_subint_forkserver.py::fork_from_worker_thread`
  — the primitive whose post-fork FD hygiene is
  probably the culprit.
- `tractor/spawn/_subint_forkserver.py::subint_forkserver_proc`
  — the backend function that orchestrates the
  graceful cancel path hitting this bug.
- `tractor/spawn/_subint_forkserver.py::_ForkedProc`
  — the `trio.Process`-compatible shim; NOT the
  failing component (confirmed via thread-dump).
- `tests/test_cancellation.py::test_nested_multierrors`
  — the test that surfaced the hang.
- `ai/conc-anal/subint_forkserver_orphan_sigint_hang_issue.md`
  — sibling hang class; probably the same underlying
  fork-FD-inheritance root cause.
- tractor issue #379 — subint backend tracking.
# Revisit `subint_forkserver` thread-cache constraints once msgspec PEP 684 support lands

> **Tracked at:** [#450](https://github.com/goodboy/tractor/issues/450)

Follow-up tracker for cleanup work gated on msgspec's
PEP 684 adoption upstream ([jcrist/msgspec#563](https://github.com/jcrist/msgspec/issues/563)).

Context — why this exists
-------------------------

The `tractor.spawn._subint_forkserver` submodule currently
carries two "non-trio" thread-hygiene constraints whose
necessity is tangled with issues that *should* dissolve
under PEP 684 isolated-mode subinterpreters:

1. `fork_from_worker_thread()` / `run_subint_in_worker_thread()`
   internally allocate a **dedicated `threading.Thread`**
   rather than using `trio.to_thread.run_sync()`.
2. The test helper is named
   `run_fork_in_non_trio_thread()` — the
   `non_trio` qualifier is load-bearing today.

This doc catalogs *why* those constraints exist, which of
them isolated mode would fix, and what the
audit-and-cleanup path looks like once msgspec #563 is
resolved.

The three reasons the constraints exist
---------------------------------------

### 1. GIL-starvation class → fixed by PEP 684 isolated mode

The class-A hang documented in
`subint_sigint_starvation_issue.md` is entirely about
legacy-config subints **sharing the main GIL**. Once
msgspec #563 lands and tractor flips
`tractor.spawn._subint` to
`concurrent.interpreters.create()` (isolated config), each
subint gets its own GIL. Abandoned subint threads can't
contend for main's GIL → can't starve the main trio loop
→ the signal-wakeup pipe drains normally → no SIGINT drop.

This class of hazard **dissolves entirely**. The
non-trio-thread requirement for *this reason* disappears.

### 2. Destroy race / tstate recycling → orthogonal; unclear

The `subint_proc` dedicated-thread fix (commit `26fb8206`)
addressed a different issue: `_interpreters.destroy(interp_id)`
was blocking on a trio-cache worker that had run an
earlier `interp.exec()` for that subint. The working
hypothesis at the time was "the cached thread retains the
subint's tstate".

But tstate handling is **not specific to GIL mode** —
`_PyXI_Enter` / `_PyXI_Exit` (the C-level machinery both
configs use to enter/leave a subint from a thread) should
restore the caller's tstate regardless of GIL config. So
isolated mode **doesn't obviously fix this**. It might be:

- A py3.13 bug fixed in later versions — we saw the race
  first on 3.13 and never re-tested on 3.14 after moving
  to dedicated threads.
- A genuine CPython quirk around cached threads that
  exec'd into a subint, persisting across GIL modes.
- Something else we misdiagnosed — the empirical fix
  (dedicated thread) worked but the analysis may have
  been incomplete.

The only way to know: once we're on isolated mode,
empirically retry `trio.to_thread.run_sync(interp.exec, ...)`
and see if `destroy()` still blocks. If it does, keep the
dedicated thread; if not, one constraint is relaxed.

### 3. Fork-from-main-interp-tstate (the constraint in this module's helper names)

The fork-from-main-interp-tstate invariant — CPython's
`PyOS_AfterFork_Child` →
`_PyInterpreterState_DeleteExceptMain` gate documented in
`subint_fork_blocked_by_cpython_post_fork_issue.md` — is
about the calling thread's **current** tstate at the
moment `os.fork()` runs. If trio's cache threads never
enter subints at all, their tstate is plain main-interp,
and forking from them would be fine.

The reason the smoke test and the
`run_fork_in_non_trio_thread` test helper
currently use a dedicated `threading.Thread` is narrow:
**we don't want to risk a trio cache thread that has
previously been used as a subint driver being the one that
picks up the fork job**. If the cached tstate doesn't get
cleared (back to reason #2), the fork's child-side
post-init would see the wrong interp and abort.

In an isolated-mode world where msgspec works:

- `subint_proc` would use the public
  `concurrent.interpreters.create()` + `Interpreter.exec()`
  / `Interpreter.close()` — which *should* handle tstate
  cleanly (they're the "blessed" API).
- If so, trio's cache threads are safe to fork from
  regardless of whether they've previously driven subints.
- → the `non_trio` qualifier in
  `run_fork_in_non_trio_thread` becomes
  *overcautious* rather than load-bearing, and the
  dedicated-thread primitives in `_subint_forkserver.py`
  can likely be replaced with straight
  `trio.to_thread.run_sync()` wrappers.

TL;DR
-----

| constraint | fixed by isolated mode? |
|---|---|
| GIL starvation (class A) | **yes** |
| destroy race on cached worker | unclear — empirical test on py3.14 + isolated API required |
| fork-from-main-tstate requirement on worker | **probably yes, conditional on the destroy-race question above** |

If #2 also resolves on py3.14+ with isolated mode,
tractor could drop the `non_trio` qualifier from the fork
helper's name and just use `trio.to_thread.run_sync(...)`
for everything. But **we shouldn't do that preemptively**
— the current cautious design is cheap (one dedicated
thread per fork / per subint-exec) and correct.

Audit plan when msgspec #563 lands
----------------------------------

Assuming msgspec grows `Py_mod_multiple_interpreters`
support:

1. **Flip `tractor.spawn._subint` to isolated mode.** Drop
   the `_interpreters.create('legacy')` call in favor of
   the public API (`concurrent.interpreters.create()` +
   `Interpreter.exec()` / `Interpreter.close()`). Run the
   three `ai/conc-anal/subint_*_issue.md` reproducers —
   class-A (`test_stale_entry_is_deleted` etc.) should
   pass without the `skipon_spawn_backend('subint')` marks
   (revisit the marker inventory).

2. **Empirical destroy-race retest.** In `subint_proc`,
   swap the dedicated `threading.Thread` back to
   `trio.to_thread.run_sync(Interpreter.exec, ...,
   abandon_on_cancel=False)` and run the full subint test
   suite (see the sketch after this list). If
   `Interpreter.close()` (or the backing destroy) blocks
   the same way the legacy version did, revert and keep
   the dedicated thread.

3. **If #2 is clean**, audit `_subint_forkserver.py`:
   - Rename `run_fork_in_non_trio_thread` → drop the
     `_non_trio_` qualifier (e.g. `run_fork_in_thread`) or
     inline the two-line `trio.to_thread.run_sync` call at
     the call sites and drop the helper entirely.
   - Consider whether `fork_from_worker_thread` +
     `run_subint_in_worker_thread` still warrant being
     separate module-level primitives or whether they
     collapse into a compound
     `trio.to_thread.run_sync`-driven pattern inside the
     (future) `subint_forkserver_proc` backend.

4. **Doc fallout.** `subint_sigint_starvation_issue.md`
   and `subint_cancel_delivery_hang_issue.md` both cite
   the legacy-GIL-sharing architecture as the root cause.
   Close them with commit-refs to the isolated-mode
   migration. This doc itself should get a closing
   post-mortem section noting which of #1/#2/#3 actually
   resolved vs. persisted.
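
A hedged sketch of the step-2 experiment, assuming the
PEP 734 `concurrent.interpreters` API on py3.14+; whether
`close()` still blocks on a cache thread is exactly what's
under test:

```python
import trio
from concurrent import interpreters  # PEP 734, py3.14+

async def exec_in_subint_via_cache_thread(code: str) -> None:
    interp = interpreters.create()  # isolated config: per-interp GIL
    try:
        # the experiment: drive the subint from a trio cache thread
        # instead of a dedicated threading.Thread
        await trio.to_thread.run_sync(
            interp.exec,
            code,
            abandon_on_cancel=False,
        )
    finally:
        # the question under test: does this still wedge the way
        # _interpreters.destroy() did on a recycled cache thread?
        interp.close()
```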

References
----------

- `tractor.spawn._subint_forkserver` — the in-tree module
  whose constraints this doc catalogs.
- `ai/conc-anal/subint_sigint_starvation_issue.md` — the
  GIL-starvation class.
- `ai/conc-anal/subint_cancel_delivery_hang_issue.md` —
  sibling Ctrl-C-able hang class.
- `ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md`
  — why fork-from-subint is blocked (this drives the
  forkserver-via-non-subint-thread workaround).
- `ai/conc-anal/subint_fork_from_main_thread_smoketest.py`
  — empirical validation for the workaround.
- [PEP 684 — per-interpreter GIL](https://peps.python.org/pep-0684/)
- [PEP 734 — `concurrent.interpreters` public API](https://peps.python.org/pep-0734/)
- [jcrist/msgspec#563 — PEP 684 support tracker](https://github.com/jcrist/msgspec/issues/563)
- tractor issue #379 — subint backend tracking.
# `subint` backend: abandoned-subint thread can wedge main trio event loop (Ctrl-C unresponsive)

Follow-up to the Phase B subint spawn-backend PR (see
`tractor.spawn._subint`, issue #379). The hard-kill escape
hatch we landed (`_HARD_KILL_TIMEOUT`, bounded shields,
`daemon=True` driver-thread abandonment) handles *most*
stuck-subint scenarios cleanly, but there's one class of
hang that can't be fully escaped from within tractor: a
still-running abandoned sub-interpreter can starve the
**parent's** trio event loop to the point where **SIGINT is
effectively dropped at the kernel ↔ Python boundary** —
making the pytest process un-Ctrl-C-able.

## Symptom

Running `test_stale_entry_is_deleted[subint]` under
`--spawn-backend=subint`:

1. The test spawns a subactor (`transport_fails_actor`)
   which kills its own IPC server and then
   `trio.sleep_forever()`s.
2. The parent tries `Portal.cancel_actor()` → channel
   disconnected → fast return.
3. Nursery teardown triggers our `subint_proc` cancel path.
   Portal-cancel fails (dead channel),
   `_HARD_KILL_TIMEOUT` fires, the driver thread is
   abandoned (`daemon=True`), and
   `_interpreters.destroy(interp_id)` raises
   `InterpreterError` (because the subint is still
   running).
4. The test appears to hang indefinitely at the *outer*
   `async with tractor.open_nursery() as an:` exit.
5. `Ctrl-C` at the terminal does nothing. The pytest
   process is un-interruptable.

## Evidence

### `strace` on the hung pytest process

```
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
write(37, "\2", 1)            = -1 EAGAIN (Resource temporarily unavailable)
rt_sigreturn({mask=[WINCH]})  = 140585542325792
```

Translated:

- The kernel delivers `SIGINT` to pytest.
- CPython's C-level signal handler fires and tries to
  write the signal-number byte (`0x02` = SIGINT) to fd 37
  — the **Python signal-wakeup fd** (set via
  `signal.set_wakeup_fd()`, which trio uses to wake its
  event loop on signals).
- The write returns `EAGAIN` — **the pipe is full**.
  Nothing is draining it.
- `rt_sigreturn` with the signal masked off — the signal
  is "handled" from the kernel's perspective but the
  actual Python-level handler (and therefore trio's
  `KeyboardInterrupt` delivery) never runs.

### Stack dump (via `tractor.devx.dump_on_hang`)

At 20s into the hang, only the **main thread** is visible:

```
Thread 0x...7fdca0191780 [python] (most recent call first):
  File ".../trio/_core/_io_epoll.py", line 245 in get_events
  File ".../trio/_core/_run.py", line 2415 in run
  File ".../tests/discovery/test_registrar.py", line 575 in test_stale_entry_is_deleted
  ...
```

No driver thread shows up. The abandoned-legacy-subint
thread still exists from the OS's POV (it's still running
inside `_interpreters.exec()` driving the subint's
`trio.run()` on `trio.sleep_forever()`) but the **main
interp's faulthandler can't see threads currently executing
inside a sub-interpreter's tstate**. Concretely: the thread
is alive, holding state we can't introspect from here.

## Root cause analysis

The most consistent explanation for both observations:

1. **Legacy-config subinterpreters share the main GIL.**
   PEP 734's public `concurrent.interpreters.create()`
   defaults to `'isolated'` (per-interp GIL), but tractor
   uses `_interpreters.create('legacy')` as a workaround
   for C extensions that don't yet support PEP 684
   (notably `msgspec`, see
   [jcrist/msgspec#563](https://github.com/jcrist/msgspec/issues/563)).
   Legacy-mode subints share process-global state,
   including the GIL.

2. **Our abandoned subint thread never exits.** After our
   hard-kill timeout, `driver_thread.join()` is abandoned
   via `abandon_on_cancel=True` and the thread is
   `daemon=True` so proc-exit won't block on it — but the
   thread *itself* is still alive inside
   `_interpreters.exec()`, driving a `trio.run()` that
   will never return (the subint actor is in
   `trio.sleep_forever()`).

3. **`_interpreters.destroy()` cannot force-stop a running
   subint.** It raises `InterpreterError` on any
   still-running subinterpreter; there is no public
   CPython API to force-destroy one.

4. **Shared GIL + non-terminating subint thread → main
   trio loop starvation.** Under enough load (the subint's
   trio event loop iterating in the background, IPC-layer
   tasks still in the subint, etc.) the main trio event
   loop can fail to iterate frequently enough to drain its
   wakeup pipe. Once that pipe fills, `SIGINT` writes from
   the C signal handler return `EAGAIN` and signals are
   silently dropped — exactly what `strace` shows. (A
   minimal standalone demo of the pipe-full drop follows
   this list.)
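
The pipe-full drop is easy to demonstrate outside tractor
entirely. A hedged standalone sketch (Linux): fill a wakeup
pipe nobody drains, then deliver a signal whose wakeup byte
gets dropped. Note the toy's Python-level handler still
runs afterwards, since this main thread returns to running
bytecode; in the real hang the main thread is parked in
`epoll_wait`, so losing the byte means the loop never wakes
to run the handler at all:

```python
import os
import signal

# no-op Python-level handler so the demo process survives
signal.signal(signal.SIGUSR1, lambda signum, frame: None)

r, w = os.pipe()
os.set_blocking(w, False)  # set_wakeup_fd requires non-blocking
old = signal.set_wakeup_fd(w)
try:
    # nothing drains `r`, so the pipe buffer (64KiB on Linux)
    # backs up, the same state a starved trio loop ends in
    while True:
        try:
            os.write(w, b'\x02')
        except BlockingIOError:
            break  # full
    # the C handler's wakeup-byte write now fails with EAGAIN,
    # matching the strace fingerprint above
    os.kill(os.getpid(), signal.SIGUSR1)
finally:
    signal.set_wakeup_fd(old)
    os.close(r)
    os.close(w)
```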

The shielded
`await actor_nursery._join_procs.wait()` at the top of
`subint_proc` (inherited unchanged from the `trio_proc`
pattern) is structurally involved too: if main trio *does*
get a schedule slice, it'd find the `subint_proc` task
parked on `_join_procs` under shield — which traps whatever
`Cancelled` arrives. But that's a second-order effect; the
signal-pipe-full condition is the primary "Ctrl-C doesn't
work" cause.

## Why we can't fix this from inside tractor

- **No force-destroy API.** CPython provides neither a
  `_interpreters.force_destroy()` nor a thread-
  cancellation primitive (`pthread_cancel` is actively
  discouraged and unavailable on Windows). A subint stuck
  in pure-Python loops (or worse, C code that doesn't poll
  for signals) is structurally unreachable from outside.
- **The shared GIL is the root scheduling issue.** As long
  as we're forced into legacy-mode subints for `msgspec`
  compatibility, the abandoned-thread scenario is
  fundamentally a process-global GIL-starvation window.
- **`signal.set_wakeup_fd()` is process-global.** Even if
  we wanted to put our own drainer on the wakeup pipe,
  only one party owns it at a time.

## Current workaround

- **Fixture-side SIGINT loop on the `daemon` subproc** (in
  this test's `daemon: subprocess.Popen` fixture in
  `tests/conftest.py`; sketched below). The daemon dying
  closes its end of the registry IPC, which unblocks a
  pending recv in main trio's IPC-server task, which lets
  the event loop iterate, which drains the wakeup pipe,
  which finally delivers the test-harness SIGINT.
- **Module-level skip on py3.13**
  (`pytest.importorskip('concurrent.interpreters')`) — the
  private `_interpreters` C module exists on 3.13 but the
  multi-trio-task interaction hangs silently there,
  independently of this issue.
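
A hedged sketch of that fixture-side loop's shape (the
real fixture lives in `tests/conftest.py`; names and
timings here are illustrative):

```python
import signal
import subprocess
import time

def sigint_until_dead(proc: subprocess.Popen, tries: int = 20) -> None:
    # repeatedly SIGINT the daemon subproc; its death closes the
    # registry IPC end and lets the wedged parent loop wake up
    for _ in range(tries):
        if proc.poll() is not None:
            return
        proc.send_signal(signal.SIGINT)
        time.sleep(0.1)
    proc.kill()  # last resort
    proc.wait()
```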

## Path forward

1. **Primary**: upstream `msgspec` PEP 684 adoption
   ([jcrist/msgspec#563](https://github.com/jcrist/msgspec/issues/563)).
   Unlocks `concurrent.interpreters.create()` isolated
   mode → per-interp GIL → abandoned subint threads no
   longer starve the parent's main trio loop. At that
   point we can flip `_subint.py` back to the public API
   (`create()` / `Interpreter.exec()` / `Interpreter.close()`)
   and drop the private `_interpreters` path.

2. **Secondary**: watch CPython for a public
   force-destroy primitive. If something like
   `Interpreter.close(force=True)` lands, we can use it as
   a hard-kill final stage and actually tear down
   abandoned subints.

3. **Harness-level**: document the fixture-side SIGINT
   loop pattern as the "known workaround" for subint-
   backend tests that can leave background state holding
   the main event loop hostage.

## References

- PEP 734 (`concurrent.interpreters`):
  <https://peps.python.org/pep-0734/>
- PEP 684 (per-interpreter GIL):
  <https://peps.python.org/pep-0684/>
- `msgspec` PEP 684 tracker:
  <https://github.com/jcrist/msgspec/issues/563>
- CPython `_interpretersmodule.c` source:
  <https://github.com/python/cpython/blob/main/Modules/_interpretersmodule.c>
- `tractor.spawn._subint` module docstring (in-tree
  explanation of the legacy-mode choice and its
  tradeoffs).

## Reproducer

```
./py314/bin/python -m pytest \
    tests/discovery/test_registrar.py::test_stale_entry_is_deleted \
    --spawn-backend=subint \
    --tb=short --no-header -v
```

Hangs indefinitely without the fixture-side SIGINT loop;
with the loop, the test completes (albeit with the
abandoned-thread warning in logs).

## Additional known-hanging tests (same class)

All three tests below exhibit the same
signal-wakeup-fd-starvation fingerprint (`write() → EAGAIN`
on the wakeup pipe after enough SIGINT attempts) and
share the same structural cause — abandoned legacy-subint
driver threads contending with the main interpreter for
the shared GIL until the main trio loop can no longer
drain its wakeup pipe fast enough to deliver signals.

They're listed separately because each exposes the class
under a different load pattern worth documenting.

### `tests/discovery/test_registrar.py::test_stale_entry_is_deleted[subint]`

Original exemplar — see the **Symptom** and **Evidence**
sections above. One abandoned subint
(`transport_fails_actor`, stuck in `trio.sleep_forever()`
after self-cancelling its IPC server) is sufficient to
tip main into starvation once the harness's `daemon`
fixture subproc keeps its half of the registry IPC alive.

### `tests/test_cancellation.py::test_cancel_while_childs_child_in_sync_sleep[subint-False]`

Cancel a grandchild that's in a sync Python sleep, from
two nurseries up. The test's own docstring declares the
dependency: "its parent should issue a 'zombie reaper' to
hard kill it after sufficient timeout" — which for
`trio`/`mp_*` is an OS-level `SIGKILL` of the grandchild
subproc. **Under `subint` there's no equivalent** (no
public CPython API to force-destroy a running
sub-interpreter), so the grandchild's sync-sleeping
`trio.run()` persists inside its abandoned driver thread
indefinitely. The nested actor-tree (parent → child →
grandchild, all subints) means a single cancel triggers
multiple concurrent hard-kill abandonments, each leaving
a live driver thread.

This test often only manifests the starvation under
**full-suite runs** rather than solo execution —
earlier-in-session subint tests also leave abandoned
driver threads behind, and the combined population is
what actually tips main trio into starvation. Solo runs
may stay Ctrl-C-able with fewer abandoned threads in the
mix.

### `tests/test_cancellation.py::test_multierror_fast_nursery[subint-25-0.5]`

Nursery-error-path throughput stress-test parametrized
for **25 concurrent subactors**. When the multierror
fires and the nursery cancels, every subactor goes
through our `subint_proc` teardown. The bounded
hard-kills run in parallel (all `subint_proc` tasks are
sibling trio tasks), so the timeout budget is ~3s total
rather than 3s × 25. After that, **25 abandoned
`daemon=True` driver threads are simultaneously alive** —
an extreme pressure multiplier on the same mechanism.

The `strace` fingerprint is striking under this load: six
or more **successful** `write(16, "\2", 1) = 1` calls
(main trio getting brief GIL slices, each long enough to
drain exactly one wakeup-pipe byte) before finally
saturating with `EAGAIN`:
```
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
write(16, "\2", 1) = 1
rt_sigreturn({mask=[WINCH]}) = 140141623162400
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
write(16, "\2", 1) = 1
rt_sigreturn({mask=[WINCH]}) = 140141623162400
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
write(16, "\2", 1) = 1
rt_sigreturn({mask=[WINCH]}) = 140141623162400
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
write(16, "\2", 1) = 1
rt_sigreturn({mask=[WINCH]}) = 140141623162400
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
write(16, "\2", 1) = 1
rt_sigreturn({mask=[WINCH]}) = 140141623162400
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
write(16, "\2", 1) = 1
rt_sigreturn({mask=[WINCH]}) = 140141623162400
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
write(16, "\2", 1) = -1 EAGAIN (Resource temporarily unavailable)
rt_sigreturn({mask=[WINCH]}) = 140141623162400
```

Those successful writes indicate CPython's
`sys.getswitchinterval()`-based GIL round-robin *is*
giving main brief slices — just never long enough to run
the Python-level signal handler through to the point
where trio converts the delivered SIGINT into a
`Cancelled` on the appropriate scope. Once the
accumulated write rate outpaces main's drain rate, the
pipe saturates and subsequent signals are silently
dropped.

The `pstree` below (pid `530060` = the hung `pytest`) shows
the subint-driver thread population at the moment of
capture. Even with fewer than the full 25 shown (pstree
truncates thread names to `subint-driver[<interp_id>` —
interpreters `3` and `4` visible across 16 thread
entries), the GIL-contender count is more than enough to
explain the starvation:
```
>>> pstree -snapt 530060
systemd,1 --switched-root --system --deserialize=40
  └─login,1545 --
      └─bash,1872
          └─sway,2012
              └─alacritty,70471 -e xonsh
                  └─xonsh,70487 .../bin/xonsh
                      └─uv,70955 run xonsh
                          └─xonsh,70959 .../py314/bin/xonsh
                              └─python,530060 .../py314/bin/pytest -v tests/test_cancellation.py --spawn-backend=subint
                                  ├─{subint-driver[3},531857
                                  ├─{subint-driver[3},531860
                                  ├─{subint-driver[3},531862
                                  ├─{subint-driver[3},531866
                                  ├─{subint-driver[3},531877
                                  ├─{subint-driver[3},531882
                                  ├─{subint-driver[3},531884
                                  ├─{subint-driver[3},531945
                                  ├─{subint-driver[3},531950
                                  ├─{subint-driver[3},531952
                                  ├─{subint-driver[4},531956
                                  ├─{subint-driver[4},531959
                                  ├─{subint-driver[4},531961
                                  ├─{subint-driver[4},531965
                                  ├─{subint-driver[4},531968
                                  └─{subint-driver[4},531979
```

(`pstree` uses `{...}` to denote threads rather than
processes — these are all the **driver OS-threads** our
`subint_proc` creates with the name
`f'subint-driver[{interp_id}]'`. Every one of them is
still alive, executing `_interpreters.exec()` inside a
sub-interpreter our hard-kill has abandoned. At 16+
abandoned driver threads competing for the main GIL, the
main-interpreter trio loop gets starved and signal
delivery stalls.)

@ -0,0 +1,273 @@

# `test_register_duplicate_name` racy connect-failure on `daemon` fixture readiness

## Symptom

`tests/test_multi_program.py::test_register_duplicate_name`
fails intermittently under BOTH transports + ALL spawn
backends with connect-refused errors:

```
# under --tpt-proto=uds
FAILED tests/test_multi_program.py::test_register_duplicate_name
- ConnectionRefusedError: [Errno 111] Connection refused
( ^^^ this exc was collapsed from a group ^^^ )

# under --tpt-proto=tcp
FAILED tests/test_multi_program.py::test_register_duplicate_name
- OSError: all attempts to connect to 127.0.0.1:36003 failed
( ^^^ this exc was collapsed from a group ^^^ )
```

Distinct from the cancel-cascade `TooSlowError` flake
class — see
`cancel_cascade_too_slow_under_main_thread_forkserver_issue.md`.
This is a **connect-time race** before the daemon is
fully ready to `accept()`, not teardown-cascade
slowness.

## Root cause: blind `time.sleep()` in `daemon` fixture

`tests/conftest.py::daemon` boots a sub-py-process via
`subprocess.Popen([python, '-c', 'tractor.run_daemon(...)'])`,
then **blindly sleeps** a fixed delay before yielding
`proc` to the test:

```python
# excerpt from tests/conftest.py::daemon
proc = subprocess.Popen([
    sys.executable, '-c', code,
])

bg_daemon_spawn_delay: float = _PROC_SPAWN_WAIT  # 0.6
if tpt_proto == 'uds':
    bg_daemon_spawn_delay += 1.6
if _non_linux and ci_env:
    bg_daemon_spawn_delay += 1

# XXX, allow time for the sub-py-proc to boot up.
# !TODO, see ping-polling ideas above!
time.sleep(bg_daemon_spawn_delay)

assert not proc.returncode
yield proc
```

Inherent fragility: the delay is "long enough on dev
boxes most of the time" but has no actual
synchronization with the daemon's `bind()` + `listen()`
completion. Under any of:

- Loaded box (CI parallelism, big rebuild in
  background, low cpu freq)
- Cold first-run (`importlib` cache miss, JIT warmup)
- Higher-than-expected `tractor` import cost
- Filesystem latency (UDS sockfile create, slow
  tmpfs)

...the sleep finishes BEFORE the daemon has bound its
listen socket → first test client call to
`tractor.find_actor()` / `wait_for_actor()` /
`open_nursery(registry_addrs=[reg_addr])`'s implicit
connect → `ConnectionRefusedError` (TCP) or
`FileNotFoundError`/`ConnectionRefusedError` (UDS).

## Reproducer

Easiest: run the suite under load.

```bash
# create CPU pressure on another core in parallel
stress-ng --cpu 2 --timeout 600s &

./py313/bin/python -m pytest \
  tests/test_multi_program.py::test_register_duplicate_name \
  --spawn-backend=main_thread_forkserver \
  --tpt-proto=tcp -v
```

Reproduces ~30-50% of the time on a dev laptop. On a
quiet idle box, it may need 5-10 runs to hit.

## Why the existing `_PROC_SPAWN_WAIT` tuning is inadequate

The just-shipped `bg_daemon_spawn_delay` rename (the
de-monotonic-grow fix) removed the *accumulation* bug
where each invocation made the NEXT test's wait longer
too. Net effect: every invocation now uses the SAME
`0.6 + 1.6` (UDS) or `0.6` (TCP) sleep, with no growth.
Good — but it does NOTHING for the underlying race. Each
individual test still relies on a blind sleep that may
or may not be sufficient.

Bumping the constant higher pushes the flake rate down
but never to zero AND adds dead time to every
non-flaking run. Not a fix, just a knob.

## Side effects

- **Inter-test cascade**: a single failure can cascade
  via leaked subprocesses (the `daemon` fixture's
  cleanup may not fully tear down a daemon that never
  reached "ready"). The `_reap_orphaned_subactors`
  session-end + `_track_orphaned_uds_per_test`
  per-test fixtures handle most of this now, but the
  affected test itself still fails.
- **Worsens under fork-spawn backends**: the daemon
  has more init work
  (`_main_thread_forkserver`-coordinator-thread
  startup, etc.) so the sleep has to cover MORE.

## Fix design — replace blind sleep with active poll

The right primitive is **poll the daemon's bind
address until it accepts a connection or we time
out**, with the timeout being a hard ceiling rather
than a baseline. Two implementation paths:

### Path A — TCP/UDS connect-poll loop

Try `socket.connect(reg_addr)` in a tight loop with
short backoff (~50ms), succeed on the first non-error
return, fail loud on a hard cap (e.g. 10s). The same
primitive works for both transports because both use
`socket.connect()` semantics.

Rough shape:

```python
import os
import socket
import time

def _wait_for_daemon_ready(
    reg_addr,
    tpt_proto: str,
    timeout: float = 10.0,
    poll_interval: float = 0.05,
) -> None:
    deadline = time.monotonic() + timeout
    while True:
        if tpt_proto == 'tcp':
            sock = socket.socket(socket.AF_INET)
            target = reg_addr  # (host, port)
        else:  # uds
            sock = socket.socket(socket.AF_UNIX)
            target = os.path.join(*reg_addr)
        try:
            sock.settimeout(poll_interval)
            sock.connect(target)
        except (
            ConnectionRefusedError,
            FileNotFoundError,
            socket.timeout,
        ) as exc:
            # close the probe socket so retries don't leak fds
            sock.close()
            if time.monotonic() >= deadline:
                raise TimeoutError(
                    f'Daemon never accepted on {target!r} '
                    f'within {timeout}s'
                ) from exc
            time.sleep(poll_interval)
        else:
            sock.close()
            return
```
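
Wired into the fixture, this replaces the blind sleep one-for-one
(a sketch; the names follow the excerpt above):

```python
# in tests/conftest.py::daemon, instead of
# `time.sleep(bg_daemon_spawn_delay)`:
_wait_for_daemon_ready(reg_addr, tpt_proto)

assert not proc.returncode
yield proc
```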

Pros: trivial primitive, no tractor-runtime
dependency, works pre-yield in the fixture body,
fails fast on a truly-broken daemon.
Cons: doesn't actually do an IPC handshake, just
proves the listen-side is up. A daemon that bound but
hasn't initialized its registrar table yet would
still race.

### Path B — `tractor.find_actor()` poll

Use the actual discovery API the test would call:

```python
import trio
import tractor

async def _wait_for_daemon_ready_via_discovery(
    reg_addr,
    timeout: float = 10.0,
    poll_interval: float = 0.05,
):
    deadline = trio.current_time() + timeout
    async with tractor.open_root_actor(
        registry_addrs=[reg_addr],
        # ephemeral root just for the probe
    ):
        while True:
            try:
                async with tractor.find_actor(
                    'registrar',  # daemon's own name
                    registry_addrs=[reg_addr],
                ) as portal:
                    if portal is not None:
                        return
            except Exception:
                pass
            if trio.current_time() >= deadline:
                raise TimeoutError(...)
            await trio.sleep(poll_interval)
```

Pros: actually proves the discovery path works, and
handles the "bound but not ready" case naturally.
Cons: requires booting an ephemeral root actor JUST
for the probe (overhead), more code, and runs in trio,
which complicates the sync-fixture context — it needs
a `trio.run()` wrapper.
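
That wrapper is small (a sketch assuming the async helper above;
`functools.partial` binds the keyword args since `trio.run()`
only forwards positionals):

```python
import functools

import trio

def wait_for_daemon_ready_sync(reg_addr, **kwargs) -> None:
    # drive the async probe to completion from the sync fixture body
    trio.run(
        functools.partial(
            _wait_for_daemon_ready_via_discovery,
            reg_addr,
            **kwargs,
        )
    )
```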

### Recommended: Path A with optional handshake check

Path A is much simpler + handles 95% of the bug
class. If "bound-but-not-ready" turns out to still
race (it shouldn't — `tractor.run_daemon` doesn't
return from `bind()` until the registrar is
fully populated), escalate to Path B as a focused
follow-up.

## Workarounds (until the fix lands)

1. **Bump `_PROC_SPAWN_WAIT`** higher (current: 0.6).
   2.0–3.0 hides most flakes at the cost of adding
   dead time to every test. Not a fix, but it reduces
   the blast radius while the proper poll lands.
2. **`pytest-rerunfailures`** with `reruns=1` on the
   `daemon` fixture's tests specifically. Hides the
   flake but doesn't address it.
3. **Mark known-affected tests as `xfail(strict=False)`**
   under `--ci`. Lets CI go green at the cost of
   silently hiding regressions.

(Recommend skipping all three — implement the active
poll instead.)

## Investigation next steps

1. Implement Path A as a `_wait_for_daemon_ready()`
   helper in `tests/conftest.py`. Replace the
   `time.sleep(bg_daemon_spawn_delay)` call with it.
2. Drop the `_PROC_SPAWN_WAIT` constant entirely
   (the active poll obsoletes the blind sleep).
3. Run the suite 5-10 times to validate that the flake
   rate drops to 0.
4. If flakes persist, profile whether the daemon
   process exits with non-zero before the poll's
   deadline hits — that'd be a different bug
   (daemon startup crash) that the blind sleep was
   masking.
5. Cross-check `tests/test_multi_program.py::test_*`
   — multiple tests use the `daemon` fixture; all
   should benefit from the same poll primitive.

## Related

- `tests/conftest.py::daemon` — the fixture under
  fix
- `tests/conftest.py::_PROC_SPAWN_WAIT` — the
  constant to drop
- `cancel_cascade_too_slow_under_main_thread_forkserver_issue.md`
  — distinct flake class (cancel-cascade
  `TooSlowError` at teardown, not a connect-time race)
- `trio_wakeup_socketpair_busy_loop_under_fork_issue.md`
  — a different bug entirely; this race was masked
  pre-WakeupSocketpair-patch by the busy-loop
  hangs.

@ -0,0 +1,221 @@

# trio `WakeupSocketpair.drain()` busy-loop in forked child (peer-closed missed-EOF)

## Reproducer

```bash
./py313/bin/python -m pytest \
  tests/test_multi_program.py::test_register_duplicate_name \
  --tpt-proto=tcp \
  --spawn-backend=main_thread_forkserver \
  -v --capture=sys
```

The subactor pegs a CPU core indefinitely; the parent test
hangs waiting for the subactor.

## Empirical evidence (caught alive)

```
$ sudo strace -p <subactor-pid>
recvfrom(6, "", 65536, 0, NULL, NULL) = 0
recvfrom(6, "", 65536, 0, NULL, NULL) = 0
recvfrom(6, "", 65536, 0, NULL, NULL) = 0
... (no `epoll_wait`, no other syscalls, just this back-to-back)
```

Pattern: a tight C-level `recvfrom` loop returning 0 on
each call. No `epoll_wait` between iterations →
**not trio's task scheduler**. A pure synchronous C
loop.

```
$ sudo readlink /proc/<subactor-pid>/fd/6
socket:[<inode>]

$ sudo lsof -p <subactor-pid> | grep ' 6u'
<cmd> <pid> goodboy 6u unix 0xffff... 0t0 <inode> type=STREAM (CONNECTED)
```

fd=6 is an **AF_UNIX socket** in CONNECTED state.
Even though the test uses `--tpt-proto=tcp`, this fd
is NOT a tractor IPC channel — it's an internal
trio socketpair.

## Root cause: `WakeupSocketpair.drain()`

`/site-packages/trio/_core/_wakeup_socketpair.py`:

```python
class WakeupSocketpair:
    def __init__(self) -> None:
        self.wakeup_sock, self.write_sock = socket.socketpair()
        self.wakeup_sock.setblocking(False)
        self.write_sock.setblocking(False)
        ...

    def drain(self) -> None:
        try:
            while True:
                self.wakeup_sock.recv(2**16)
        except BlockingIOError:
            pass
```

`socket.socketpair()` on Linux defaults to AF_UNIX
SOCK_STREAM. Both ends are non-blocking. Normal flow:

1. Signal/wake event → `write_sock.send(b'\x00')`
   queues a byte.
2. `wakeup_sock` becomes readable → trio's epoll
   triggers.
3. Trio calls `drain()` to flush the buffer.
4. `drain()` loops on `wakeup_sock.recv(64KB)`.
5. Eventually the buffer is empty → the non-blocking
   socket raises `BlockingIOError` → except → break.

**Bug surface — peer-closed missed-EOF**:

Non-blocking socket semantics:

- buffer has data → `recv` returns N>0 bytes (loop continues)
- buffer empty → `recv` raises `BlockingIOError`
- **peer FIN'd → `recv` returns 0 bytes (NEITHER exception NOR
  break — infinite tight loop)**

`drain()` does not handle the `b''` return-value
(EOF) case. If `write_sock` has been closed (or the
process holding it is gone), every iteration returns
`b''` → infinite loop → 100% CPU on a single core.
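
The three-way semantics are easy to demo standalone (a minimal
sketch, independent of trio):

```python
import socket

a, b = socket.socketpair()  # AF_UNIX SOCK_STREAM on Linux
a.setblocking(False)

b.send(b'\x00')
assert a.recv(2**16) == b'\x00'  # data queued -> N>0 bytes

try:
    a.recv(2**16)                # buffer empty -> raises
except BlockingIOError:
    pass

b.close()                        # peer FIN
assert a.recv(2**16) == b''      # EOF: b'', no exception...
assert a.recv(2**16) == b''      # ...every time; drain() spins here
a.close()
```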

## Why this triggers under `main_thread_forkserver`

Under `os.fork()` from the forkserver-worker thread:

1. The parent has a `WakeupSocketpair` instance with
   `wakeup_sock=fdN`, `write_sock=fdM`. Both fds are
   open in the parent.
2. Fork → the child inherits BOTH fds (kernel-level fd
   table dup).
3. `_close_inherited_fds()` runs in the child →
   closes everything except stdio. The `wakeup_sock` and
   `write_sock` of the parent's `WakeupSocketpair`
   ARE closed in the child.
4. The child's trio (running fresh) creates its OWN
   `WakeupSocketpair` → NEW fd numbers (e.g. fd 6, 7).
5. **In `infect_asyncio` mode** the asyncio loop is
   the host; trio runs as a guest via
   `start_guest_run`. trio still creates its
   `WakeupSocketpair` in the I/O manager but its
   role is different.

The race window: somewhere between (3) and (5), a
`WakeupSocketpair` Python object reference inherited
via COW (from the parent's pre-fork heap) survives long
enough that `drain()` is called on it AFTER its fds
were closed — and after the child's NEW socketpair has
recycled those fd numbers. The recycled fd is then one
of the child's new socketpair ends, whose peer might be
FIN-flagged (e.g. the parent-process peer-end is
closed).

Or simpler: the `wait_for_actor`/`find_actor` discovery
flow in `test_register_duplicate_name` triggers an
unusual code path where a stale `WakeupSocketpair`
gets `drain()`-called on a fd whose peer has already
closed.

## Why `drain()` shouldn't loop indefinitely on EOF (upstream trio bug)

Even WITHOUT fork, `drain()` should treat `b''` as
EOF and break. The current code is correct for the
"buffer drained on a healthy socketpair" scenario but
incorrect for the "peer is gone" scenario. It's a
defensive-programming gap in trio.

A one-line patch upstream:

```python
def drain(self) -> None:
    try:
        while True:
            data = self.wakeup_sock.recv(2**16)
            if not data:
                break  # peer-closed; nothing more to drain
    except BlockingIOError:
        pass
```

## Workarounds (until the underlying fix lands)

1. **Skip-mark on the fork backend**:
   `tests/test_multi_program.py` →
   `pytest.mark.skipon_spawn_backend('main_thread_forkserver',
   reason='trio WakeupSocketpair.drain busy-loop, see ai/conc-anal/trio_wakeup_socketpair_busy_loop_under_fork_issue.md')`.

2. **Defensive monkey-patch in tractor's
   forkserver-child prelude** — wrap
   `WakeupSocketpair.drain` to handle `b''`:

   ```python
   # in `_actor_child_main` or `_close_inherited_fds`'s
   # post-fork prelude:
   from trio._core._wakeup_socketpair import WakeupSocketpair

   _orig_drain = WakeupSocketpair.drain

   def _safe_drain(self):
       try:
           while True:
               data = self.wakeup_sock.recv(2**16)
               if not data:
                   return  # peer closed
       except BlockingIOError:
           pass

   WakeupSocketpair.drain = _safe_drain
   ```

   Tracks upstream — remove once trio fixes it.

3. **Upstream the fix**: a 1-line PR to `python-trio/trio`
   adding `if not data: break` to `drain()`.

## Investigation next steps

1. **Confirm via py-spy**: when caught alive, detach
   strace first, then
   `sudo py-spy dump --pid <subactor> --locals`. The
   busy thread should show `drain` from `WakeupSocketpair`
   in the call chain.
2. **Identify which write-end peer is closed**: from
   the inode of fd 6, look up the matching peer
   inode via `ss -xp` and see whose process it
   was/is.
3. **Verify the missed-EOF hypothesis**: hand-craft a
   minimal `WakeupSocketpair` repro:

   ```python
   from trio._core._wakeup_socketpair import WakeupSocketpair

   ws = WakeupSocketpair()
   ws.write_sock.close()  # simulate peer-gone
   ws.drain()  # should hang forever
   ```

## Sibling bug

`tests/test_infected_asyncio.py::test_aio_simple_error`
hangs under the same backend with a DIFFERENT
fingerprint (Mode-A deadlock, both parties in
`epoll_wait`, no busy-loop). Distinct root cause —
see `infected_asyncio_under_main_thread_forkserver_hang_issue.md`.

Both share the broader theme: **trio internal-state
initialization isn't fully fork-safe under
`main_thread_forkserver`** for the more exotic
dispatch paths.

## See also

- [#379](https://github.com/goodboy/tractor/issues/379) — subint umbrella
- python-trio/trio#1614 — trio + fork hazards
- `trio._core._wakeup_socketpair.WakeupSocketpair`
  source (the smoking gun)
- `ai/conc-anal/fork_thread_semantics_execution_vs_memory.md`
- `ai/conc-anal/infected_asyncio_under_main_thread_forkserver_hang_issue.md`

@ -0,0 +1,54 @@

---
model: claude-opus-4-6
service: claude
session: (ad-hoc, not tracked via conf.toml)
timestamp: 2026-04-06T17:28:48Z
git_ref: 02b2ef1
scope: tests
substantive: true
raw_file: 20260406T172848Z_02b2ef1_prompt_io.raw.md
---

## Prompt

User asked to extend `tests/test_resource_cache.py` with a test
that reproduces the edge case fixed in commit `02b2ef18` (per-key
locking + user tracking in `maybe_open_context()`). The bug was
originally triggered in piker's `brokerd.kraken` backend where the
same `acm_func` was called with different kwargs, and the old
global `_Cache.users` counter caused:

- teardown skipped for one `ctx_key` because another key's users
  kept the global count > 0
- re-entry hitting `assert not resources.get(ctx_key)` during the
  teardown window

User requested a test that would fail under the old code and pass
with the fix.

## Response summary

Designed and implemented `test_per_ctx_key_resource_lifecycle`
which verifies per-`ctx_key` resource isolation by:

1. Holding resource `'a'` open in a bg task
2. Opening + closing resource `'b'` (same `acm_func`, different
   kwargs) while `'a'` is still alive
3. Re-opening `'b'` and asserting a cache MISS — proving `'b'` was
   torn down independently despite `'a'` keeping its own user
   count > 0

With the old global counter, phase 3 would produce a stale cache
HIT (leaked resource) or crash on the assert.

Also added a trivial `acm_with_resource(resource_id)` ACM helper
at module level.
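
The test's shape, roughly (an illustrative sketch — the exact
`maybe_open_context()` signature and its `(cache_hit, value)`
yield are assumptions inferred from the phases above, not copied
from the committed test):

```python
# illustrative only; see tests/test_resource_cache.py for the
# committed version.
from contextlib import asynccontextmanager as acm

import trio
import tractor

@acm
async def acm_with_resource(resource_id: str):
    yield f'resource-{resource_id}'

async def open_b_expect_miss() -> None:
    async with tractor.trionics.maybe_open_context(
        acm_with_resource,
        kwargs={'resource_id': 'b'},
    ) as (cache_hit, value):
        assert not cache_hit  # 'b' was torn down independently

async def main() -> None:
    async with trio.open_nursery() as tn:

        async def hold_a(task_status=trio.TASK_STATUS_IGNORED):
            async with tractor.trionics.maybe_open_context(
                acm_with_resource,
                kwargs={'resource_id': 'a'},
            ) as _:
                task_status.started()
                await trio.sleep_forever()

        await tn.start(hold_a)      # phase 1: 'a' held open in a bg task
        await open_b_expect_miss()  # phase 2: open + close 'b'
        await open_b_expect_miss()  # phase 3: re-open -> must MISS again
        tn.cancel_scope.cancel()
```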

## Files changed

- `tests/test_resource_cache.py` — add `acm_with_resource` ACM +
  the `test_per_ctx_key_resource_lifecycle` test fn

## Human edits

None — committed as generated (pending user review).

@ -0,0 +1,57 @@

---
model: claude-opus-4-6
service: claude
session: (ad-hoc, not tracked via conf.toml)
timestamp: 2026-04-06T19:31:25Z
git_ref: 85f9c5d
scope: tests
substantive: true
raw_file: 20260406T193125Z_85f9c5d_prompt_io.raw.md
---

## Prompt

User asked to reproduce the exact `assert not resources.get(ctx_key)`
crash originally seen in piker's `brokerd.kraken` backend via
`open_cached_client('kraken')`. Key constraints from the user:

- In piker, the kwargs were the **same** (empty) so all callers
  share one `ctx_key = (fid, ())`
- The root issue is `_Cache.locks` being indexed by `fid`
  rather than `ctx_key`, plus the race window between
  `values.pop()` and `resources.pop()` in `_Cache.run_ctx`
  (the acm `__aexit__` has checkpoints in between)
- Mark the test `@pytest.mark.xfail` so it can be committed
  against the unpatched code and later adjusted once the fix
  from `9e49eddd` is applied

User pointed to piker's `open_cached_client('kraken')` usage
as the real-world pattern to model.

## Response summary

Added `test_moc_reentry_during_teardown` which deterministically
reproduces the teardown race:

1. A `cached_client()` acm (no params, like
   `kraken.api.get_client()`) signals via an `in_aexit` event
   when its `__aexit__` starts, then sleeps to hold the
   window open
2. A `use_and_exit` task enters + exits the cached resource
3. A `reenter_during_teardown` task waits on `in_aexit` then
   re-enters — landing exactly in the window where
   `values` is popped but `resources` still exists
4. The old `assert not resources.get(ctx_key)` fires
5. `@pytest.mark.xfail(raises=AssertionError)` catches it

Test confirmed: `XFAIL` with the exact traceback at
`_mngrs.py:327`.

## Files changed

- `tests/test_resource_cache.py` — add the
  `test_moc_reentry_during_teardown` xfail test fn

## Human edits

None — committed as generated (pending user review).

@ -0,0 +1,52 @@

---
model: claude-opus-4-6
service: claude
session: (ad-hoc, not tracked via conf.toml)
timestamp: 2026-04-06T22:26:48Z
git_ref: f086222
scope: code
substantive: true
raw_file: 20260406T222648Z_f086222_prompt_io.raw.md
---

## Prompt

User applied the per-key locking patch (`f086222d`),
removed the `@pytest.mark.xfail` from
`test_moc_reentry_during_teardown`, and found the
test STILL fails — now with `RuntimeError` instead
of `AssertionError`. Asked to diagnose why the fix
doesn't actually resolve this race.

## Response summary

Traced the exact scheduling sequence across three
concurrent tasks to show that the per-`ctx_key`
locking change doesn't close the fundamental gap:

- `maybe_open_context`'s finally pops the lock and
  sets `no_more_users` in the **exiting caller's
  task**
- But the actual resource cleanup (`values.pop`,
  the acm `__aexit__`, `resources.pop`) runs in the
  **`run_ctx` task** inside `service_tn`
- A re-entering task creates a fresh lock (the old one
  was popped), acquires it immediately, and hits stale
  `resources[ctx_key]` while `run_ctx` is still in
  `__aexit__`

Identified three possible fix strategies:

1. Let `run_ctx` own the lock release/pop after
   `resources.pop()`
2. Add a "teardown complete" event that new callers
   wait on
3. Move `resources.pop()` into `run_ctx`'s inner
   finally (before the acm `__aexit__`)

## Files changed

No code changes — analysis only.

## Human edits

N/A — diagnostic output, no code generated.

@ -0,0 +1,32 @@

---
model: claude-opus-4-6
service: claude
timestamp: 2026-04-06T22:26:48Z
git_ref: f086222
diff_cmd: git diff f086222~1..f086222
---

Here's the problem. The per-key locking doesn't fix this race because the **lock lifetime is decoupled from `run_ctx`'s lifetime**. Trace through what happens:

**`maybe_open_context` finally** (lines 468-495):

1. `users[ctx_key] -= 1` → 0
2. `no_more_users.set()` → wakes `run_ctx` (but it doesn't run yet, just becomes ready)
3. `_Cache.locks.pop(ctx_key)` → **lock is gone**
4. `maybe_open_context` exits

**`run_ctx`** now gets scheduled (lines 237-241):

5. Inner finally: `values.pop(ctx_key)` → values gone
6. `mng.__aexit__()` runs → `in_aexit.set()`, `await trio.sleep(10)` → **CHECKPOINT**

**`reenter_during_teardown`** wakes:

7. Enters `maybe_open_context` → no lock exists for `ctx_key` → creates a **fresh** lock, acquires it immediately
8. `values[ctx_key]` → `KeyError`
9. `resources.get(ctx_key)` → **still exists!** (step 6 hasn't reached `resources.pop()` in `run_ctx`'s outer finally yet)
10. `raise RuntimeError('Caching resources ALREADY exist?!')`

The core issue: `no_more_users.set()` (step 2) and `locks.pop()` (step 3) happen in the **exiting caller's task**, but the actual resource cleanup (`values.pop`, acm `__aexit__`, `resources.pop`) happens in the **`run_ctx` task** inside `service_tn`. There's no synchronization between "lock removed, new callers welcome" and "run_ctx has actually finished tearing down". The per-`ctx_key` lock change just made the lock more granular — it didn't close this gap.

The fix needs to ensure a new caller can't acquire a lock for `ctx_key` until `run_ctx` has fully exited (including `resources.pop()`). Either:

- **don't pop the lock in `maybe_open_context`** — let `run_ctx` release/pop it after `resources.pop()`
- **wait on a "teardown complete" event** in the cache-miss path when stale `resources` exist (sketched below)
- **pop `resources` before the acm `__aexit__`** (move `resources.pop()` into the inner finally alongside `values.pop()`)
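
A sketch of the second option, the "teardown complete" event
(illustrative names only, not the in-tree `_mngrs.py` API):

```python
# toy model: `run_ctx_teardown` stands in for run_ctx's finally
# (the acm __aexit__ with checkpoints runs BEFORE resources.pop(),
# which is today's RuntimeError window); re-entrants wait on the
# event instead of racing it.
import trio

_teardowns: dict[str, trio.Event] = {}
_resources: dict[str, object] = {}

async def run_ctx_teardown(ctx_key: str) -> None:
    done = _teardowns[ctx_key]
    await trio.sleep(0.1)          # __aexit__ checkpoint stand-in
    _resources.pop(ctx_key, None)
    done.set()                     # only NOW may re-entrants proceed
    _teardowns.pop(ctx_key, None)

async def reenter(ctx_key: str) -> None:
    # cache-miss path: a pending teardown means wait, don't raise
    if (done := _teardowns.get(ctx_key)) is not None:
        await done.wait()
    assert not _resources.get(ctx_key)  # guaranteed post-teardown
    _resources[ctx_key] = object()      # fresh entry

async def main() -> None:
    _resources['k'] = object()
    _teardowns['k'] = trio.Event()  # registered as teardown begins
    async with trio.open_nursery() as tn:
        tn.start_soon(run_ctx_teardown, 'k')
        tn.start_soon(reenter, 'k')

trio.run(main)
```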
@ -0,0 +1,74 @@

---
model: claude-opus-4-6
service: claude
session: (ad-hoc, not tracked via conf.toml)
timestamp: 2026-04-13T19:21:16Z
git_ref: f851f28
scope: code
substantive: true
---

## Prompt

User provided a detailed implementation plan for
a new `test_tpt_bind_addrs.py` test suite covering
`open_root_actor(tpt_bind_addrs=...)` — the three
runtime code paths in `_root.py:385-450`:

1. Non-registrar, no explicit bind -> random addrs
2. Registrar, no explicit bind -> registry_addrs
3. Explicit bind given -> `wrap_address()` + merge

The plan specified 6 test functions (~10 parametrized
variants), predicted a type-mixing bug at line 446,
and asked for an in-flight fix if confirmed.

## Response summary

Created `tests/discovery/test_tpt_bind_addrs.py`
with 9 collected test variants across 6 functions:

- `test_registrar_root_tpt_bind_addrs` (3 variants:
  `bind-eq-reg`, `bind-subset-reg`,
  `bind-disjoint-reg`)
- `test_non_registrar_root_tpt_bind_addrs`
  (2 variants: `bind-same-bindspace`,
  `bind-disjoint`)
- `test_non_registrar_default_random_bind`
  (baseline, no explicit bind)
- `test_tpt_bind_addrs_as_maddr_str`
  (multiaddr string input)
- `test_registrar_merge_binds_union`
  (registrar + disjoint bind -> union)
- `test_open_nursery_forwards_tpt_bind_addrs`
  (`open_nursery(**kwargs)` forwarding)

Confirmed and fixed the predicted bug at
`_root.py:446`: the registrar merge path mixed
`Address` objects (`tpt_bind_addrs`) with raw tuples
(`uw_reg_addrs`) inside `set()`, preventing
deduplication and causing a double-bind `OSError`.

Fix: wrap `uw_reg_addrs` before the set union:

```python
# before (broken)
tpt_bind_addrs = list(set(
    tpt_bind_addrs + uw_reg_addrs
))

# after (fixed)
tpt_bind_addrs = list(set(
    tpt_bind_addrs
    + [wrap_address(a) for a in uw_reg_addrs]
))
```

All 9 tests pass after the fix.

## Files changed

- `tests/discovery/test_tpt_bind_addrs.py` (new)
- `tractor/_root.py:446` (bug fix, 1 line)

## Human edits

N/A — pending review.

@ -0,0 +1,50 @@

---
model: claude-opus-4-6
service: claude
session: 76154e65-d8e1-4b5f-9275-0ea45ba7e98a
timestamp: 2026-04-13T20:50:48Z
git_ref: 269d939c
scope: code
substantive: true
raw_file: 20260413T205048Z_269d939c_prompt_io.raw.md
---

## Prompt

Implement a `parse_endpoints()` API in
`tractor.discovery._multiaddr` that lets downstream
projects declare per-actor transport bind addresses
as a service table of actor-name -> multiaddr
strings (loaded from config, e.g. a TOML `[network]`
section). Add type aliases `EndpointsTable` and
`ParsedEndpoints`, plus 7 unit tests covering TCP,
UDS, tuples, mixed input, unsupported protocols, and
edge cases.

A detailed implementation plan was provided
specifying insertion points, type signatures, test
names, and verification commands.

## Response summary

Added a `parse_endpoints()` function with two supporting
type aliases to `_multiaddr.py`. The function
iterates a `dict[str, list[str|tuple]]` service
table and delegates each entry to the existing
`wrap_address()` helper, which already handles maddr
strings, raw tuples, and pre-wrapped `Address`
objects. Seven pure unit tests (no trio runtime)
validate all input variants and edge cases. All 22
multiaddr tests pass.
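
A hedged sketch of the described helper (the alias definitions and
the `wrap_address` import location are inferred from this summary,
not copied from the tree):

```python
# sketch only — the real impl lives in tractor/discovery/_multiaddr.py
from tractor.discovery._multiaddr import wrap_address  # assumed location

EndpointsTable = dict[str, list]   # actor-name -> maddr strs / tuples
ParsedEndpoints = dict[str, list]  # actor-name -> wrapped Address objs

def parse_endpoints(table: EndpointsTable) -> ParsedEndpoints:
    # delegate every entry to wrap_address(), which already
    # normalizes maddr strings, raw tuples and Address objects
    return {
        actor_name: [wrap_address(entry) for entry in entries]
        for actor_name, entries in table.items()
    }
```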

## Files changed

- `tractor/discovery/_multiaddr.py` — add the
  `EndpointsTable`, `ParsedEndpoints` type aliases
  and the `parse_endpoints()` function after line 133
- `tests/discovery/test_multiaddr.py` — add the
  `parse_endpoints` import and 7 new test functions

## Human edits

None — committed as generated.

@ -0,0 +1,32 @@

---
model: claude-opus-4-6
service: claude
timestamp: 2026-04-13T20:50:48Z
git_ref: 269d939c
diff_cmd: git diff 269d939c~1..269d939c
---

## Implementation output

### `tractor/discovery/_multiaddr.py`

> `git diff 269d939c~1..269d939c -- tractor/discovery/_multiaddr.py`

Type aliases `EndpointsTable` and `ParsedEndpoints`
plus a `parse_endpoints()` helper that wraps each
entry via `wrap_address()`.

### `tests/discovery/test_multiaddr.py`

> `git diff 269d939c~1..269d939c -- tests/discovery/test_multiaddr.py`

Adds an import of `parse_endpoints` to the existing import
block, then 7 new test functions.

### Test run output

```
22 passed, 1 warning in 0.05s
```

All 22 tests pass (15 existing + 7 new).

@ -0,0 +1,38 @@

---
model: claude-opus-4-6
service: claude
session: multiaddr-support-rename-prefer
timestamp: 2026-04-14T16:33:00Z
git_ref: befedc49
scope: code
substantive: true
raw_file: 20260414T163300Z_befedc49_prompt_io.raw.md
---

## Prompt

Create a helper function that determines the best transport given
actor locality (distributed vs same host). Use PID/hostname
comparison for locality detection; apply it at registry-addr
selection only (not at spawn-time).

## Response summary

New `prefer_addr()` + `_is_local_addr()` helpers
in `_api.py` using `socket.getaddrinfo()` and
`ipaddress` for PID/hostname locality detection.
Preference: UDS > local TCP > remote TCP.
Integrated into `query_actor()` and
`wait_for_actor()`. Also changed
`Registrar.find_actor()` to return the full addr list
so callers can apply the preference.

## Files changed

- `tractor/discovery/_discovery.py` → `_api.py`
  — renamed + added `prefer_addr()`,
  `_is_local_addr()`; updated `query_actor()` and
  `wait_for_actor()` call sites
- `tractor/discovery/_registry.py`
  — `Registrar.find_actor()` returns
  `list[UnwrappedAddress]|None`

@ -0,0 +1,62 @@

---
model: claude-opus-4-6
service: claude
timestamp: 2026-04-14T16:33:00Z
git_ref: befedc49
diff_cmd: git diff befedc49~1..befedc49
---

### `tractor/discovery/_api.py`

> `git diff befedc49~1..befedc49 -- tractor/discovery/_api.py`

Add `_is_local_addr()` and `prefer_addr()` transport
preference helpers.

#### `_is_local_addr(addr: Address) -> bool`

Determines whether an `Address` is reachable on the
local host:

- `UDSAddress`: always returns `True`
  (filesystem-bound, inherently local)
- `TCPAddress`: checks if `._host` is a loopback IP
  via `ipaddress.ip_address().is_loopback`, then
  falls back to comparing against the machine's own
  interface IPs via
  `socket.getaddrinfo(socket.gethostname(), None)`

#### `prefer_addr(addrs: list[UnwrappedAddress]) -> UnwrappedAddress`

Selects the "best" transport address from a
multihomed actor's address list. Wraps each
candidate via `wrap_address()` to get typed
`Address` objects, then classifies them into three tiers:

1. **UDS** (same-host guaranteed, lowest overhead)
2. **TCP loopback / same-host IP** (local network)
3. **TCP remote** (the only option for distributed)

Within each tier, the last-registered (latest) entry
is preferred. Falls back to `addrs[-1]` if no
heuristic matches.
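
A reconstruction of that tier logic (a hedged sketch — the class
names and `_host` attribute come from this doc; the import path and
body are assumptions, not the committed `_api.py`):

```python
import ipaddress
import socket

from tractor._addr import (  # assumed import path
    TCPAddress,
    UDSAddress,
    wrap_address,
)

def _is_loopback(host: str) -> bool:
    try:
        return ipaddress.ip_address(host).is_loopback
    except ValueError:  # not a literal IP (e.g. a hostname)
        return False

def _host_ips() -> set[str]:
    # every IP bound to one of this machine's interfaces
    return {
        info[4][0]
        for info in socket.getaddrinfo(socket.gethostname(), None)
    }

def prefer_addr(addrs: list):
    pairs = [(uw, wrap_address(uw)) for uw in addrs]
    uds = [uw for uw, a in pairs if isinstance(a, UDSAddress)]
    if uds:
        return uds[-1]      # tier 1: same-host guaranteed
    local = [
        uw for uw, a in pairs
        if isinstance(a, TCPAddress)
        and (_is_loopback(a._host) or a._host in _host_ips())
    ]
    if local:
        return local[-1]    # tier 2: loopback / same-host IP
    return addrs[-1]        # tier 3: latest remote as fallback
```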

### `tractor/discovery/_registry.py`

> `git diff befedc49~1..befedc49 -- tractor/discovery/_registry.py`

`Registrar.find_actor()`'s return type is broadened from
a single addr to `list[UnwrappedAddress]|None` — the full
addr list lets callers apply the transport preference.

#### Integration

`query_actor()` and `wait_for_actor()` now call
`prefer_addr(addrs)` instead of `addrs[-1]`.

### Verification

All discovery tests pass (13/13 non-daemon).
`test_local.py` and `test_multi_program.py` also
pass (the daemon-fixture teardown failures are
pre-existing and unrelated).

@ -0,0 +1,101 @@

---
model: claude-opus-4-7[1m]
service: claude
session: subints-spawner-design-kickoff
timestamp: 2026-04-17T03:49:18Z
git_ref: 9703210
scope: docs
substantive: true
raw_file: 20260417T034918Z_9703210_prompt_io.raw.md
---

## Prompt

Drive the "first big boi, from GH issue" task seeded by
`ai/prompt-io/prompts/subints_spawner.md`: design, plan
and implement sub-interpreter (subint) spawn-backend
support per issue #379, including (1) modularizing
`tractor.spawn._spawn` into per-backend submods, (2) a new
`._subint` backend, and (3) harness parametrization via the
existing `--spawn-backend` / `start_method` pytest fixture
in `tractor._testing.pytest`.

Follow-up clarifications from the user (this turn):

1. Pin `<3.15` on this dev branch and feature-gate subint
   tests — chose option (a).
2. Split Phase A (modularization) into its own PR first.
3. Defer the `fork()`-via-subint hack to a follow-up.
4. The harness flag is the `pytest --spawn-backend <key>` CLI →
   `start_method` fixture (the prompt file was updated to match).

## Response summary

Produced a three-phase plan and a concrete Phase A (pure
modularization) file-split plan for user review; no code
was written yet — the green light to start Phase A was given
in this same turn, conditional on logging this prompt-io
entry first.

Phases:

- **A — modularize** `tractor/spawn/_spawn.py` (847 LOC):
  keep generic machinery in `_spawn.py`, extract
  `trio_proc` → `spawn/_trio.py`, `mp_proc` →
  `spawn/_mp.py`. No pin bump.
- **B — `_subint` backend**: bump the `pyproject.toml`
  `requires-python` upper bound to `<3.15`; add `'subint'` to
  `SpawnMethodKey`; reuse the existing UDS transport; the shm
  escape-hatch is deferred.
- **C — harness**: drive the valid-backend tuple in
  `tractor/_testing/pytest.py:345-349` from
  `typing.get_args(SpawnMethodKey)`; skip subint tests on
  Python < 3.14.

Key findings surfaced to the user:

- `pyproject.toml:12` currently pins `<3.14`; PEP 734
  `concurrent.interpreters` only ships in 3.14 — the
  load-bearing constraint.
- `_testing/pytest.py:345-349` hardcodes the valid backends
  as a string tuple (`'mp_spawn'`, `'mp_forkserver'`,
  `'trio'`) — it should be `get_args(SpawnMethodKey)`.
- `_testing/pytest.py:228` already imports
  `try_set_start_method` from `tractor.spawn._spawn` —
  keeping the `_spawn.py` path as the "core" module
  avoids breaking external refs during Phase A.

Phase A commit breakdown (each green via `/run-tests`):

1. Drive the `_testing/pytest.py` backend tuple from
   `get_args(SpawnMethodKey)` — prep.
2. Extract `trio_proc` → `spawn/_trio.py`; the `_methods`
   registry imports it.
3. Extract `mp_proc` → `spawn/_mp.py`; `_methods` imports
   it.
4. Tidy the `spawn/__init__.py` docstring for the new layout.

Workflow: Phase A runs in a new worktree via `/open-wkt
spawn_modularize` (branch `wkt/spawn_modularize`) and
lands as its own PR before Phase B/C returns to the
current `claude_bun_opus4.7_subint_spawner_backend`
branch.

## Files changed

None this turn — design/planning only. The user updated
`ai/prompt-io/prompts/subints_spawner.md` themselves to
complete the truncated sentence on line 32 (the harness-flag
reference).

## Human edits

None — committed as generated. The user completed the
truncated sentence on line 32 of
`ai/prompt-io/prompts/subints_spawner.md` themselves
*before* this prompt-io log was written; that edit was
the user's authoring, not an edit to AI output.

@ -0,0 +1,117 @@

---
model: claude-opus-4-7[1m]
service: claude
timestamp: 2026-04-17T03:49:18Z
git_ref: 9703210
diff_cmd: git diff HEAD~1..HEAD
---

No code was produced in this turn — this is a design/planning
exchange for subint spawner support per issue #379. Captured
verbatim below are the two substantive AI responses, which
together fix the phased plan (A: modularize, B: subint, C:
harness) and the concrete Phase A file split before any
implementation begins.

## Turn 1 — initial triage + gating questions

> Read: issue #379, the current `spawn/_spawn.py` (847 lines),
> and the pyproject pins.
>
> ## Task summary
> Design/plan/implement the subint backend per #379, plus
> modularize `spawn/_spawn.py` into per-backend submods, plus
> a harness flag for test parametrization.
>
> ## Key finding
> `pyproject.toml:12` pins `requires-python = ">=3.12, <3.14"`
> — but `concurrent.interpreters` / PEP 734 **only lands in
> 3.14**. This is the load-bearing design question before
> coding.
>
> ## Proposed phasing (incremental commits)
> - **Phase A — modularize** `_spawn.py`: extract `trio_proc`
>   → `spawn/_trio.py`, `mp_proc` → `spawn/_mp.py`, keep
>   generic helpers (`hard_kill`/`soft_kill`/`new_proc`
>   dispatcher/`SpawnMethodKey`) in a slim core. Tests green
>   at each step.
> - **Phase B — `_subint` backend**: extend `SpawnMethodKey`
>   with `'subint'`; reuse UDS transport by default; shm is
>   an escape-hatch follow-up.
> - **Phase C — harness**: add a backend-parametrize flag, skip
>   subint when Python < 3.14.
>
> ## Gating questions before I start
> 1. **Python pin**: bump `<3.15` and feature-gate subint
>    tests (a), require 3.14 outright (b), or backport shim
>    (c)? I'd recommend (a).
> 2. **PR strategy**: one branch, commits per phase (current
>    `claude_bun_opus4.7_subint_spawner_backend`)? Or split
>    Phase A as its own PR first?
> 3. **`fork()`-via-subint hack** from the issue's "Our own
>    thoughts" section — v1 or follow-up? I'd defer to a
>    follow-up.
> 4. **Prompt file line 32 is truncated** — what's the
>    existing harness flag name you had in mind ("using the
>    existing ___")?
>
> Which of these are already decided vs. open?

## User answers (for the raw record)

1. bump `<3.15`, feature-gate — option (a).
2. split Phase A as its own PR first.
3. defer the `fork()`-via-subint hack.
4. the harness flag is `--spawn-backend` CLI → `start_method`
   fixture, per `tractor._testing._pytest`. Prompt file
   updated accordingly.

## Turn 2 — concrete Phase A plan

> Good — the prompt file now ends cleanly; the harness flag is
> confirmed as `--spawn-backend` CLI → `start_method`
> fixture (via `_testing/pytest.py:193,355`). Also noted
> `pytest.py:345-349` hardcodes the valid backends as a string
> tuple — `typing.get_args(SpawnMethodKey)` should drive
> that list.
>
> ## Phase A — concrete plan (separate PR)
>
> **File split** in `tractor/spawn/`:
> - `_spawn.py` stays as "core": keeps `SpawnMethodKey`,
>   `_spawn_method`, `proc_waiter`, `try_set_start_method`,
>   `exhaust_portal`, `cancel_on_completion`, `hard_kill`,
>   `soft_kill`, the `new_proc` dispatcher, and the `_methods`
>   registry. Kept at this path so `_testing/pytest.py:228`
>   and other external imports don't break.
> - `_trio.py` ← `trio_proc()` (lines 448–671, ~224 lines)
> - `_mp.py` ← `mp_proc()` (lines 672–842, ~171 lines)
> - the `_methods` dict imports the two backend fns from the
>   new modules.
>
> **Commits** (each green via `/run-tests`):
> 1. Drive the `_testing/pytest.py` backend tuple from
>    `get_args(SpawnMethodKey)` — tiny prep.
> 2. Extract `trio_proc` → `spawn/_trio.py`; `_methods`
>    imports it.
> 3. Extract `mp_proc` → `spawn/_mp.py`; `_methods`
>    imports it.
> 4. Tidy the `spawn/__init__.py` docstring to describe the
>    new layout.
>
> No pyproject pin bump in Phase A (that's Phase B).
>
> **Workflow**: I'd run this in a new worktree via
> `/open-wkt spawn_modularize` (branch
> `wkt/spawn_modularize`), then open a PR before returning
> to the current `claude_bun_opus4.7_subint_spawner_backend`
> branch for Phase B/C.
>
> Green-light to `/open-wkt` and start Phase A?

## User final response

> all this first then yes, green light.

— log this `/prompt-io` entry first, then proceed with
Phase A in a worktree.

@ -0,0 +1,86 @@

---
model: claude-opus-4-7[1m]
service: claude
session: subints-spawner-phase-a-impl
timestamp: 2026-04-17T03:58:05Z
git_ref: 61a73ba
scope: code
substantive: true
raw_file: 20260417T035800Z_61a73ba_prompt_io.raw.md
---

## Prompt

Follow-up from the design session logged in
`20260417T034918Z_9703210_prompt_io.md`: the user
greenlit Phase A after `/prompt-io` tracking, so
execute the Phase A "modularize `spawn._spawn`" plan
end-to-end in a new worktree — four logical steps
(pytest harness prep, `trio_proc` extraction,
`mp_proc` extraction, docstring refresh), each green
via `/run-tests`.

User directives during execution:

- **Option 3** for the test cadence ("continue A.2–A.4
  first, then run the full suite once at the end of
  Phase A").
- **One commit** for the whole phase ("can't we just
  commit the whole patch in one commit?") instead of
  the 3/4-commit split I initially proposed.
- **Don't pre-draft** commit messages — wait for the
  user to invoke `/commit-msg` (captured as feedback
  memory `feedback_no_auto_draft_commit_msgs.md`).

## Response summary

Produced the cohesive Phase A modularization patch,
landed as commit `61a73bae` (subject: "Mv
`trio_proc`/`mp_proc` to per-backend submods"). Five
files changed, +565 / -418 lines.

Key pieces of the patch (generated by claude,
reviewed by the human before commit):

- `tractor/spawn/_trio.py` — **new**; receives
  `trio_proc()` verbatim from `_spawn.py`; imports
  cross-backend helpers back from `._spawn`.
- `tractor/spawn/_mp.py` — **new**; receives
  `mp_proc()` verbatim; uses `from . import _spawn`
  for late-binding access to the mutable `_ctx` /
  `_spawn_method` globals (a design decision made
  during impl, not in the original plan).
- `tractor/spawn/_spawn.py` — shrunk 847 → 448 LOC;
  import pruning; bottom-of-module late imports for
  `trio_proc` / `mp_proc` with a one-line comment
  explaining the circular-dep reason.
- `tractor/spawn/__init__.py` — docstring refresh
  describing the new layout.
- `tractor/_testing/pytest.py` — the valid-backend
  set now comes from `typing.get_args(SpawnMethodKey)`
  so future additions (`'subint'`) don't need harness
  edits.

## Files changed

See `git diff 61a73ba~1..61a73ba --stat`:

```
tractor/_testing/pytest.py |  12 +-
tractor/spawn/__init__.py  |  31 +++-
tractor/spawn/_mp.py       | 235 ++++++++++++++++++++++++
tractor/spawn/_spawn.py    | 413 +-------------------------------
tractor/spawn/_trio.py     | 292 ++++++++++++++++++++++++++++
5 files changed, 565 insertions(+), 418 deletions(-)
```

Validation:

- import probe + `_methods` wiring check — OK
- spawn-relevant test subset — 37 passed, 1 skipped
- full suite — 350 passed, 14 skipped, 7 xfailed, 1
  xpassed

## Human edits

None — committed as generated by claude (no diff
between `.claude/git_commit_msg_LATEST.md` and the
committed body, as far as the assistant could
observe).

@ -0,0 +1,138 @@

---
model: claude-opus-4-7[1m]
service: claude
timestamp: 2026-04-17T03:58:05Z
git_ref: 61a73ba
diff_cmd: git diff 61a73ba~1..61a73ba
---

Code generated in this turn was committed verbatim as
`61a73bae` ("Mv `trio_proc`/`mp_proc` to per-backend
submods"). Per diff-ref mode, per-file code is captured
via the pointers below, each followed by a prose
summary of what the AI generated. Non-code output
(sanity-check results, design rationale) is included
verbatim.

## Per-file generated content

### `tractor/spawn/_trio.py` (new, 292 lines)

> `git diff 61a73ba~1..61a73ba -- tractor/spawn/_trio.py`

A pure lift-and-shift of `trio_proc()` out of
`tractor/spawn/_spawn.py` (previously lines 448–670).
Added the AGPL header + a module docstring describing the
backend; imports include a local `from ._spawn import
cancel_on_completion, hard_kill, soft_kill` which
creates the bottom-of-module late-import pattern in
the core file to avoid a cycle. All call sites,
log-format strings, and body logic are byte-identical
to the originals — no semantic change.

### `tractor/spawn/_mp.py` (new, 235 lines)

> `git diff 61a73ba~1..61a73ba -- tractor/spawn/_mp.py`

A pure lift-and-shift of `mp_proc()` out of
`tractor/spawn/_spawn.py` (previously lines 672–842).
Same AGPL header convention. The key difference from
`_trio.py`: it uses `from . import _spawn` (a module
import, not a from-import) for the `_ctx` and
`_spawn_method` references — these are mutated at
runtime by `try_set_start_method()`, so late binding
via `_spawn._ctx` / `_spawn._spawn_method` is required
for correctness. Also imports `cancel_on_completion`,
`soft_kill`, `proc_waiter` from `._spawn`.
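
The late-binding distinction is easy to see in isolation (a toy
sketch, not tree code):

```python
# `from mod import name` snapshots the binding at import time;
# attribute access through the module object sees later rebinds —
# which is what `try_set_start_method()` does to `_ctx` et al.
import types

_spawn = types.ModuleType('_spawn')
_spawn._ctx = None

snapshot = _spawn._ctx            # from-import style: frozen at None
_spawn._ctx = 'forkserver-ctx'    # runtime mutation

assert snapshot is None                 # stale
assert _spawn._ctx == 'forkserver-ctx'  # module-attr access: current
```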
|
||||
|
||||
### `tractor/spawn/_spawn.py` (modified, 847 → 448 LOC)

> `git diff 61a73ba~1..61a73ba -- tractor/spawn/_spawn.py`

- removed `trio_proc()` body (moved to `_trio.py`)
- removed `mp_proc()` body (moved to `_mp.py`)
- pruned imports now unused in core: `sys`,
  `is_root_process`, `current_actor`,
  `is_main_process`, `_mp_main`, `ActorFailure`,
  `pretty_struct`, `_pformat`
- added bottom-of-file late imports
  `from ._trio import trio_proc` and
  `from ._mp import mp_proc` with a one-line
  comment explaining why (circular dep) — the
  resulting layout is sketched below
- `_methods` dict unchanged structurally; still binds
  `'trio' → trio_proc`, `'mp_spawn' → mp_proc`,
  `'mp_forkserver' → mp_proc`

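The layout that results, in outline (a structural sketch keyed
to the bullets above, not the verbatim module — it only runs
inside the actual package):

```python
# tractor/spawn/_spawn.py — core helpers defined first ...
async def cancel_on_completion(*args, **kwargs): ...
async def soft_kill(*args, **kwargs): ...
async def hard_kill(*args, **kwargs): ...

# ... then late imports at the very bottom: `_trio`/`_mp` each
# do `from ._spawn import ...` of the helpers above, so importing
# them any earlier would close a circular-import loop before the
# helpers exist.
from ._trio import trio_proc  # noqa: E402
from ._mp import mp_proc  # noqa: E402

_methods = {
    'trio': trio_proc,
    'mp_spawn': mp_proc,
    'mp_forkserver': mp_proc,
}
```
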
### `tractor/spawn/__init__.py` (modified)

> `git diff 61a73ba~1..61a73ba -- tractor/spawn/__init__.py`

Docstring rewrite only — no code. The new text describes
the per-backend `Layout` with `- ._spawn` (core),
`- ._trio` (default), `- ._mp` (two mp variants), and
reiterates the "no eager import" NOTE.

### `tractor/_testing/pytest.py` (modified)

> `git diff 61a73ba~1..61a73ba -- tractor/_testing/pytest.py`

Two-line logical change (plus a comment):
- `from typing import` grows `get_args`
- `pytest_generate_tests()` replaces the hardcoded
  `('mp_spawn','mp_forkserver','trio')` tuple with
  `get_args(SpawnMethodKey)` via the lazy import
  `from tractor.spawn._spawn import SpawnMethodKey`
  inside the function body (matching the project's
  existing lazy-import convention for the spawn
  module) — see the sketch below

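In sketch form — the `start_method` fixture name is a stand-in
for whatever the harness actually parametrizes; only the
`get_args` usage and the lazy `SpawnMethodKey` import are
attested above:

```python
from typing import get_args

def pytest_generate_tests(metafunc):
    if 'start_method' in metafunc.fixturenames:  # hypothetical fixture name
        # lazy import, per the project's convention for touching
        # the spawn module from the test harness
        from tractor.spawn._spawn import SpawnMethodKey
        metafunc.parametrize(
            'start_method',
            get_args(SpawnMethodKey),  # e.g. ('mp_spawn', 'mp_forkserver', 'trio')
        )
```
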
## Non-code output (verbatim)

### Design rationale for the file split

From my Phase A plan turn (already captured in
`20260417T034918Z_9703210_prompt_io.md`):
- `_spawn.py` stays as the "core" rather than renamed
  to `_core.py` so external imports
  (`_testing/pytest.py:228` imports
  `try_set_start_method` from `tractor.spawn._spawn`)
  keep working without churn.
- Per-backend extraction chosen over alternatives
  (e.g. splitting generic helpers further) because
  the immediate motivation is hosting a 3rd
  `_subint.py` sibling cleanly in Phase B.

### Sanity-check output (verbatim terminal excerpts)

Post-extraction import probe:
```
extraction OK
_methods: {'trio': 'tractor.spawn._trio.trio_proc',
'mp_spawn': 'tractor.spawn._mp.mp_proc',
'mp_forkserver': 'tractor.spawn._mp.mp_proc'}
```

Spawn-relevant test subset (`tests/test_local.py
test_rpc.py test_spawning.py test_multi_program.py
test_discovery.py`):
```
37 passed, 1 skipped, 14 warnings in 55.37s
```

Full suite:
```
350 passed, 14 skipped, 7 xfailed, 1 xpassed,
151 warnings in 437.73s (0:07:17)
```

No regressions vs. `main`. One transient `-x`
early-stop `ERROR` on
`test_close_channel_explicit_remote_registrar[trio-True]`
was flaky (passed solo, passed without `-x`), not
caused by this refactor.

### Commit message

Also AI-drafted (via `/commit-msg`) — the 40-line
message on commit `61a73bae` itself. Not reproduced
here; see `git log -1 61a73bae`.

@@ -0,0 +1,97 @@
---
model: claude-opus-4-7
service: claude
session: subints-spawner-phase-b1-impl
timestamp: 2026-04-17T12:44:37Z
git_ref: 5cd6df5
scope: code
substantive: true
raw_file: 20260417T124437Z_5cd6df5_prompt_io.raw.md
---

## Prompt

Continuing the Phase B work from the design sessions
logged in `20260417T034918Z_9703210_prompt_io.md` and
the Phase A impl in `20260417T035800Z_61a73ba_...`.

User direction at this step:
- "ok now let's continue with the subints part of the
  plan ya?" → kick off Phase B.
- After I proposed a 4-sub-phase breakdown (B.1
  scaffolding, B.2 min-viable spawn, B.3 lifecycle,
  B.4 test enablement), user: "ok can we continue
  with B" → approving the phasing and design
  defaults (thread-per-subint, UDS default,
  code-string bootstrap via `interp.exec()`).
- User switched the worktree to
  `subint_spawner_backend` (instead of opening a
  fresh worktree as I'd proposed): "i already
  switched branches in the wkt, you should work off
  `subint_spawner_backend` now".

Scope of this turn: **B.1 scaffolding only** —
wire the `'subint'` name through the spawn-method
registry + harness, add a feature-detected stub
module, bump pyproject. No actual spawn flow.

## Response summary

Produced the Phase B.1 scaffolding patch, landed as
commit `5cd6df58` (subject: "Add `'subint'` spawn
backend scaffold (#379)"). Four files changed,
+124 / -2 lines.

Key pieces (all generated by claude, reviewed by
human before commit):
- `tractor/spawn/_subint.py` — **new**; feature-
  detects `concurrent.interpreters`; `subint_proc()`
  stub raises `RuntimeError` on py<3.14 or
  `NotImplementedError` with issue-#379 URL on
  py≥3.14. Signature mirrors `trio_proc`/`mp_proc`
  so B.2 can drop the impl in without touching
  `_methods`.
- `tractor/spawn/_spawn.py` — adds `'subint'` to
  `SpawnMethodKey`, grows a `case 'subint'` arm in
  `try_set_start_method()` with feature-gate, re-
  imports `sys` for the gate-error msg, adds a late
  `from ._subint import subint_proc` import and a
  `_methods` entry.
- `tractor/_testing/pytest.py` — converts the
  gate-error into `pytest.UsageError` via a
  `try/except` around `try_set_start_method()` so
  `--spawn-backend=subint` on py<3.14 prints a
  clean banner instead of a traceback.
- `pyproject.toml` — pin `requires-python` `<3.14`
  → `<3.15`, add `3.14` trove classifier.

## Files changed

See `git diff 5cd6df5~1..5cd6df5 --stat`:

```
pyproject.toml             |   3 +-
tractor/_testing/pytest.py |   8 +++-
tractor/spawn/_spawn.py    |  15 ++++++
tractor/spawn/_subint.py   | 100 +++++++++++++++++++++++++
4 files changed, 124 insertions(+), 2 deletions(-)
```

Validation on py3.13:
- registry/gate probe — OK; `_has_subints = False`,
  `subint_proc()` raises `RuntimeError` as expected.
- `try_set_start_method('subint')` — raises cleanly.
- `pytest --spawn-backend=subint` — exits with the
  `pytest.UsageError` banner.
- Spawn-relevant test subset — 69 passed, 1 skipped.
- Full suite on py3.13 deferred (the scaffolding is a
  no-op on <3.14).
- Full suite on py3.14 pending user setting up the
  venv.

## Human edits

None — committed as generated by claude. `uv.lock`
was intentionally left unstaged by the user (the pin-
bump regenerated cp314 wheel entries but they were
deferred to a follow-up).

@@ -0,0 +1,168 @@
---
model: claude-opus-4-7
service: claude
timestamp: 2026-04-17T12:44:37Z
git_ref: 5cd6df5
diff_cmd: git diff 5cd6df5~1..5cd6df5
---

Code generated in this turn was committed verbatim as
`5cd6df58` ("Add `'subint'` spawn backend scaffold
(#379)"). Per diff-ref mode, per-file code is captured
via the pointers below, each followed by a prose
summary. Non-code output (sanity-check results,
design rationale) is included verbatim.

## Per-file generated content

### `tractor/spawn/_subint.py` (new, 100 lines)

> `git diff 5cd6df5~1..5cd6df5 -- tractor/spawn/_subint.py`

New scaffolding module for the PEP 734 subinterpreter
backend. Contents:
- AGPL header + module docstring (describes backend
  intent, the 3.14+ availability gate, and explicit
  "SCAFFOLDING STUB" status pointing at issue #379).
- Top-level `try/except ImportError` wrapping
  `from concurrent import interpreters as
  _interpreters` → sets a module-global
  `_has_subints: bool` (gate shape sketched below).
  This lets the registry stay introspectable on
  py<3.14 while spawn-time still fails cleanly.
- `subint_proc()` coroutine with a signature matching
  `trio_proc`/`mp_proc` exactly (same param names,
  defaults, and `TaskStatus[Portal]` typing) —
  intentional so Phase B.2 can drop the impl in
  without touching `_methods` or changing call-site
  binding.
- Body raises `RuntimeError` on py<3.14 (with
  `sys.version` printed) or `NotImplementedError`
  with the issue-#379 URL on py≥3.14.

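The gate's plausible shape (a sketch of the described
structure, not the verbatim module):

```python
# feature-detect the public py3.14+ API at import time; the module
# stays importable (and the registry introspectable) on older
# Pythons — only spawn-time use fails.
try:
    from concurrent import interpreters as _interpreters  # py3.14+
    _has_subints: bool = True
except ImportError:
    _interpreters = None
    _has_subints: bool = False
```
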
### `tractor/spawn/_spawn.py` (modified, +15 LOC)

> `git diff 5cd6df5~1..5cd6df5 -- tractor/spawn/_spawn.py`

- `import sys` re-added (pruned during Phase A, now
  needed again for the py-version string in the
  `'subint'` gate-error).
- `SpawnMethodKey = Literal[...]` grows `'subint'` as
  the 4th member, with the inline comment "# py3.14+
  via `concurrent.interpreters` (PEP 734)".
- `try_set_start_method()`'s match-block grows a new
  `case 'subint':` arm that imports
  `from ._subint import _has_subints` lazily and
  raises `RuntimeError` with a multi-line gate msg
  if unavailable (arm sketched below).
- The bottom-of-module late-import section grows
  `from ._subint import subint_proc` alongside the
  existing `_trio` / `_mp` imports.
- `_methods` dict grows `'subint': subint_proc`.

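A sketch of that arm — the error text is taken from the gate
output quoted later in this log, and the lazy import is
replaced by a version check so the snippet runs standalone:

```python
import sys

def try_set_start_method(name: str) -> None:
    match name:
        # ...existing 'trio' / 'mp_*' arms elided...
        case 'subint':
            # stands in for the lazy `from ._subint import _has_subints`
            _has_subints: bool = sys.version_info >= (3, 14)
            if not _has_subints:
                raise RuntimeError(
                    "Spawn method 'subint' requires Python 3.14+ "
                    "(stdlib `concurrent.interpreters`, PEP 734).\n"
                    f"Current runtime: {sys.version}"
                )
```
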
### `tractor/_testing/pytest.py` (modified, +8 LOC)

> `git diff 5cd6df5~1..5cd6df5 -- tractor/_testing/pytest.py`

`pytest_configure()` wraps the
`try_set_start_method(backend)` call in a
`try/except RuntimeError` that re-raises as
`pytest.UsageError(str(err))`. Rationale: the gate
error on py<3.14 is legitimately a configuration
problem, not a test failure, so pytest's UsageError
path (exit code 4) gives a clean single-line banner
instead of a traceback.

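In sketch form (the option-attribute spelling is assumed; the
`try/except RuntimeError` → `pytest.UsageError` re-raise is
what the diff adds):

```python
import pytest

def pytest_configure(config):
    from tractor.spawn._spawn import try_set_start_method  # lazy, per convention
    backend = config.option.spawn_backend  # assumed option name
    try:
        try_set_start_method(backend)
    except RuntimeError as err:
        # a config problem, not a test failure: UsageError exits
        # with code 4 and a single-line banner, no traceback
        raise pytest.UsageError(str(err)) from err
```
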
### `pyproject.toml` (modified, +2 / -1)

> `git diff 5cd6df5~1..5cd6df5 -- pyproject.toml`

- `requires-python` pin relaxed `>=3.12, <3.14` →
  `>=3.12, <3.15` to admit 3.14 as a valid target.
- Added `"Programming Language :: Python :: 3.14"`
  to the trove classifiers.

## Non-code output (verbatim)

### Design choices captured in the prior turn

(Previously confirmed by the user on April 17 in this
session — relevant excerpts captured here for
provenance since they drove the shape of the impl):

1. **Python pin**: `<3.15` + feature-gate the subint
   backend at spawn time — user answered option (a)
   from the design triage.
2. **Deferred `fork()`-via-subint hack** from issue
   #379's "Our own thoughts" section.
3. **Phase B phasing**: user approved the B.1 / B.2 /
   B.3 / B.4 breakdown — this commit is strictly B.1
   (scaffolding only, no spawn-flow impl).
4. **Option (B) worktree strategy**: new worktree
   branched from `wkt/spawn_modularize`. *(Amended by
   user at runtime: user switched the existing
   `spawn_modularize` worktree to the
   `subint_spawner_backend` branch instead.)*

### Sanity-check output (verbatim terminal excerpts)

Registry / feature-gate verification on py3.13:
```
SpawnMethodKey values: ('trio', 'mp_spawn',
'mp_forkserver', 'subint')
_methods keys: ['trio', 'mp_spawn',
'mp_forkserver', 'subint']
_has_subints: False (py version: (3, 13) )
[expected] RuntimeError: The 'subint' spawn backend
requires Python 3.14+ (stdlib
`concurrent.interpreters`, PEP 734).
```

`try_set_start_method('subint')` gate on py3.13:
```
[expected] RuntimeError: Spawn method 'subint'
requires Python 3.14+ (stdlib
`concurrent.interpreters`, PEP 734).
```

Pytest `--spawn-backend=subint` on py3.13 (the new
UsageError wrapper kicking in):
```
ERROR: Spawn method 'subint' requires Python 3.14+
(stdlib `concurrent.interpreters`, PEP 734).
Current runtime: 3.13.11 (main, Dec 5 2025,
16:06:33) [GCC 15.2.0]
```

Collection probe: `404 tests collected in 0.18s`
(no import errors from the new module).

Spawn-relevant test subset (`tests/test_local.py
test_rpc.py test_spawning.py test_multi_program.py
tests/discovery/`):
```
69 passed, 1 skipped, 10 warnings in 61.38s
```

Full suite was **not** run on py3.13 for this commit
— the scaffolding is a no-op on <3.14 and full-suite
validation under py3.14 is pending that venv being
set up by the user.

### Commit message

Also AI-drafted (via `/commit-msg`, with the prose
rewrapped through `/home/goodboy/.claude/skills/pr-msg/
scripts/rewrap.py --width 67`) — the 33-line message
on commit `5cd6df58` itself. Not reproduced here; see
`git log -1 5cd6df58`.

### Known follow-ups flagged to user

- **`uv.lock` deferred**: the pin-bump regenerated cp314
  wheel entries in `uv.lock`, but the user chose to
  not stage `uv.lock` for this commit. Warned
  explicitly.
- **Phase B.2 needs a py3.14 venv** — running the
  actual subint impl requires it; user said they'd
  set it up separately.

@@ -0,0 +1,117 @@
---
model: claude-opus-4-7
service: claude
session: subints-phase-b2-destroy-race-fix
timestamp: 2026-04-18T04:25:26Z
git_ref: 26fb820
scope: code
substantive: true
raw_file: 20260418T042526Z_26fb820_prompt_io.raw.md
---

## Prompt

Follow-up to Phase B.2 (`5cd6df58`) after the user
observed intermittent mid-suite hangs when running
the tractor test suite under `--spawn-backend=subint`
on py3.14. The specific sequence of prompts over
several turns:

1. User pointed at the `test_context_stream_semantics.py`
   suite as the first thing to make run clean under
   `--spawn-backend=subint`.
2. After a series of `timeout`-terminated runs that
   gave no diagnostic info, user nudged me to stop
   relying on `timeout` and get actual runtime
   diagnostics ("the suite hangs indefinitely, so i
   don't think this `timeout 30` is helping you at
   all.."). Switched to
   `faulthandler.dump_traceback_later(...)` and a
   resource-tracker fixture to rule out leaks.
3. Captured a stack pinning the hang on
   `_interpreters.destroy(interp_id)` in the subint
   teardown finally block.
4. Proposed the dedicated-OS-thread fix. User greenlit.
5. Implemented + verified on-worktree; user needed
   to be pointed at the *worktree*'s `./py313` venv
   because bare `pytest` was picking up the main
   repo's venv (running un-patched `_subint.py`) and
   still hanging.

Running theme over the whole exchange: this patch
only closes the *destroy race*. The user and I also
traced through the deeper cancellation story — SIGINT
can't reach subints, legacy-mode shares the GIL,
portal-cancel dies when the IPC channel is already
broken — and agreed the next step is a bounded
hard-kill in `subint_proc`'s teardown plus a
dedicated cancellation test suite. Those land as
separate commits.

## Response summary

Produced the `tractor/spawn/_subint.py` patch landed
as commit `26fb8206` ("Fix subint destroy race via
dedicated OS thread"). One file, +110/-84 LOC.

Mechanism: swap `trio.to_thread.run_sync(_interpreters.exec, ...)`
for a plain `threading.Thread(target=..., daemon=False)`.
The trio thread cache recycles workers — so the OS
thread that ran `_interpreters.exec()` remained alive
in the cache holding a stale subint tstate, blocking
`_interpreters.destroy()` in the finally indefinitely.
A dedicated one-shot thread exits naturally after
the sync target returns, releasing the tstate and
unblocking destroy.

Coordination across the trio↔thread boundary:
- `trio.lowlevel.current_trio_token()` captured at
  `subint_proc` entry
- driver thread signals `subint_exited.set()` back
  to parent trio via `trio.from_thread.run_sync(...,
  trio_token=token)` (synchronous from the thread's
  POV; the call returns after trio has run `.set()`)
- `trio.RunFinishedError` swallowed in that path for
  the process-teardown case where parent trio already
  exited
- teardown `finally` off-loads the sync
  `driver_thread.join()` via `to_thread.run_sync` (a
  cache thread carries no subint tstate — safe)

## Files changed

See `git diff 26fb820~1..26fb820 --stat`:

```
tractor/spawn/_subint.py | 194 +++++++++++++++++++------------
1 file changed, 110 insertions(+), 84 deletions(-)
```

Validation:
- `test_parent_cancels[chk_ctx_result_before_exit=True-
  cancel_method=ctx-child_returns_early=False]`
  (the specific test that was hanging for the user)
  — passed in 1.06s.
- Full `tests/test_context_stream_semantics.py` under
  subint — 61 passed in 100.35s (clean-cache re-run:
  100.82s).
- Trio backend regression subset — 69 passed / 1
  skipped / 89.19s — no regressions from this change.

## Non-code output note

Beyond the `_subint.py` patch, the raw log also
records the cancellation-semantics research that
spanned this conversation but did not ship as code
in *this* commit. Preserving it inline under "Non-
code output" because it directly informs the
Phase B.3 hard-kill impl that will follow (and any
upstream CPython bug reports we end up filing).

## Human edits

None — committed as generated. The commit message
itself was also AI-drafted via `/commit-msg` and
rewrapped via the project's `rewrap.py --width 67`
tooling; user landed it without edits.

@@ -0,0 +1,220 @@
---
model: claude-opus-4-7
service: claude
timestamp: 2026-04-18T04:25:26Z
git_ref: 26fb820
diff_cmd: git diff 26fb820~1..26fb820
---

Code generated in this turn was committed as `26fb8206`
("Fix subint destroy race via dedicated OS thread").
Per diff-ref mode, per-file code is captured via the
pointer below; the rest is verbatim non-code output
— test-session diagnostics, CPython-internals
reasoning, and the design synthesis that's worth
keeping alongside the code since it informs the
still-pending Phase B.3 hard-kill work.

## Per-file generated content

### `tractor/spawn/_subint.py` (modified, +110/-84 LOC)

> `git diff 26fb820~1..26fb820 -- tractor/spawn/_subint.py`

Rewrites the subint driver-thread strategy:
- replaces `trio.to_thread.run_sync(_interpreters.exec, ...)`
  with a plain `threading.Thread(target=_subint_target,
  daemon=False)` so the OS thread truly exits after
  `_interpreters.exec()` returns
- captures a `trio.lowlevel.current_trio_token()` at
  `subint_proc` entry; the driver thread signals
  completion back via `trio.from_thread.run_sync(
  subint_exited.set, trio_token=...)`
- swallows `trio.RunFinishedError` in the signal path
  for the case where the parent trio loop has already
  exited (process teardown)
- in the teardown `finally`, off-loads the sync
  `driver_thread.join()` call to `trio.to_thread.run_sync`
  (a cache thread w/ no subint tstate — so no cache
  conflict) to wait for the driver thread to fully
  exit before calling `_interpreters.destroy()` —
  the full pattern is sketched below

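The whole coordination pattern in a runnable sketch
(`run_in_subint()` is a placeholder for the real
`_interpreters.exec(...)` call; everything else mirrors the
bullets above):

```python
import threading
import trio

async def subint_driver_sketch() -> None:
    subint_exited = trio.Event()
    token = trio.lowlevel.current_trio_token()  # captured at entry

    def run_in_subint() -> None:
        pass  # placeholder: drive the subinterpreter to completion

    def driver() -> None:
        run_in_subint()
        try:
            # sync from this thread's POV: returns once trio has
            # actually executed `.set()` on its own loop
            trio.from_thread.run_sync(subint_exited.set, trio_token=token)
        except trio.RunFinishedError:
            pass  # parent trio loop already gone (process teardown)

    # one-shot, non-daemon thread: it truly exits once `driver()`
    # returns, releasing the subint tstate that a cached trio
    # worker would otherwise hold across destroy()
    driver_thread = threading.Thread(target=driver, daemon=False)
    driver_thread.start()
    try:
        await subint_exited.wait()
    finally:
        # off-load the blocking join; a trio cache thread carries
        # no subint tstate so this cannot re-create the race
        await trio.to_thread.run_sync(driver_thread.join)

trio.run(subint_driver_sketch)
```
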
## Non-code output (verbatim) — the CPython-internals research

### What went wrong before this commit

Under `--spawn-backend=subint` on py3.14, most single
tests passed but longer runs hung intermittently. The
position of the hang moved between runs (test #22 on
one run, test #53 on another), suggesting a timing-
dependent race rather than a deterministic bug.

`faulthandler.dump_traceback_later()` eventually
caught a stack with the main thread blocked in
`_interpreters.destroy(interp_id)` at `_subint.py:293`.
Only 2 threads were alive:
- the main thread waiting in `_interpreters.destroy()`
- one idle trio thread-cache worker in
  `trio._core._thread_cache._work`

No subint was still running (`_interpreters.list_all()`
showed only the main interp). A resource-tracker
pytest fixture confirmed threads/subints did NOT
accumulate across tests — this was not a leak but a
specific "destroy blocks on cached thread w/ stale
tstate" race.

### Why the race exists

`trio.to_thread.run_sync` uses a thread *cache* to
avoid OS-thread creation overhead. When the sync
callable returns, the OS thread is NOT terminated —
it's parked in `_thread_cache._work` waiting for the
next job. CPython's subinterpreter implementation
attaches a **tstate** (thread-state object) to each
OS thread that ever entered a subint via
`_interpreters.exec()`. That tstate is released
lazily — either when the thread picks up a new job
(which re-attaches a new tstate, evicting the old
one) or when the thread truly exits.

`_interpreters.destroy(interp_id)` waits for *all*
tstates associated w/ that subint to be released
before it can proceed. If the cached worker is idle
holding the stale tstate, destroy blocks indefinitely.
Whether the race manifests depends on timing — if
the cached thread happens to pick up another job
quickly, destroy unblocks; if it sits idle, we hang.

### Why a dedicated `threading.Thread` fixes it

A plain `threading.Thread(target=_subint_target,
daemon=False)` runs its target once and exits. When
the target returns, OS-thread teardown (`_bootstrap_inner`
→ `_bootstrap`) fires and CPython releases the
tstate for that thread. `_interpreters.destroy()`
then has no blocker.

### Diagnostic tactics that actually helped

1. `faulthandler.dump_traceback_later(n, repeat=False,
   file=open(path, 'w'))` for captured stack dumps on
   hang (armed as sketched below). Critically, pipe to
   a `file=`, not stderr — pytest captures stderr
   weirdly and the dump is easy to miss.
2. A resource-tracker autouse fixture printing
   per-test `threading.active_count()` +
   `len(_interpreters.list_all())` deltas → ruled out
   leak-accumulation theories quickly.
3. Running the hanging test *solo* vs in-suite —
   when solo passes but in-suite hangs, you know
   it's a cross-test state-transfer bug rather than
   a test-internal bug.

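The arming pattern from tactic 1, for reference (path
illustrative):

```python
import faulthandler

# Arm a one-shot delayed stack dump. The `file=` target matters:
# pytest's capture machinery can swallow stderr, so dump to a real
# file you can inspect after killing the hung run.
dump_file = open('/tmp/pytest_hang_dump.txt', 'w')
faulthandler.dump_traceback_later(20, repeat=False, file=dump_file)

# ... run the suspect code; disarm if it finishes in time:
faulthandler.cancel_dump_traceback_later()
```
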
### Design synthesis — SIGINT + subints + SC

The user and I walked through the cancellation
semantics of PEP 684/734 subinterpreters in detail.
Key findings we want to preserve:

**Signal delivery in subints (stdlib limitation).**
CPython's signal machinery only delivers signals
(SIGINT included) to the *main thread of the main
interpreter*. Subints cannot install signal handlers
that will ever fire. This is an intentional design
choice in PEP 684 and not expected to change. For
tractor's subint actors, this means:

- Ctrl-C never reaches a subint directly.
- `trio.run()` running on a worker thread (as we do
  for subints) already skips SIGINT handler install
  because `signal.signal()` raises on non-main
  threads.
- The only cancellation surface into a subint is
  our IPC `Portal.cancel_actor()`.

**Legacy-mode subints share the main GIL** (which
our impl uses since `msgspec` lacks PEP 684 support
per `jcrist/msgspec#563`). This means a stuck subint
thread can starve the parent's trio loop during
cancellation — the parent can't even *start* its
teardown handling until the subint yields the GIL.

**Failure modes identified for Phase B.3 audit:**

1. Portal cancel lands cleanly → subint unwinds →
   thread exits → destroy succeeds. (Happy path.)
2. IPC channel is already broken when we try to
   send cancel (e.g., `test_ipc_channel_break_*`)
   → cancel raises `BrokenResourceError` → subint
   keeps running unaware → parent hangs waiting for
   `subint_exited`. This is what breaks
   `test_advanced_faults.py` under subint.
3. Subint is stuck in non-checkpointing Python code
   → portal-cancel msg queued but never processed.
4. Subint is in a shielded cancel scope when cancel
   arrives → delay until shield exits.

**Current teardown has a shield-bug too:**
`trio.CancelScope(shield=True)` wrapping the `finally`
block absorbs Ctrl-C, so even when the user tries
to break out they can't. This is the reason
`test_ipc_channel_break_during_stream[break_parent-...
no_msgstream_aclose]` locks up unkillable.

**B.3 hard-kill fix plan (next commit):**

1. Bound `driver_thread.join()` with
   `trio.move_on_after(HARD_KILL_TIMEOUT)` (sketched
   below).
2. If it times out, log a warning naming the
   `interp_id` and switch the driver thread to
   `daemon=True` mode (not actually possible after
   start — so instead create as daemon=True upfront
   and accept the tradeoff of proc-exit not waiting
   for a stuck subint).
3. Best-effort `_interpreters.destroy()`; catch the
   `InterpreterError` if the subint is still running.
4. Document that the leak is real and the only
   escape hatch we have without upstream cooperation.

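Plan item 1 in sketch form (the timeout value and warning text
are illustrative):

```python
import trio

HARD_KILL_TIMEOUT: float = 3.0  # illustrative; not the real constant

async def bounded_driver_join(driver_thread, interp_id: int) -> bool:
    '''Give the driver thread a bounded window to exit before
    abandoning it (plan items 1-2 above); returns False on timeout.
    '''
    with trio.move_on_after(HARD_KILL_TIMEOUT) as cs:
        await trio.to_thread.run_sync(driver_thread.join)
    if cs.cancelled_caught:
        print(f'WARNING: subint {interp_id} driver thread still alive')
        return False
    return True
```
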
**Test plan for Phase B.3:**

New `tests/test_subint_cancellation.py` covering:
- SIGINT at spawn
- SIGINT mid-portal-RPC
- SIGINT during a shielded section in a subint
- Dead-channel cancel (mirror of `test_ipc_channel_
  break_during_stream` minimized)
- Non-checkpointing subint (tight `while True` in
  user code)
- Per-test `pytest-timeout`-style bounds so the
  tests visibly fail instead of wedging the runner

### Sanity-check output (verbatim terminal excerpts)

Post-fix single-test validation:
```
1 passed, 1 warning in 1.06s
```
(same test that was hanging pre-fix:
`test_parent_cancels[...cancel_method=ctx-...False]`)

Full `tests/test_context_stream_semantics.py`
under subint:
```
61 passed, 1 warning in 100.35s (0:01:40)
```
and a clean-cache re-run:
```
61 passed, 1 warning in 100.82s (0:01:40)
```

No regressions on the trio backend (same subset):
```
69 passed, 1 skipped, 3 warnings in 89.19s
```

### Commit msg

Also AI-drafted via `/commit-msg` + `rewrap.py
--width 67`. See `git log -1 26fb820`.

@@ -0,0 +1,111 @@
---
model: claude-opus-4-7
service: claude
session: subint-phase-b-hang-classification
timestamp: 2026-04-20T19:27:39Z
git_ref: HEAD (pre-commit; on branch `subint_spawner_backend`)
scope: docs
substantive: true
raw_file: 20260420T192739Z_5e8cd8b2_prompt_io.raw.md
---

## Prompt

Collab between user (`goodboy`) and `claude` to
disambiguate two distinct hang modes hit during
Phase B subint-spawn-backend bringup (issue #379).
The user ran the failing suites, captured `strace`
evidence on hung pytest pids, and set the framing:

> "These cannot be the same bug. Different fix
> paths. Write them up separately or we'll keep
> conflating them."

Follow-on asks:
- Cross-link each doc from its reproducer test so
  a future triager lands on the analysis without
  needing to grep `ai/conc-anal/`.
- On `test_stale_entry_is_deleted`: arm
  `dump_on_hang(seconds=20, path=...)` so a
  regression captures a stack dump (pytest stderr
  capture otherwise eats `faulthandler` output).
- Keep `test_stale_entry_is_deleted` un-`skip`ped
  so the dump file remains inspectable.

Defer the actual fix for the class-2 hang
(Ctrl-C-able, tractor-side bug) to a later commit
per the incremental-commits preference.

## Response summary

Wrote two new `ai/conc-anal/` classification docs
and cross-linked them from the reproducer tests:

1. **`subint_sigint_starvation_issue.md`** —
   class 1: abandoned-legacy-subint thread +
   shared GIL starves the main trio loop →
   signal-wakeup-fd pipe fills → SIGINT silently
   dropped (`write() = EAGAIN`). Pytest process
   un-Ctrl-C-able. Structurally a CPython limit;
   blocked on `msgspec` PEP 684 support
   (jcrist/msgspec#563). Reproducer:
   `test_stale_entry_is_deleted[subint]`.

2. **`subint_cancel_delivery_hang_issue.md`** —
   class 2: a parent-side trio task parks on an
   orphaned IPC channel after subint teardown;
   no clean EOF delivered to the waiting receiver.
   Ctrl-C-able (main trio loop iterating fine).
   OUR bug to fix. Candidate fix: explicit
   parent-side channel abort in `subint_proc`'s
   hard-kill teardown. Reproducer:
   `test_subint_non_checkpointing_child`.

Test-side cross-links:
- `tests/discovery/test_registrar.py`:
  `test_stale_entry_is_deleted` → `trio.run(main)`
  wrapped in `dump_on_hang(seconds=20,
  path=<per-method-tmp>)`; a long inline comment
  summarizes the `strace` evidence + root-cause chain
  and points at both docs.
- `tests/test_subint_cancellation.py`:
  `test_subint_non_checkpointing_child` docstring
  extended with a "KNOWN ISSUE (Ctrl-C-able hang)"
  section pointing at the class-2 doc + noting
  the class-1 doc is NOT what this test hits.

## Files changed

- `ai/conc-anal/subint_sigint_starvation_issue.md`
  — new, 205 LOC
- `ai/conc-anal/subint_cancel_delivery_hang_issue.md`
  — new, 161 LOC
- `tests/discovery/test_registrar.py` — +52/-1
  (arm `dump_on_hang`, inline-comment cross-link)
- `tests/test_subint_cancellation.py` — +26
  (docstring "KNOWN ISSUE" block)

## Human edits

Substantive collab — prose was jointly iterated:

- User framed the two-doc split, set the
  classification criteria (Ctrl-C-able vs not),
  and provided the `strace` evidence.
- User decided to keep `test_stale_entry_is_deleted`
  un-`skip`ped (my initial suggestion was
  `pytestmark.skipif(spawn_backend=='subint')`).
- User chose the candidate fix ordering for
  class 2 and marked "explicit parent-side channel
  abort" as the surgical preferred fix.
- User picked the file naming convention
  (`subint_<hang-shape>_issue.md`) over my initial
  `hang_class_{1,2}.md`.
- Assistant drafted the prose, aggregated prior-
  session root-cause findings from Phase B.2/B.3
  bringup, and wrote the test-side cross-linking
  comments.

No further mechanical edits expected before
commit; user may still rewrap via
`scripts/rewrap.py` if preferred.

@@ -0,0 +1,198 @@
---
model: claude-opus-4-7
service: claude
timestamp: 2026-04-20T19:27:39Z
git_ref: HEAD (pre-commit; will land on branch `subint_spawner_backend`)
diff_cmd: git diff HEAD~1..HEAD
---

Collab between `goodboy` (user) and `claude` (this
assistant) spanning multiple test-run iterations on
branch `subint_spawner_backend`. The user ran the
failing suites, captured `strace` evidence on the
hung pytest pids, and set the direction ("these are
two different hangs — write them up separately so
we don't re-confuse ourselves later"). The assistant
aggregated prior-session findings (Phase B.2/B.3
bringup) into two classification docs + test-side
cross-links. All prose was jointly iterated; the
user had final say on framing and decided which
candidate fix directions to list.

## Per-file generated content

### `ai/conc-anal/subint_sigint_starvation_issue.md` (new, 205 LOC)

> `git diff HEAD~1..HEAD -- ai/conc-anal/subint_sigint_starvation_issue.md`

Writes up the "abandoned-legacy-subint thread wedges
the parent trio loop" class. Key sections:

- **Symptom** — `test_stale_entry_is_deleted[subint]`
  hangs indefinitely AND is un-Ctrl-C-able.
- **Evidence** — annotated `strace` excerpt showing
  SIGINT delivered to pytest, the C-level signal handler
  tries to write to the signal-wakeup-fd pipe, gets
  `write() = -1 EAGAIN (Resource temporarily
  unavailable)`. The pipe is full because the main trio
  loop isn't iterating often enough to drain it.
- **Root-cause chain** — our hard-kill abandons the
  `daemon=True` driver OS thread after
  `_HARD_KILL_TIMEOUT`; the subint *inside* that
  thread is still running `trio.run()`;
  `_interpreters.destroy()` cannot force-stop a
  running subint (raises `InterpreterError`); legacy
  subints share the main GIL → abandoned subint
  starves main trio loop → wakeup-fd fills → SIGINT
  silently dropped.
- **Why it's structurally a CPython limit** — no
  public force-destroy primitive for a running
  subint; the only escape is per-interpreter GIL
  isolation, gated on msgspec PEP 684 adoption
  (jcrist/msgspec#563).
- **Current escape hatch** — harness-side SIGINT
  loop in the `daemon` fixture teardown that kills
  the bg registrar subproc, eventually unblocking
  a parent-side recv enough for the main loop to
  drain the wakeup pipe.

### `ai/conc-anal/subint_cancel_delivery_hang_issue.md` (new, 161 LOC)

> `git diff HEAD~1..HEAD -- ai/conc-anal/subint_cancel_delivery_hang_issue.md`

Writes up the *sibling* hang class — same subint
backend, distinct root cause:

- **TL;DR** — Ctrl-C-able, so NOT the SIGINT-
  starvation class; main trio loop iterates fine;
  ours to fix.
- **Symptom** — `test_subint_non_checkpointing_child`
  hangs past the expected `_HARD_KILL_TIMEOUT`
  budget even after the subint is torn down.
- **Diagnosis** — a parent-side trio task (likely
  a `chan.recv()` in `process_messages`) parks on
  an orphaned IPC channel; the channel was torn down
  without emitting a clean EOF /
  `BrokenResourceError` to the waiting receiver.
- **Candidate fix directions** — listed in rough
  order of preference:
  1. Explicit parent-side channel abort in
     `subint_proc`'s hard-kill teardown (surgical;
     most likely).
  2. Audit `process_messages` to add a timeout or
     cancel-scope protection that catches the
     orphaned-recv state.
  3. Wrap subint IPC channel construction in a
     sentinel that can force-close from the parent
     side regardless of subint liveness.

### `tests/discovery/test_registrar.py` (modified, +52/-1 LOC)

> `git diff HEAD~1..HEAD -- tests/discovery/test_registrar.py`

Wraps the `trio.run(main)` call at the bottom of
`test_stale_entry_is_deleted` in
`dump_on_hang(seconds=20, path=<per-method-tmp>)`.
Adds a long inline comment that:
- Enumerates variant-by-variant status
  (`[trio]`/`[mp_*]` = clean; `[subint]` = hangs
  + un-Ctrl-C-able)
- Summarizes the `strace` evidence and root-cause
  chain inline (so a future reader hitting this
  test doesn't need to cross-ref the doc to
  understand the hang shape)
- Points at
  `ai/conc-anal/subint_sigint_starvation_issue.md`
  for full analysis
- Cross-links to the *sibling*
  `subint_cancel_delivery_hang_issue.md` so
  readers can tell the two classes apart
- Explains why it's kept un-`skip`ped: the dump
  file is useful if the hang ever returns after
  a refactor. pytest's stderr capture would
  otherwise eat `faulthandler` output, hence the
  file path. (A plausible `dump_on_hang` shape is
  sketched below.)

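`dump_on_hang` is a project-side helper; a plausible shape,
assuming it simply arms/disarms `faulthandler` (not the actual
tractor implementation):

```python
import faulthandler
from contextlib import contextmanager

@contextmanager
def dump_on_hang(seconds: int, path: str):
    # arm a delayed traceback dump to `path`; disarm on clean exit
    with open(path, 'w') as f:
        faulthandler.dump_traceback_later(seconds, repeat=False, file=f)
        try:
            yield
        finally:
            faulthandler.cancel_dump_traceback_later()

# usage, mirroring the test wiring described above:
# with dump_on_hang(seconds=20, path='/tmp/registrar_hang.txt'):
#     trio.run(main)
```
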
### `tests/test_subint_cancellation.py` (modified, +26 LOC)

> `git diff HEAD~1..HEAD -- tests/test_subint_cancellation.py`

Extends the docstring of
`test_subint_non_checkpointing_child` with a
"KNOWN ISSUE (Ctrl-C-able hang)" block:
- Describes the current hang: parent-side orphaned
  IPC recv after hard-kill; distinct from the
  SIGINT-starvation sibling class.
- Cites the `strace` distinguishing signal: wakeup-fd
  `write() = 1` (not `EAGAIN`) — i.e. the main loop
  is iterating.
- Points at
  `ai/conc-anal/subint_cancel_delivery_hang_issue.md`
  for full analysis + candidate fix directions.
- Clarifies that the *other* sibling doc
  (SIGINT-starvation) is NOT what this test hits.

## Non-code output

### Classification reasoning (why two docs, not one)

The user and I converged on the two-doc split after
running the suites and noticing two *qualitatively
different* hang symptoms:

1. `test_stale_entry_is_deleted[subint]` — pytest
   process un-Ctrl-C-able. Ctrl-C at the terminal
   does nothing. Must kill-9 from another shell.
2. `test_subint_non_checkpointing_child` — pytest
   process Ctrl-C-able. One Ctrl-C at the prompt
   unblocks cleanly and the test reports a hang
   via pytest-timeout.

From the user: "These cannot be the same bug.
Different fix paths. Write them up separately or
we'll keep conflating them."

`strace` on the `[subint]` hang gave the decisive
signal for the first class:

```
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
write(5, "\2", 1) = -1 EAGAIN (Resource temporarily unavailable)
```

fd 5 is Python's signal-wakeup-fd pipe. `EAGAIN`
on a `write()` of 1 byte to a pipe means the pipe
buffer is full → the reader side (main Python thread
inside `trio.run()`) isn't consuming. That's the
GIL-hostage signature.

The second class's `strace` showed `write(5, "\2",
1) = 1` — a clean drain — so the main trio loop was
iterating and the hang had to be on the application
side of things, not the kernel-↔-Python signal
boundary.

### Why the candidate fix for class 2 is "explicit parent-side channel abort"

The second hang class has the trio loop alive. A
parked `chan.recv()` that will never get bytes is
fundamentally a tractor-side resource-lifetime bug
— the IPC channel was torn down (subint destroyed)
but no one explicitly raised
`BrokenResourceError` at the parent-side receiver.
The `subint_proc` hard-kill path is the natural
place to add that notification, because it already
knows the subint is unreachable at that point.

Alternative fix paths (blanket timeouts on
`process_messages`, sentinel-wrapped channels) are
less surgical and risk masking unrelated bugs —
hence the preference ordering in the doc.

### Why we're not just patching the code now

The user explicitly deferred the fix to a later
commit: "Document both classes now, land the fix
for class 2 separately so the diff reviews clean."
This matches the incremental-commits preference
from memory.

@@ -0,0 +1,155 @@
---
model: claude-opus-4-7
service: claude
session: subints-phase-b-hardening-and-fork-block
timestamp: 2026-04-22T20:07:23Z
git_ref: 797f57c
scope: code
substantive: true
raw_file: 20260422T200723Z_797f57c_prompt_io.raw.md
---

## Prompt

Session-spanning work on the Phase B `subint` spawn-backend.
Three distinct sub-phases in one log:

1. **Py3.13 gate tightening** — diagnose a reproducible hang
   of the subint spawn flow under py3.13 (works on py3.14),
   trace it to a private `_interpreters` module vintage issue,
   tighten our feature gate from "`_interpreters` present" to
   "public `concurrent.interpreters` present" (i.e. py3.14+).
2. **Test-harness hardening** — add the `pytest-timeout` dep,
   put `@pytest.mark.timeout(30, method='thread')` on the
   three known-hanging subint tests cataloged in
   `ai/conc-anal/subint_sigint_starvation_issue.md`. Separately,
   code-review the user's in-flight `skipon_spawn_backend`
   marker implementation; find four bugs; refactor to use
   `item.iter_markers()`.
3. **`subint_fork` prototype → CPython-block finding** — draft
   a WIP `subint_fork_proc` backend using a sub-interpreter as
   a launchpad for `os.fork()` (to sidestep trio#1614). User
   tests on py3.14, hits
   `Fatal Python error: _PyInterpreterState_DeleteExceptMain:
   not main interpreter`. Walk the CPython sources (local clone
   at `~/repos/cpython/`) to pinpoint the refusal
   (`Modules/posixmodule.c:728` → `Python/pystate.c:1040`).
   Revert the implementation to a `NotImplementedError` stub in
   a new `_subint_fork.py` submodule, document the finding in a
   third `conc-anal/` doc with an upstream-report draft for
   the CPython issue tracker. Finally, discuss the user's
   proposed workaround architecture (main-interp worker-thread
   forkserver) and draft a standalone smoke-test script for
   feasibility validation.

## Response summary

All three sub-phases landed concrete artifacts:

**Sub-phase 1** — `_subint.py` + `_spawn.py` gates + error
messages updated to require py3.14+ via the public
`concurrent.interpreters` module presence check. Module
docstring revised to explain the empirical reason
(py3.13's private `_interpreters` vintage wedges under
multi-trio-task usage even though minimal standalone
reproducers work fine there). Test-module
`pytest.importorskip` likewise switched.

**Sub-phase 2** — `pytest-timeout>=2.3` added to the
`testing` dep group. `@pytest.mark.timeout(30, method='thread')`
applied on:
- `tests/discovery/test_registrar.py::test_stale_entry_is_deleted`
- `tests/test_cancellation.py::test_cancel_while_childs_child_in_sync_sleep`
- `tests/test_cancellation.py::test_multierror_fast_nursery`
- `tests/test_subint_cancellation.py::test_subint_non_checkpointing_child`

`method='thread'` documented inline as load-bearing — the
GIL-starvation path that drops `SIGINT` would equally drop
`SIGALRM`, so only a watchdog-thread timeout can reliably
escape.

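For reference, the applied mark shape (test body elided):

```python
import pytest

# `method='thread'` arms a watchdog thread instead of SIGALRM;
# the class-A GIL-starvation hang drops signals, so only the
# thread method can reliably fire.
@pytest.mark.timeout(30, method='thread')
def test_subint_non_checkpointing_child():
    ...
```
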
`skipon_spawn_backend` plugin refactored into a single
`iter_markers`-driven loop in `pytest_collection_modifyitems`
(~30 LOC replacing ~30 LOC of nested conditionals). Four
bugs dissolved: wrong `.get()` key, module-level `pytestmark`
suppressing per-test marks, unhandled `pytestmark = [list]`
form, `pytest.Makr` typo. Marker help text updated to
document the variadic backend-list + `reason=` kwarg
surface.

**Sub-phase 3** — Prototype drafted (then reverted):

- `tractor/spawn/_subint_fork.py` — new dedicated submodule
  housing the `subint_fork_proc` stub. Module docstring +
  fn docstring explain the attempt, the CPython-level
  block, and the reason for keeping the stub in-tree
  (documentation of the attempt + starting point if CPython
  ever lifts the restriction).
- `tractor/spawn/_spawn.py` — `'subint_fork'` registered as a
  `SpawnMethodKey` literal + in `_methods`, so
  `--spawn-backend=subint_fork` routes to a clean
  `NotImplementedError` pointing at the analysis doc rather
  than an "invalid backend" error.
- `ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md` —
  third sibling conc-anal doc. Full annotated CPython
  source walkthrough from user-visible
  `Fatal Python error` → `Modules/posixmodule.c:728
  PyOS_AfterFork_Child()` → `Python/pystate.c:1040
  _PyInterpreterState_DeleteExceptMain()` gate. Includes a
  copy-paste-ready upstream-report draft for the CPython
  issue tracker with a two-tier ask (ideally "make it work",
  minimally "cleaner error than `Fatal Python error`
  aborting the child").
- `ai/conc-anal/subint_fork_from_main_thread_smoketest.py` —
  standalone zero-tractor-import CPython-level smoke test
  for the user's proposed workaround architecture
  (forkserver on a main-interp worker thread). Four
  argparse-driven scenarios: `control_subint_thread_fork`
  (reproduces the known-broken case as a test-harness
  sanity), `main_thread_fork` (baseline), `worker_thread_fork`
  (architectural assertion), `full_architecture`
  (end-to-end trio-in-subint in forked child). User will
  run on py3.14 next.

## Files changed

See `git log 26fb820..HEAD --stat` for the canonical list.
New files this session:
- `tractor/spawn/_subint_fork.py`
- `ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md`
- `ai/conc-anal/subint_fork_from_main_thread_smoketest.py`

Modified (diff pointers in raw log):
- `tractor/spawn/_subint.py` (py3.14 gate)
- `tractor/spawn/_spawn.py` (`subint_fork` registration)
- `tractor/_testing/pytest.py` (`skipon_spawn_backend` refactor)
- `pyproject.toml` (`pytest-timeout` dep)
- `tests/discovery/test_registrar.py`,
  `tests/test_cancellation.py`,
  `tests/test_subint_cancellation.py` (timeout marks,
  cross-refs to conc-anal docs)

## Human edits

Several back-and-forth iterations with user-driven
adjustments during the session:

- User corrected my initial mis-classification of
  `test_cancel_while_childs_child_in_sync_sleep[subint-False]`
  as Ctrl-C-able — a second strace showed `EAGAIN`, putting
  it squarely in class A (GIL-starvation). Re-analysis
  preserved in the raw log.
- User independently fixed the `.get(reason)` → `.get('reason', reason)`
  bug in the marker plugin before my review; preserved their
  fix.
- User suggested moving the `subint_fork_proc` stub from
  the bottom of `_subint.py` into its own
  `_subint_fork.py` submodule — applied.
- User asked to keep the forkserver-architecture
  discussion as background for the smoke-test rather than
  committing to a tractor-side refactor until the smoke
  test validates the CPython-level assumptions.

Commit messages in this range (b025c982 … 797f57c) were
drafted via `/commit-msg` + `rewrap.py --width 67`; user
landed them with the usual review.

@@ -0,0 +1,343 @@
---
model: claude-opus-4-7
service: claude
timestamp: 2026-04-22T20:07:23Z
git_ref: 797f57c
diff_cmd: git log 26fb820..HEAD  # all session commits since the destroy-race fix log
---

Session-spanning conversation covering the Phase B hardening
of the `subint` spawn-backend and an investigation into a
proposed `subint_fork` follow-up which turned out to be
blocked at the CPython level. This log is a narrative capture
of the substantive turns (not every message) and references
the concrete code + docs the session produced. Per diff-ref
mode the actual code diffs are pointed at via `git log` on
each ref rather than duplicated inline.

## Narrative of the substantive turns

### Py3.13 hang / gate tightening

Diagnosed a reproducible hang of the `subint` backend under
py3.13 (test_spawning tests wedge after root-actor bringup).
Root cause: py3.13's vintage of the private `_interpreters` C
module has a latent thread/subint interaction issue whereby
`_interpreters.exec()` silently fails to progress under
tractor's multi-trio usage pattern — even though a minimal
standalone `threading.Thread` + `_interpreters.exec()`
reproducer works fine on the same Python. Empirically
py3.14 fixes it.

Fix (from this session): tighten the `_has_subints` gate in
`tractor.spawn._subint` from "private module importable" to
"public `concurrent.interpreters` present" — which is 3.14+
only. This leaves `subint_proc()` unchanged in behavior (we
still call the *private* `_interpreters.create('legacy')`
etc. under the hood) but refuses to engage on 3.13.

Also tightened the matching gate in
`tractor.spawn._spawn.try_set_start_method('subint')` and
rev'd the corresponding error messages from "3.13+" to
"3.14+" with a sentence explaining why. Test-module
`pytest.importorskip` switched from `_interpreters` →
`concurrent.interpreters` to match.

### `pytest-timeout` dep + `skipon_spawn_backend` marker plumbing

Added `pytest-timeout>=2.3` to the `testing` dep group with
an inline comment pointing at the `ai/conc-anal/*.md` docs.
Applied `@pytest.mark.timeout(30, method='thread')` (the
`method='thread'` is load-bearing — `signal`-method
`SIGALRM` suffers the same GIL-starvation path that drops
`SIGINT` in the class-A hang pattern) to the three known-
hanging subint tests cataloged in
`subint_sigint_starvation_issue.md`.

Separately code-reviewed the user's newly-staged
`skipon_spawn_backend` pytest marker implementation in
`tractor/_testing/pytest.py`. Found four bugs:

1. `modmark.kwargs.get(reason)` called `.get()` with the
   *variable* `reason` as the dict key instead of the string
   `'reason'` — user-supplied `reason=` was never picked up.
   (User had already fixed this locally via `.get('reason',
   reason)` by the time my review happened — preserved that
   fix.)
2. The module-level `pytestmark` branch suppressed per-test
   marker handling (the per-test path sat in an `else:`
   branch rather than iterating independently).
3. `mod_pytestmark.mark` assumed a single
   `MarkDecorator` — broke on the valid-pytest `pytestmark =
   [mark, mark]` list form.
4. Typo: `pytest.Makr` → `pytest.Mark`.

Refactored the hook to use `item.iter_markers(name=...)`,
which walks function + class + module scopes uniformly and
handles both `pytestmark` forms natively. ~30 LOC replaced
the original ~30 LOC of nested conditionals, and all four
bugs dissolved. Also updated the marker help string to
reflect the variadic `*start_methods` + `reason=` surface.

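The refactored hook, in sketch form (option and kwarg
spellings assumed from the surrounding prose):

```python
import pytest

def pytest_collection_modifyitems(config, items):
    backend = config.option.spawn_backend  # assumed option name
    for item in items:
        # iter_markers() walks function, class and module scope
        # uniformly and handles both `pytestmark = mark` and
        # `pytestmark = [mark, ...]` forms natively
        for mark in item.iter_markers(name='skipon_spawn_backend'):
            if backend in mark.args:
                reason = mark.kwargs.get(
                    'reason',
                    f'not supported on spawn backend {backend!r}',
                )
                item.add_marker(pytest.mark.skip(reason=reason))
```
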
### `subint_fork_proc` prototype attempt

User's hypothesis: the known trio+`fork()` issues
(python-trio/trio#1614) could be sidestepped by using a
sub-interpreter purely as a launchpad — `os.fork()` from a
subint that has never imported trio → the child is in a
trio-free context. In the child, `execv()` back into
`python -m tractor._child` and the downstream handshake
matches `trio_proc()` identically.

Drafted the prototype at `tractor/spawn/_subint.py`'s bottom
(originally — later moved to its own submod, see below):
launchpad-subint creation, bootstrap code-string with
`os.fork()` + `execv()`, driver-thread orchestration,
parent-side `ipc_server.wait_for_peer()` dance. Registered
`'subint_fork'` as a new `SpawnMethodKey` literal, added a
`case 'subint' | 'subint_fork':` feature-gate arm in
`try_set_start_method()`, and added an entry in the
`_methods` dict.

### CPython-level block discovered

User tested on py3.14 and saw:

```
Fatal Python error: _PyInterpreterState_DeleteExceptMain: not main interpreter
Python runtime state: initialized

Current thread 0x00007f6b71a456c0 [subint-fork-lau] (most recent call first):
  File "<script>", line 2 in <module>
<script>:2: DeprecationWarning: This process (pid=802985) is multi-threaded, use of fork() may lead to deadlocks in the child.
```

Walked the CPython sources (local clone at `~/repos/cpython/`):

- **`Modules/posixmodule.c:728` `PyOS_AfterFork_Child()`** —
  post-fork child-side cleanup. Calls
  `_PyInterpreterState_DeleteExceptMain(runtime)` with
  `goto fatal_error` on non-zero status. Directly above it
  sits the self-acknowledging, fragile comment
  `// Ideally we could guarantee tstate is running main.`

- **`Python/pystate.c:1040`
  `_PyInterpreterState_DeleteExceptMain()`** — the
  refusal. Hard `PyStatus_ERR("not main interpreter")` gate
  when `tstate->interp != interpreters->main`. The docstring
  formally declares the precondition ("If there is a
  current interpreter state, it *must* be the main
  interpreter"). `XXX` comments acknowledge further latent
  issues within.

Definitive answer to "Open Question 1" of the prototype
docstring: **no, CPython does not support `os.fork()` from
a non-main sub-interpreter**. Not because the fork syscall
is blocked (it isn't — the parent returns a valid pid),
but because the child cannot survive CPython's post-fork
initialization. This is an enforced invariant, not an
incidental limitation.

### Revert: move to stub submod + doc the finding

Per user request:

1. Reverted the working `subint_fork_proc` body to a
   `NotImplementedError` stub, MOVED to its own submod
   `tractor/spawn/_subint_fork.py` (keeps `_subint.py`
   focused on the working `subint_proc` backend).
2. Updated `_spawn.py` to import the stub from the new
   submod path; kept `'subint_fork'` in `SpawnMethodKey` +
   `_methods` so `--spawn-backend=subint_fork` routes to a
   clean `NotImplementedError` with a pointer to the analysis
   doc rather than an "invalid backend" error.
3. Wrote
   `ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md`
   with the full annotated CPython walkthrough + an
   upstream-report draft for the CPython issue tracker.
   The draft has a two-tier ask: ideally "make it work"
   (pre-fork tstate-swap hook or `DeleteExceptFor(interp)`
   variant), minimally "give us a clean `RuntimeError` in
   the parent instead of a `Fatal Python error` aborting
   the child silently".


### Design discussion — main-interp-thread forkserver workaround

User proposed: set up a "subint forking server" that fork()s
on behalf of subint callers. Core insight: the CPython gate
is on `tstate->interp`, not thread identity, so **any thread
whose tstate is main-interp** can fork cleanly. A worker
thread attached to main-interp (never entering a subint)
satisfies the precondition.
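
A rough in-process sketch of that idea (all names are
hypothetical; tractor's real spawn machinery is not shown).
An `os.pipe()` pair carries the handoff since fds are
process-global; note that module-level state is
per-interpreter, so real code must pass the fd numbers into
the subint (e.g. via the bootstrap code-string) rather than
relying on a shared import:

```python
import os
import threading

# request/response pipes; the fd numbers must be handed to
# subint callers explicitly since imports don't share state
# across interpreters.
req_r, req_w = os.pipe()
resp_r, resp_w = os.pipe()

def forkserver_loop() -> None:
    # runs on a thread whose tstate belongs to the *main*
    # interpreter and never enters a subint, so `os.fork()`
    # here satisfies the `tstate->interp == main` gate.
    with os.fdopen(req_r) as reqs:
        for line in reqs:
            argv: list[str] = line.split()
            pid: int = os.fork()
            if pid == 0:
                # child: exec the subactor entrypoint
                os.execv(argv[0], argv)
            # parent: report the child's pid to the requester
            os.write(resp_w, f'{pid}\n'.encode())

def spawn_via_forkserver(argv: list[str]) -> int:
    # callable from subint-hosted code: the fork itself never
    # runs on the caller's (subint) tstate.
    os.write(req_w, (' '.join(argv) + '\n').encode())
    buf = b''
    while not buf.endswith(b'\n'):
        buf += os.read(resp_r, 1)
    return int(buf)

threading.Thread(target=forkserver_loop, daemon=True).start()
```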

Structurally this is `mp.forkserver` (which tractor already
has as `mp_forkserver`) but **in-process**: instead of a
separate Python subproc as the fork server, we'd put the
forkserver on a thread in the tractor parent process. Pros:
faster spawn (no IPC marshalling to an external server + no
separate Python startup) and already-imported modules are
inherited for free. Cons: less crash isolation (a forkserver
failure takes down the whole process).

Required tractor-side refactor: move the root actor's
`trio.run()` off the main-interp main-thread (so the main
thread can run the forkserver loop). Nontrivial;
approximately the same magnitude as "Phase C".

The design would also not fully resolve the class-A
GIL-starvation issue because child actors' trio still runs
inside subints (legacy config, `msgspec` PEP 684 support
pending). It would mitigate SIGINT-starvation specifically
if signal handling moves to the forkserver thread.

Recommended pre-commitment: a standalone CPython-only smoke
test validating the four assumptions the arch rests on,
before any tractor-side work.

### Smoke-test script drafted

Wrote `ai/conc-anal/subint_fork_from_main_thread_smoketest.py`:
argparse-driven, four scenarios (`control_subint_thread_fork`
reproducing the known-broken case, `main_thread_fork` as the
baseline, `worker_thread_fork` as the architectural
assertion, and `full_architecture` end-to-end with trio in a
subint in the forked child). No `tractor` imports; pure
CPython + `_interpreters` + `trio`. Bails cleanly on
py<3.14. Pass/fail banners per scenario.
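
A minimal standalone version of the `worker_thread_fork`
assertion, i.e. that a plain worker thread (whose tstate is
still main-interp) can fork and the child survives; this is
an illustrative sketch, not the script's actual code:

```python
import os
import threading

def worker_thread_fork() -> None:
    # fork from a worker thread whose tstate is still the
    # main interpreter: per the pystate.c gate this should
    # succeed, child included (a multi-threaded-fork
    # DeprecationWarning may still be emitted).
    pid = os.fork()
    if pid == 0:
        # child survives PyOS_AfterFork_Child() here
        print(f'child alive pid={os.getpid()}', flush=True)
        os._exit(0)
    _, status = os.waitpid(pid, 0)
    assert os.WIFEXITED(status) and os.WEXITSTATUS(status) == 0
    print('[PASS] worker_thread_fork')

t = threading.Thread(target=worker_thread_fork)
t.start()
t.join()
```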

User will validate on their py3.14 env next.

## Per-code-artifact provenance

### `tractor/spawn/_subint_fork.py` (new submod)

> `git show 797f57c -- tractor/spawn/_subint_fork.py`

`NotImplementedError` stub for the subint-fork backend.
Module docstring + fn docstring explain the attempt, the
CPython block, and why the stub is kept in-tree. No runtime
behavior beyond raising with a pointer at the conc-anal doc.

### `tractor/spawn/_spawn.py` (modified)

> `git log 26fb820..HEAD -- tractor/spawn/_spawn.py`

- Added `'subint_fork'` to the `SpawnMethodKey` literal with
  a block comment explaining the CPython-level block.
- Generalized the `case 'subint':` arm to
  `case 'subint' | 'subint_fork':` since both use the same
  py3.14+ gate.
- Registered `subint_fork_proc` in `_methods` with a
  pointer-comment at the analysis doc.

### `tractor/spawn/_subint.py` (modified across session)

> `git log 26fb820..HEAD -- tractor/spawn/_subint.py`

- Tightened the `_has_subints` gate: it now dual-requires
  the public `concurrent.interpreters` + the private
  `_interpreters` module (tests for py3.14-or-newer via the
  public-API presence, then uses the private one for
  legacy-config subints because `msgspec` still blocks the
  public isolated mode per jcrist/msgspec#563); a sketch of
  the gate follows after this list.
- Updated the module docstring, the `subint_proc()`
  docstring, and the gate-error messages to reflect the
  3.14+ requirement and the reason (py3.13 wedges under
  multi-trio usage even though the private module exists
  there).
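
A minimal sketch of that dual-presence gate, assuming only
that both import names behave as described above (the real
implementation in `_subint.py` may differ):

```python
def _has_subints() -> bool:
    # public-API presence implies py3.14+; the private mod
    # is what legacy-config subint creation actually uses
    # (msgspec blocks the public isolated mode for now).
    try:
        import concurrent.interpreters  # noqa: F401
        import _interpreters  # noqa: F401
    except ImportError:
        return False
    return True
```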

### `tractor/_testing/pytest.py` (modified)

> `git log 26fb820..HEAD -- tractor/_testing/pytest.py`

- New `skipon_spawn_backend(*start_methods, reason=...)`
  pytest marker, expanded into `pytest.mark.skip(reason=...)`
  at collection time via `pytest_collection_modifyitems()`.
- The implementation uses `item.iter_markers(name=...)`,
  which walks function + class + module scopes uniformly and
  handles both the `pytestmark = <single Mark>` and
  `pytestmark = [mark, ...]` forms natively. An ~30-LOC
  single-loop refactor replacing a prior nested conditional
  that had four bugs (see the "Review" narrative above).
- Added `pytest.Config` / `pytest.Function` /
  `pytest.FixtureRequest` type annotations on fixture
  signatures while touching the file. A sketch of the
  marker-expansion hook follows after this list.
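
A sketch of how such a collection-time expansion hook can
look; the marker name is from the notes above while the
`--spawn-backend` option plumbing is an assumption:

```python
import pytest

def pytest_collection_modifyitems(
    config: pytest.Config,
    items: list[pytest.Item],
) -> None:
    # the active backend; assumes a `--spawn-backend` option
    # is registered elsewhere in the plugin.
    backend: str = config.option.spawn_backend
    for item in items:
        # walks fn + class + module scoped marks uniformly
        for mark in item.iter_markers(name='skipon_spawn_backend'):
            if backend in mark.args:
                reason: str = mark.kwargs.get(
                    'reason',
                    f'not supported on {backend!r} backend',
                )
                item.add_marker(pytest.mark.skip(reason=reason))
```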

### `pyproject.toml` (modified)

> `git log 26fb820..HEAD -- pyproject.toml`

Added `pytest-timeout>=2.3` to the `testing` dep group with
a comment pointing at the `ai/conc-anal/` docs.

### `tests/discovery/test_registrar.py`, `tests/test_subint_cancellation.py`, `tests/test_cancellation.py` (modified)

> `git log 26fb820..HEAD -- tests/`

Applied `@pytest.mark.timeout(30, method='thread')` on the
known-hanging subint tests. Extended comments to
cross-reference the `ai/conc-anal/*.md` docs.
`method='thread'` is documented inline as load-bearing (the
`signal`-method SIGALRM suffers the same GIL-starvation path
that drops SIGINT).
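
Usage shape on an affected test (illustrative test name):

```python
import pytest

@pytest.mark.timeout(30, method='thread')
# ^the `thread` method is load-bearing: the default
# `signal`-method SIGALRM gets starved on the same
# GIL-contended path that drops SIGINT under subints.
def test_subint_cancel_hang_audit():
    ...
```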

### `ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md` (new)

> `git show 797f57c -- ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md`

Third sibling doc under `conc-anal/`. Structure: TL;DR,
context ("what we tried"), symptom (the user's exact
`Fatal Python error` output), a CPython source walkthrough
with excerpted snippets from `posixmodule.c` + `pystate.c`,
a chain summary, the definitive answer to Open Question 1,
an `## Upstream-report draft (for CPython issue tracker)`
section with the two-tier ask, and references.

### `ai/conc-anal/subint_fork_from_main_thread_smoketest.py` (new, THIS turn)

Zero-tractor-import smoke test for the proposed workaround
architecture. Four argparse-driven scenarios covering the
control case + baseline + arch-critical case + end-to-end.
Pass/fail banners per scenario; clean `--help` output;
py3.13 early-exit.

## Non-code output (verbatim)

### The `strace` signature that kicked off the CPython walkthrough

```
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
write(16, "\2", 1) = -1 EAGAIN (Resource temporarily unavailable)
rt_sigreturn({mask=[WINCH]}) = 139801964688928
```

### Key user quotes framing the direction

> ok actually we get this [fatal error] ... see if you can
> take a look at what's going on, in particular wrt to
> cpython's sources. pretty sure there's a local copy at
> ~/repos/cpython/

(Drove the CPython walkthrough that produced the definitive
refusal chain.)

> is there any reason we can't just sidestep this "must fork
> from main thread in main subint" issue by simply ensuring
> a "subint forking server" is always setup prior to
> invoking trio in a non-main-thread subint ...

(Drove the main-interp-thread-forkserver architectural
discussion + smoke-test script design.)

### CPython source tags for quick jump-back

```
Modules/posixmodule.c:728   PyOS_AfterFork_Child()
Modules/posixmodule.c:753   // Ideally we could guarantee tstate is running main.
Modules/posixmodule.c:778   status = _PyInterpreterState_DeleteExceptMain(runtime);

Python/pystate.c:1040       _PyInterpreterState_DeleteExceptMain()
Python/pystate.c:1044-1047  tstate->interp != main → PyStatus_ERR("not main interpreter")
```

@@ -0,0 +1,27 @@
# AI Prompt I/O Log — claude

This directory tracks prompt inputs and model
outputs for AI-assisted development using
`claude` (Claude Code).

## Policy

Prompt logging follows the
[NLNet generative AI policy][nlnet-ai].
All substantive AI contributions are logged
with:
- Model name and version
- Timestamps
- The prompts that produced the output
- Unedited model output (`.raw.md` files)

[nlnet-ai]: https://nlnet.nl/foundation/policies/generativeAI/

## Usage

Entries are created by the `/prompt-io` skill
or automatically via `/commit-msg` integration.

Human contributors remain accountable for all
code decisions. AI-generated content is never
presented as human-authored work.

@@ -0,0 +1,76 @@
ok now i want you to take a look at the most recent commit adding
a `tpt_bind_addrs` to `open_root_actor()` and extend the existing
tests/discovery/test_multiaddr* and friends to use this new param in
at least one suite with parametrizations over,

- `registry_addrs == tpt_bind_addrs`, as in both inputs are the same.
- `set(registry_addrs) >= set(tpt_bind_addrs)`, as in the registry
  addrs include the bind set.
- `registry_addrs != tpt_bind_addrs`, where the reg set is disjoint from
  the bind set in all possible combos you can imagine.

All of the ^above cases should further be parametrized over,
- the root being the registrar,
- a non-registrar root using our bg `daemon` fixture.

once we have a fairly thorough test suite and have flushed out all
bugs and edge cases we want to design a wrapping API which allows
declaring full tree's of actors tpt endpoints using multiaddrs such
that a `dict[str, list[str]]` of actor-name -> multiaddr can be used
to configure a tree of actors-as-services given such an input
"endpoints-table" can be matched with the number of appropriately
named subactore spawns in a `tractor` user-app.

Here is a small example from piker,

- in piker's root conf.toml we define a `[network]` section which can
  define various actor-service-daemon names set to a maddr
  (multiaddress str).

- each actor whether part of the `pikerd` tree (as a sub) or spawned
  in other non-registrar rooted trees (such as `piker chart`) should
  configurable in terms of its `tractor` tpt bind addresses via
  a simple service lookup table,

```toml
[network]
pikerd = [
    '/ip4/127.0.0.1/tcp/6116',  # std localhost daemon-actor tree
    '/uds/run/user/1000/piker/pikerd@6116.sock',  # same but serving UDS
]
chart = [
    '/ip4/127.0.0.1/tcp/3333',  # std localhost daemon-actor tree
    '/uds/run/user/1000/piker/chart@3333.sock',
]
```

We should take whatever common API is needed to support this and
distill it into a

```python
tractor.discovery.parse_endpoints(
) -> dict[
    str,
    list[Address]
    |dict[str, list[Address]]
    # ^recursive case, see below
]:
```

style API which can,

- be re-used easily across dependent projects.
- correctly raise tpt-backend support errors when a maddr specifying
  a unsupport proto is passed.
- be used to handle "tunnelled" maddrs per
  https://github.com/multiformats/py-multiaddr/#tunneling such that
  for any such tunneled maddr-`str`-entry we deliver a data-structure
  which can easily be passed to nested `@acm`s which consecutively
  setup nested net bindspaces for binding the endpoint addrs using
  a combo of our `.ipc.*` machinery and, say for example something like
  https://github.com/svinota/pyroute2, more precisely say for
  managing tunnelled wireguard eps within network-namespaces,
  * https://docs.pyroute2.org/
  * https://docs.pyroute2.org/netns.html

remember to include use of all default `.claude/skills` throughout
this work!

@@ -0,0 +1,34 @@
This is your first big boi, "from GH issue" design, plan and
implement task.

We need to try and add sub-interpreter (aka subint) support per the
issue,

https://github.com/goodboy/tractor/issues/379

Part of this work should include,

- modularizing and thus better organizing the `.spawn.*` subpkg by
  breaking up various backends currently in `spawn._spawn` into
  separate submods where it makes sense.

- add a new `._subint` backend which tries to keep as much of the
  inter-process-isolation machinery in use as possible but with plans
  to optimize for localhost only benefits as offered by python's
  subints where possible.

  * utilizing localhost-only tpts like UDS, shm-buffers for
    performant IPC between subactors but also leveraging the benefits from
    the traditional OS subprocs mem/storage-domain isolation, linux
    namespaces where possible and as available/permitted by whatever
    is happening under the hood with how cpython implements subints.

  * default configuration should encourage state isolation as with
    subprocs, but explicit public escape hatches to enable rigorously
    managed shm channels for high performance apps.

- all tests should be (able to be) parameterized to use the new
  `subints` backend and enabled by flag in the harness using the
  existing `pytest --spawn-backend <spawn-backend>` support offered in
  the `open_root_actor()` and `.testing._pytest` harness override
  fixture.

@@ -420,20 +420,17 @@ Check out our experimental system for `guest`_-mode controlled
 
 
     async def aio_echo_server(
-        to_trio: trio.MemorySendChannel,
-        from_trio: asyncio.Queue,
+        chan: tractor.to_asyncio.LinkedTaskChannel,
     ) -> None:
 
         # a first message must be sent **from** this ``asyncio``
         # task or the ``trio`` side will never unblock from
         # ``tractor.to_asyncio.open_channel_from():``
-        to_trio.send_nowait('start')
+        chan.started_nowait('start')
 
         # XXX: this uses an ``from_trio: asyncio.Queue`` currently but we
         # should probably offer something better.
         while True:
             # echo the msg back
-            to_trio.send_nowait(await from_trio.get())
+            chan.send_nowait(await chan.get())
             await asyncio.sleep(0)

@@ -445,7 +442,7 @@ Check out our experimental system for `guest`_-mode controlled
         # message.
         async with tractor.to_asyncio.open_channel_from(
             aio_echo_server,
-        ) as (first, chan):
+        ) as (chan, first):
 
             assert first == 'start'
             await ctx.started(first)

@@ -504,8 +501,10 @@ Yes, we spawn a python process, run ``asyncio``, start ``trio`` on the
 ``asyncio`` loop, then send commands to the ``trio`` scheduled tasks to
 tell ``asyncio`` tasks what to do XD
 
-We need help refining the `asyncio`-side channel API to be more
-`trio`-like. Feel free to sling your opinion in `#273`_!
+The ``asyncio``-side task receives a single
+``chan: LinkedTaskChannel`` handle providing a ``trio``-like
+API: ``.started_nowait()``, ``.send_nowait()``, ``.get()``
+and more. Feel free to sling your opinion in `#273`_!
 
 
 .. _#273: https://github.com/goodboy/tractor/issues/273

@@ -641,13 +640,15 @@ Help us push toward the future of distributed `Python`.
 - Typed capability-based (dialog) protocols ( see `#196
   <https://github.com/goodboy/tractor/issues/196>`_ with draft work
   started in `#311 <https://github.com/goodboy/tractor/pull/311>`_)
-- We **recently disabled CI-testing on windows** and need help getting
-  it running again! (see `#327
-  <https://github.com/goodboy/tractor/pull/327>`_). **We do have windows
-  support** (and have for quite a while) but since no active hacker
-  exists in the user-base to help test on that OS, for now we're not
-  actively maintaining testing due to the added hassle and general
-  latency..
+- **macOS is now officially supported** and tested in CI
+  alongside Linux!
+- We **recently disabled CI-testing on windows** and need
+  help getting it running again! (see `#327
+  <https://github.com/goodboy/tractor/pull/327>`_). **We do
+  have windows support** (and have for quite a while) but
+  since no active hacker exists in the user-base to help
+  test on that OS, for now we're not actively maintaining
+  testing due to the added hassle and general latency..
 
 
 Feel like saying hi?

@@ -17,6 +17,7 @@ from tractor import (
     MsgStream,
     _testing,
     trionics,
+    TransportClosed,
 )
 import trio
 import pytest

@@ -208,12 +209,16 @@ async def main(
             # TODO: is this needed or no?
             raise
 
-        except trio.ClosedResourceError:
+        except (
+            trio.ClosedResourceError,
+            TransportClosed,
+        ) as _tpt_err:
             # NOTE: don't send if we already broke the
             # connection to avoid raising a closed-error
             # such that we drop through to the ctl-c
             # mashing by user.
-            await trio.sleep(0.01)
+            with trio.CancelScope(shield=True):
+                await trio.sleep(0.01)
 
         # timeout: int = 1
         # with trio.move_on_after(timeout) as cs:

@@ -247,6 +252,7 @@ async def main(
                 await stream.send(i)
             pytest.fail('stream not closed?')
         except (
+            TransportClosed,
             trio.ClosedResourceError,
             trio.EndOfChannel,
         ) as send_err:

@@ -18,15 +18,14 @@ async def aio_sleep_forever():
 
 
 async def bp_then_error(
-    to_trio: trio.MemorySendChannel,
-    from_trio: asyncio.Queue,
+    chan: to_asyncio.LinkedTaskChannel,
 
     raise_after_bp: bool = True,
 
 ) -> None:
 
     # sync with `trio`-side (caller) task
-    to_trio.send_nowait('start')
+    chan.started_nowait('start')
 
     # NOTE: what happens here inside the hook needs some refinement..
     # => seems like it's still `.debug._set_trace()` but

@@ -60,7 +59,7 @@ async def trio_ctx(
         to_asyncio.open_channel_from(
             bp_then_error,
             # raise_after_bp=not bp_before_started,
-        ) as (first, chan),
+        ) as (chan, first),
 
         trio.open_nursery() as tn,
     ):

@@ -20,7 +20,7 @@ async def sleep(
 
 
 async def open_ctx(
-    n: tractor._supervise.ActorNursery
+    n: tractor.runtime._supervise.ActorNursery
 ):
 
     # spawn both actors

@@ -27,12 +27,9 @@ async def main():
     '''
     async with tractor.open_nursery(
         debug_mode=True,
         loglevel='cancel',
         # loglevel='devx',
-    ) as n:
-
-        p0 = await n.start_actor('bp_forever', enable_modules=[__name__])
-        p1 = await n.start_actor('name_error', enable_modules=[__name__])
+    ) as an:
+        p0 = await an.start_actor('bp_forever', enable_modules=[__name__])
+        p1 = await an.start_actor('name_error', enable_modules=[__name__])
 
         # retreive results
         async with p0.open_stream_from(breakpoint_forever) as stream:

@@ -67,7 +67,7 @@ async def main():
     """
     async with tractor.open_nursery(
         debug_mode=True,
-        # loglevel='cancel',
+        loglevel='pdb',
     ) as n:
 
         # spawn both actors

@@ -39,8 +39,8 @@ async def main():
     '''
     async with tractor.open_nursery(
         debug_mode=True,
-        loglevel='devx',
-        enable_transports=['uds'],
+        enable_transports=['uds'],  # TODO, apss this via osenv?
+        loglevel='devx',  # XXX, required for test!
     ) as n:
 
         # spawn both actors

@@ -1,4 +1,3 @@
-
 import trio
 import tractor
 

@@ -9,16 +8,22 @@ async def key_error():
 
 
 async def main():
-    """Root dies
+    '''
+    Root is fail-after-cancelled while blocking and child RPC fails
+    simultaneously.
 
-    """
+    '''
     async with tractor.open_nursery(
         debug_mode=True,
-        loglevel='debug'
+        # loglevel='debug'  # ?XXX required?
     ) as n:
 
         # spawn both actors
         portal = await n.run_in_actor(key_error)
+        print(
+            f'Child is up @ {portal.chan.aid.reprol()}'
+        )
 
 
     # XXX: originally a bug caused by this is where root would enter
     # the debugger and clobber the tty used by the repl even though

@@ -3,6 +3,7 @@ Verify we can dump a `stackscope` tree on a hang.
 
 '''
+import os
 import platform
 import signal
 
 import trio

@@ -31,13 +32,28 @@ async def main(
     from_test: bool = False,
 ) -> None:
 
+    if platform.system() != 'Darwin':
+        tpt = 'uds'
+    else:
+        # XXX, precisely we can't use pytest's tmp-path generation
+        # for tests.. apparently because:
+        #
+        # > The OSError: AF_UNIX path too long in macOS Python occurs
+        # > because the path to the Unix domain socket exceeds the
+        # > operating system's maximum path length limit (around 104
+        #
+        # WHICH IS just, wtf hillarious XD
+        tpt = 'tcp'
+
     async with (
         tractor.open_nursery(
             debug_mode=True,
             enable_stack_on_sig=True,
             # maybe_enable_greenback=False,
-            loglevel='devx',
-            enable_transports=['uds'],
+            loglevel='devx',  # XXX REQUIRED log level!
+            enable_transports=[tpt],
+            # maybe_enable_greenback=True,
+            # ^TODO? maybe a "smarter" way todo all this is how
+            # `modden` does with a rtv serialized through the osenv?
         ) as an,
     ):
         ptl: tractor.Portal = await an.start_actor(

@@ -49,7 +65,9 @@ async def main(
             start_n_shield_hang,
         ) as (ctx, cpid):
 
-            _, proc, _ = an._children[ptl.chan.uid]
+            _, proc, _ = an._children[
+                ptl.chan.aid.uid
+            ]
             assert cpid == proc.pid
 
             print(

@@ -1,3 +1,5 @@
+import platform
+
 import tractor
 import trio
 

@@ -34,9 +36,27 @@ async def just_bp(
 
 async def main():
 
+    # !TODO, parametrize the --tpt-proto={key} with osenv vars just
+    # like we do for loglevel/spawn-backend!
+    # - [ ] run on both tpts for all such debugger tests?
+    # - [ ] special skip for macos!
+    #
+    if platform.system() != 'Darwin':
+        tpt = 'uds'
+    else:
+        # XXX, precisely we can't use pytest's tmp-path generation
+        # for tests.. apparently because:
+        #
+        # > The OSError: AF_UNIX path too long in macOS Python occurs
+        # > because the path to the Unix domain socket exceeds the
+        # > operating system's maximum path length limit (around 104
+        #
+        # WHICH IS just, wtf hillarious XD
+        tpt = 'tcp'
+
     async with tractor.open_nursery(
         debug_mode=True,
-        enable_transports=['uds'],
+        enable_transports=[tpt],
         loglevel='devx',
     ) as n:
         p = await n.start_actor(

@@ -9,7 +9,6 @@ async def name_error():
 async def main():
     async with tractor.open_nursery(
         debug_mode=True,
-        # loglevel='transport',
     ) as an:
 
         # TODO: ideally the REPL arrives at this frame in the parent,

@@ -1,9 +1,22 @@
 from functools import partial
+import os
 import time
 
+# ?TODO? how to make `pdbp` enforce this?
+# os.environ['PYTHON_COLORS'] = '0'
+# os.environ['NO_COLOR'] = '1'
+
 import trio
 import tractor
 
+# disable `pbdp` prompt colors
+# for prompt matching in test.
+def disable_pdbp_color():
+    if os.environ['PYTHON_COLORS'] == '0':
+        from tractor.devx.debug import _repl
+        _repl.TractorConfig.use_pygments = False
+
 
 # TODO: only import these when not running from test harness?
 # can we detect `pexpect` usage maybe?
 # from tractor.devx.debug import (

@@ -42,6 +55,7 @@ async def start_n_sync_pause(
     ctx: tractor.Context,
 ):
     actor: tractor.Actor = tractor.current_actor()
+    disable_pdbp_color()
 
     # sync to parent-side task
     await ctx.started()

@@ -52,13 +66,15 @@ async def start_n_sync_pause(
 
 
 async def main() -> None:
+    disable_pdbp_color()
     async with (
         tractor.open_nursery(
             debug_mode=True,
             maybe_enable_greenback=True,
-            enable_stack_on_sig=True,
             # loglevel='warning',
             # loglevel='devx',
+
+            # XXX flags required for test pattern matching.
+            loglevel='pdb',
+            # enable_stack_on_sig=True,
         ) as an,
         trio.open_nursery() as tn,
     ):

@@ -68,8 +84,8 @@ async def main() -> None:
         p: tractor.Portal = await an.start_actor(
             'subactor',
             enable_modules=[__name__],
-            # infect_asyncio=True,
             debug_mode=True,
+            # infect_asyncio=True,
         )
 
         # TODO: 3 sub-actor usage cases:

@@ -90,7 +90,7 @@ async def main() -> list[int]:
     # yes, a nursery which spawns `trio`-"actors" B)
     an: ActorNursery
     async with tractor.open_nursery(
-        loglevel='cancel',
+        loglevel='error',
         # debug_mode=True,
     ) as an:
 

@@ -118,8 +118,10 @@ async def main() -> list[int]:
     cancelled: bool = await portal.cancel_actor()
     assert cancelled
 
-    print(f"STREAM TIME = {time.time() - start}")
-    print(f"STREAM + SPAWN TIME = {time.time() - pre_start}")
+    print(
+        f"STREAM TIME = {time.time() - start}\n"
+        f"STREAM + SPAWN TIME = {time.time() - pre_start}\n"
+    )
     assert result_stream == list(range(seed))
     return result_stream
 

@@ -11,21 +11,17 @@ import tractor
 
 
 async def aio_echo_server(
-    to_trio: trio.MemorySendChannel,
-    from_trio: asyncio.Queue,
-
+    chan: tractor.to_asyncio.LinkedTaskChannel,
 ) -> None:
 
     # a first message must be sent **from** this ``asyncio``
     # task or the ``trio`` side will never unblock from
     # ``tractor.to_asyncio.open_channel_from():``
-    to_trio.send_nowait('start')
+    chan.started_nowait('start')
 
     # XXX: this uses an ``from_trio: asyncio.Queue`` currently but we
     # should probably offer something better.
     while True:
         # echo the msg back
-        to_trio.send_nowait(await from_trio.get())
+        chan.send_nowait(await chan.get())
         await asyncio.sleep(0)

@@ -37,7 +33,7 @@ async def trio_to_aio_echo_server(
         # message.
         async with tractor.to_asyncio.open_channel_from(
             aio_echo_server,
-        ) as (first, chan):
+        ) as (chan, first):
 
             assert first == 'start'
             await ctx.started(first)

@@ -0,0 +1,5 @@
import os


async def child_fn() -> str:
    return f"child OK pid={os.getpid()}"

@@ -0,0 +1,50 @@
"""
Integration test: spawning tractor actors from an MPI process.

When a parent is launched via ``mpirun``, Open MPI sets ``OMPI_*`` env
vars that bind ``MPI_Init`` to the ``orted`` daemon. Tractor children
inherit those env vars, so if ``inherit_parent_main=True`` (the default)
the child re-executes ``__main__``, re-imports ``mpi4py``, and
``MPI_Init_thread`` fails because the child was never spawned by
``orted``::

    getting local rank failed
    --> Returned value No permission (-17) instead of ORTE_SUCCESS

Passing ``inherit_parent_main=False`` and placing RPC functions in a
separate importable module (``_child``) avoids the re-import entirely.

Usage::

    mpirun --allow-run-as-root -np 1 python -m \
        examples.integration.mpi4py.inherit_parent_main

"""
from mpi4py import MPI

import os
import trio
import tractor

from ._child import child_fn


async def main() -> None:
    rank = MPI.COMM_WORLD.Get_rank()
    print(f"[parent] rank={rank} pid={os.getpid()}", flush=True)

    async with tractor.open_nursery(start_method='trio') as an:
        portal = await an.start_actor(
            'mpi-child',
            enable_modules=[child_fn.__module__],
            # Without this the child replays __main__, which
            # re-imports mpi4py and crashes on MPI_Init.
            inherit_parent_main=False,
        )
        result = await portal.run(child_fn)
        print(f"[parent] got: {result}", flush=True)
        await portal.cancel_actor()


if __name__ == "__main__":
    trio.run(main)

@@ -10,7 +10,7 @@ async def main(service_name):
         await an.start_actor(service_name)
 
         async with tractor.get_registry() as portal:
-            print(f"Arbiter is listening on {portal.channel}")
+            print(f"Registrar is listening on {portal.channel}")
 
         async with tractor.wait_for_actor(service_name) as sockaddr:
             print(f"my_service is found at {sockaddr}")

@@ -0,0 +1,27 @@
{
  "nodes": {
    "nixpkgs": {
      "locked": {
        "lastModified": 1769018530,
        "narHash": "sha256-MJ27Cy2NtBEV5tsK+YraYr2g851f3Fl1LpNHDzDX15c=",
        "owner": "nixos",
        "repo": "nixpkgs",
        "rev": "88d3861acdd3d2f0e361767018218e51810df8a1",
        "type": "github"
      },
      "original": {
        "owner": "nixos",
        "ref": "nixos-unstable",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "root": {
      "inputs": {
        "nixpkgs": "nixpkgs"
      }
    }
  },
  "root": "root",
  "version": 7
}

@@ -0,0 +1,70 @@
# An "impure" template thx to `pyproject.nix`,
# https://pyproject-nix.github.io/pyproject.nix/templates.html#impure
# https://github.com/pyproject-nix/pyproject.nix/blob/master/templates/impure/flake.nix
{
  description = "An impure overlay (w dev-shell) using `uv`";

  inputs = {
    nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable";
  };

  outputs =
    { nixpkgs, ... }:
    let
      inherit (nixpkgs) lib;
      forAllSystems = lib.genAttrs lib.systems.flakeExposed;
    in
    {
      devShells = forAllSystems (
        system:
        let
          pkgs = nixpkgs.legacyPackages.${system};

          # XXX NOTE XXX, for now we overlay specific pkgs via
          # a major-version-pinned-`cpython`
          cpython = "python313";
          venv_dir = "py313";
          pypkgs = pkgs."${cpython}Packages";
        in
        {
          default = pkgs.mkShell {

            packages = [
              # XXX, ensure sh completions activate!
              pkgs.bashInteractive
              pkgs.bash-completion

              # XXX, on nix(os), use pkgs version to avoid
              # build/sys-sh-integration issues
              pkgs.ruff

              pkgs.uv
              pkgs.${cpython}  # ?TODO^ how to set from `cpython` above?
            ];

            shellHook = ''
              # unmask to debug **this** dev-shell-hook
              # set -e

              # link-in c++ stdlib for various AOT-ext-pkgs (numpy, etc.)
              LD_LIBRARY_PATH="${pkgs.stdenv.cc.cc.lib}/lib:$LD_LIBRARY_PATH"

              export LD_LIBRARY_PATH

              # RUNTIME-SETTINGS
              # ------ uv ------
              # - always use the ./py313/ venv-subdir
              # - sync env with all extras
              export UV_PROJECT_ENVIRONMENT=${venv_dir}
              uv sync --dev --all-extras

              # ------ TIPS ------
              # NOTE, to launch the py-venv installed `xonsh` (like @goodboy)
              # run the `nix develop` cmd with,
              # >> nix develop -c uv run xonsh
            '';
          };
        }
      );
    };
}

pyproject.toml

@@ -9,7 +9,7 @@ name = "tractor"
 version = "0.1.0a6dev0"
 description = 'structured concurrent `trio`-"actors"'
 authors = [{ name = "Tyler Goodlet", email = "goodboy_foss@protonmail.com" }]
-requires-python = ">= 3.11"
+requires-python = ">=3.13, <3.15"
 readme = "docs/README.rst"
 license = "AGPL-3.0-or-later"
 keywords = [

@@ -24,11 +24,14 @@ keywords = [
 classifiers = [
     "Development Status :: 3 - Alpha",
     "Operating System :: POSIX :: Linux",
+    "Operating System :: MacOS",
     "Framework :: Trio",
     "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)",
     "Programming Language :: Python :: Implementation :: CPython",
     "Programming Language :: Python :: 3 :: Only",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
     "Programming Language :: Python :: 3.13",
+    "Programming Language :: Python :: 3.14",
     "Topic :: System :: Distributed Computing",
 ]
 dependencies = [

@@ -42,48 +45,109 @@ dependencies = [
   "wrapt>=1.16.0,<2",
   "colorlog>=6.8.2,<7",
   # built-in multi-actor `pdb` REPL
-  "pdbp>=1.6,<2", # windows only (from `pdbp`)
+  "pdbp>=1.8.2,<2", # windows only (from `pdbp`)
   # typed IPC msging
-  "msgspec>=0.19.0",
-  "cffi>=1.17.1",
+  "msgspec>=0.20.0",
   "bidict>=0.23.1",
+  "multiaddr>=0.2.0",
+  "platformdirs>=4.4.0",
+  # per-actor `argv[0]` proc-title for OS-level diag tools
+  # (`ps`, `top`, `psutil`-backed tooling like `acli.pytree`).
+  # Optional at runtime — guarded by `try/except ImportError` in
+  # `tractor.devx._proctitle` — but listed here so default
+  # installs benefit from it. See tracking issue for follow-ups
+  # (e.g. richer formats, per-backend overrides).
+  "setproctitle>=1.3,<2",
 ]
 
 # ------ project ------
 
 [dependency-groups]
 dev = [
     {include-group = 'devx'},
     {include-group = 'testing'},
     {include-group = 'repl'},
     {include-group = 'sync_pause'},
 ]
 devx = [
     # `tractor.devx` tooling
     "stackscope>=0.2.2,<0.3",
     # ^ requires this?
     "typing-extensions>=4.14.1",
     # {include-group = 'sync_pause'},  # XXX, no 3.14 yet!
 ]
 sync_pause = [
     "greenback>=1.2.1,<2",  # TODO? 3.14 greenlet on nix?
 ]
 testing = [
     # test suite
     # TODO: maybe some of these layout choices?
     # https://docs.pytest.org/en/8.0.x/explanation/goodpractices.html#choosing-a-test-layout-import-rules
     "pytest>=8.3.5",
     "pexpect>=4.9.0,<5",
     # `tractor.devx` tooling
     "greenback>=1.2.1,<2",
     "stackscope>=0.2.2,<0.3",
     # ^ requires this?
     "typing-extensions>=4.14.1",
 
+    # per-test wall-clock bound (used via
+    # `@pytest.mark.timeout(..., method='thread')` on the
+    # known-hanging `subint`-backend audit tests; see
+    # `ai/conc-anal/subint_*_issue.md`).
+    "pytest-timeout>=2.3",
+    # used by `tractor._testing._reap` for the
+    # `tractor-reap` zombie-subactor + leaked-shm
+    # cleanup utility (xplatform `Process.memory_maps`,
+    # `Process.open_files`).
+    "psutil>=7.0.0",
 ]
 repl = [
     "pyperclip>=1.9.0",
     "prompt-toolkit>=3.0.50",
-    "xonsh>=0.19.2",
+    "xonsh>=0.23.0",
     "psutil>=7.0.0",
 ]
 lint = [
     "ruff>=0.9.6"
 ]
+# XXX, used for linux-only hi perf eventfd+shm channels
+# now mostly moved over to `hotbaud`.
+eventfd = [
+    "cffi>=1.17.1",
+]
+subints = [
+    "msgspec>=0.21.0",
+]
 # TODO, add these with sane versions; were originally in
 # `requirements-docs.txt`..
 # docs = [
 #   "sphinx>="
 #   "sphinx_book_theme>="
 # ]
 
 # ------ dependency-groups ------
 
+# ------ dependency-groups ------
+[tool.uv.dependency-groups]
+# for subints, we require 3.14+ due to 2 issues,
+# - hanging behaviour for various multi-task teardown cases (see
+#   "Availability" section in the `tractor.spawn._subints` doc string).
+# - `msgspec` support which is oustanding per PEP 684 upstream tracker:
+#   https://github.com/jcrist/msgspec/issues/563
+#
+# https://docs.astral.sh/uv/concepts/projects/dependencies/#group-requires-python
+subints = {requires-python = ">=3.14"}
+eventfd = {requires-python = ">=3.13, <3.14"}
+sync_pause = {requires-python = ">=3.13, <3.14"}
 
 [tool.uv.sources]
 # XXX NOTE, only for @goodboy's hacking on `pprint(sort_dicts=False)`
 # for the `pp` alias..
 # pdbp = { path = "../pdbp", editable = true }
 # ------ gh upstream ------
 # xonsh = { git = 'https://github.com/anki-code/xonsh.git', branch = 'prompt_next_suggestion' }
 # ^ https://github.com/xonsh/xonsh/pull/6048
 # xonsh = { git = 'https://github.com/xonsh/xonsh.git', branch = 'main' }
 # xonsh = { path = "../xonsh", editable = true }
 
 # [tool.uv.sources.pdbp]
 # XXX, in case we need to tmp patch again.
 # git = "https://github.com/goodboy/pdbp.git"
 # branch ="repair_stack_trace_frame_indexing"
 # path = "../pdbp"
 # editable = true
 
 # ------ tool.uv.sources ------
 # TODO, distributed (multi-host) extensions

@@ -145,20 +209,69 @@ all_bullets = true
 
 [tool.pytest.ini_options]
 minversion = '6.0'
+# NOTE: `pytest-timeout`'s global per-test cap is intentionally
+# NOT set — both of its enforcement methods break trio's
+# runtime under our fork-based spawn backends:
+#
+# - `method='signal'` (the default; SIGALRM) raises `Failed`
+#   synchronously from the signal handler in trio's main
+#   thread, which leaves `GLOBAL_RUN_CONTEXT` half-installed
+#   ("Trio guest run got abandoned"). EVERY subsequent
+#   `trio.run()` in the same pytest session then bails with
+#   `RuntimeError: Attempted to call run() from inside a
+#   run()` — full-session poison: a single 200s hang
+#   cascades into 30+ false-positive failures across
+#   downstream test files.
+#
+# - `method='thread'` calls `_thread.interrupt_main()` which
+#   can let the resulting `KeyboardInterrupt` escape trio's
+#   `KIManager` under fork-cascade teardown races, killing
+#   the whole pytest session.
+#
+# For tests that legitimately need a wall-clock cap, use
+# `with trio.fail_after(N):` INSIDE the test — trio's own
+# Cancelled machinery handles the timeout cleanly through
+# the actor nursery without disturbing global state. See
+# `tests/test_advanced_streaming.py::test_dynamic_pub_sub`'s
+# module-level NOTE for the canonical pattern.
+#
+# CI environments should rely on job-level wall-clock
+# timeouts (e.g. GitHub Actions `timeout-minutes`) for an
+# escape hatch on genuinely-stuck suites.
 # https://docs.pytest.org/en/stable/reference/reference.html#configuration-options
 testpaths = [
     'tests'
 ]
 addopts = [
     # TODO: figure out why this isn't working..
     '--rootdir=./tests',
 
     '--import-mode=importlib',
     # don't show frickin captured logs AGAIN in the report..
     '--show-capture=no',
 
+    # load builtin plugin since we need a boostrapping hook,
+    # `pytest_load_initial_conftests()` for `--capture=` per:
+    # https://docs.pytest.org/en/stable/reference/reference.html#bootstrapping-hooks
+    '-p tractor._testing.pytest',
 
     # disable `xonsh` plugin
     # https://docs.pytest.org/en/stable/how-to/plugins.html#disabling-plugins-from-autoloading
     # https://docs.pytest.org/en/stable/how-to/plugins.html#deactivating-unregistering-a-plugin-by-name
     '-p no:xonsh',
 
+    # XXX default on non-forking spawners
+    '--capture=fd',
+    # '--capture=sys',
+    # ^XXX NOTE^ ALWAYS SET THIS for `*_forkserver` spawner
+    # backends! see details @
+    # `tractor._testing.pytest.pytest_load_initial_conftests()`
 
 ]
 log_cli = false
 # TODO: maybe some of these layout choices?
 # https://docs.pytest.org/en/8.0.x/explanation/goodpractices.html#choosing-a-test-layout-import-rules
 # pythonpath = "src"
 
 # https://docs.pytest.org/en/stable/reference/reference.html#confval-console_output_style
 console_output_style = 'progress'
 # ------ tool.pytest ------

@@ -1,8 +0,0 @@
-# vim: ft=ini
-# pytest.ini for tractor
-
-[pytest]
-# don't show frickin captured logs AGAIN in the report..
-addopts = --show-capture='no'
-log_cli = false
-; minversion = 6.0

@@ -35,8 +35,8 @@ exclude = [
 line-length = 88
 indent-width = 4
 
-# Assume Python 3.9
-target-version = "py311"
+# assume latest minor cpython
+target-version = "py313"
 
 [lint]
 # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default.

@@ -0,0 +1,237 @@
#!/usr/bin/env python3
# tractor: structured concurrent "actors".
# Copyright 2018-eternity Tyler Goodlet.
#
# SPDX-License-Identifier: AGPL-3.0-or-later
'''
`tractor-reap` — SC-polite zombie-subactor reaper +
optional `/dev/shm/` orphan-segment sweep.

Three cleanup phases (run in order when enabled):

1. **process reap** — finds `tractor` subactor processes
   left alive after a `pytest` (or any tractor-app) run
   that failed to fully cancel its actor tree, then sends
   SIGINT with a bounded grace window before escalating
   to SIGKILL.

2. **shm sweep** (`--shm` / `--shm-only`) — unlinks
   `/dev/shm/<file>` entries owned by the current uid
   that no live process has open (mmap'd or fd-held).
   Needed because `tractor` disables
   `mp.resource_tracker` (see `tractor.ipc._mp_bs`), so a
   hard-crashing actor leaves leaked segments that
   nothing else GCs.

3. **UDS sweep** (`--uds` / `--uds-only`) — unlinks
   `${XDG_RUNTIME_DIR}/tractor/<name>@<pid>.sock` files
   whose binder pid is dead (or the `1616` registry
   sentinel). Needed because the IPC server's
   `os.unlink()` cleanup lives in a `finally:` block
   that doesn't always run on hard exits (SIGKILL,
   escaped `KeyboardInterrupt`, etc.) — see issue #452.

Process-reap detection modes (auto-selected):

  --parent <pid> : descendant-mode — kill procs whose
                   PPid == <pid>. Use when a parent
                   is still alive and you want to
                   scope the sweep precisely (e.g.
                   CI wrapper calling in from outside
                   pytest).

  (default)      : orphan-mode — kill procs with
                   PPid==1 (init-reparented) whose
                   cwd matches the repo root AND
                   whose cmdline contains `python`.
                   The cwd filter is what prevents
                   sweeping unrelated init-children.

Usage:

    # process reap only (default)
    scripts/tractor-reap

    # process reap + shm sweep
    scripts/tractor-reap --shm

    # only the shm sweep, skip process reap
    scripts/tractor-reap --shm-only

    # process reap + shm + UDS sweep (the works)
    scripts/tractor-reap --shm --uds

    # only UDS sweep
    scripts/tractor-reap --uds-only

    # from inside a still-live supervisor
    scripts/tractor-reap --parent 12345

    # dry-run: list what would be reaped, don't act
    scripts/tractor-reap -n
    scripts/tractor-reap --shm --uds -n

'''
import argparse
import pathlib
import subprocess
import sys


def _repo_root() -> pathlib.Path:
    '''
    Use `git rev-parse --show-toplevel` when available;
    fall back to the repo this script lives in.

    '''
    try:
        out: str = subprocess.check_output(
            ['git', 'rev-parse', '--show-toplevel'],
            stderr=subprocess.DEVNULL,
            text=True,
        ).strip()
        return pathlib.Path(out)
    except (subprocess.CalledProcessError, FileNotFoundError):
        return pathlib.Path(__file__).resolve().parent.parent


def main() -> int:
    parser = argparse.ArgumentParser(
        prog='tractor-reap',
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        '--parent', '-p',
        type=int,
        default=None,
        help='descendant-mode: reap procs with PPid==<pid>',
    )
    parser.add_argument(
        '--grace', '-g',
        type=float,
        default=3.0,
        help='SIGINT grace window in seconds (default 3.0)',
    )
    parser.add_argument(
        '--dry-run', '-n',
        action='store_true',
        help='list matched pids/paths but do not signal/unlink',
    )
    parser.add_argument(
        '--shm',
        action='store_true',
        help=(
            'after process reap, also unlink orphaned '
            '/dev/shm segments owned by the current user '
            'that no live process is mapping or holding open'
        ),
    )
    parser.add_argument(
        '--shm-only',
        action='store_true',
        help='skip process reap; only do the shm sweep',
    )
    parser.add_argument(
        '--uds',
        action='store_true',
        help=(
            'after process reap, also unlink orphaned '
            '${XDG_RUNTIME_DIR}/tractor/*.sock files '
            'whose binder pid is dead (or the 1616 '
            'registry sentinel). See issue #452.'
        ),
    )
    parser.add_argument(
        '--uds-only',
        action='store_true',
        help='skip process reap + shm; only do the UDS sweep',
    )
    args = parser.parse_args()
    # any *-only flag also skips the process reap phase
    skip_proc_reap: bool = (
        args.shm_only
        or
        args.uds_only
    )

    # import lazily so `--help` doesn't require the tractor
    # package to be importable (e.g. when running from a
    # shell not inside a venv).
    repo = _repo_root()
    sys.path.insert(0, str(repo))
    from tractor._testing._reap import (
        find_descendants,
        find_orphans,
        find_orphaned_shm,
        find_orphaned_uds,
        reap,
        reap_shm,
        reap_uds,
    )

    rc: int = 0

    # --- phase 1: process reap (skipped under --*-only) ---
    if not skip_proc_reap:
        if args.parent is not None:
            pids: list[int] = find_descendants(args.parent)
            mode: str = f'descendants of PPid={args.parent}'
        else:
            pids = find_orphans(repo)
            mode = f'orphans (PPid=1, cwd={repo})'

        if not pids:
            print(f'[tractor-reap] no {mode} to reap')
        elif args.dry_run:
            print(
                f'[tractor-reap] dry-run — {mode}:\n  {pids}'
            )
        else:
            _, survivors = reap(pids, grace=args.grace)
            if survivors:
                rc = 1

    # --- phase 2: shm sweep (opt-in) ---
    if args.shm or args.shm_only:
        leaked: list[str] = find_orphaned_shm()
        if not leaked:
            print(
                '[tractor-reap] no orphaned /dev/shm '
                'segments to sweep'
            )
        elif args.dry_run:
            print(
                f'[tractor-reap] dry-run — {len(leaked)} '
                f'orphaned shm segment(s):\n  {leaked}'
            )
        else:
            _, errors = reap_shm(leaked)
            if errors:
                rc = 1

    # --- phase 3: UDS sweep (opt-in) ---
    if args.uds or args.uds_only:
        leaked_uds: list[str] = find_orphaned_uds()
        if not leaked_uds:
            print(
                '[tractor-reap] no orphaned UDS sock-files '
                'to sweep'
            )
        elif args.dry_run:
            print(
                f'[tractor-reap] dry-run — {len(leaked_uds)} '
                f'orphaned UDS sock-file(s):\n  {leaked_uds}'
            )
        else:
            _, errors = reap_uds(leaked_uds)
            if errors:
                rc = 1

    # exit 0 if everything cleaned cleanly, else 1 — useful
    # for CI health-check chaining.
    return rc


if __name__ == '__main__':
    raise SystemExit(main())

@@ -9,8 +9,11 @@ import os
 import signal
 import platform
 import time
+from pathlib import Path
+from typing import Literal
 
 import pytest
+import tractor
 from tractor._testing import (
     examples_dir as examples_dir,
     tractor_test as tractor_test,

@@ -19,31 +22,102 @@ from tractor._testing import (
 
 pytest_plugins: list[str] = [
     'pytester',
-    'tractor._testing.pytest',
+    # NOTE, now loaded in `pytest-ini` section of `pyproject.toml`
+    # 'tractor._testing.pytest',
 ]
 
+_ci_env: bool = os.environ.get('CI', False)
+_non_linux: bool = platform.system() != 'Linux'
+
 # Sending signal.SIGINT on subprocess fails on windows. Use CTRL_* alternatives
 if platform.system() == 'Windows':
     _KILL_SIGNAL = signal.CTRL_BREAK_EVENT
     _INT_SIGNAL = signal.CTRL_C_EVENT
     _INT_RETURN_CODE = 3221225786
     _PROC_SPAWN_WAIT = 2
 else:
     _KILL_SIGNAL = signal.SIGKILL
     _INT_SIGNAL = signal.SIGINT
     _INT_RETURN_CODE = 1 if sys.version_info < (3, 8) else -signal.SIGINT.value
     _PROC_SPAWN_WAIT = (
         0.6
         if sys.version_info < (3, 7)
         else 0.4
     )
 
 
 no_windows = pytest.mark.skipif(
     platform.system() == "Windows",
     reason="Test is unsupported on windows",
 )
+no_macos = pytest.mark.skipif(
+    platform.system() == "Darwin",
+    reason="Test is unsupported on MacOS",
+)
+
+
+def get_cpu_state(
+    icpu: int = 0,
+    setting: Literal[
+        'scaling_governor',
+        '*_pstate_max_freq',
+        'scaling_max_freq',
+        # 'scaling_cur_freq',
+    ] = '*_pstate_max_freq',
+) -> tuple[
+    Path,
+    str|int,
+]|None:
+    '''
+    Attempt to read the (first) CPU's setting according
+    to the set `setting` from under the file-sys,
+
+      /sys/devices/system/cpu/cpu0/cpufreq/{setting}
+
+    Useful to determine latency headroom for various perf affected
+    test suites.
+
+    '''
+    try:
+        # Read governor for core 0 (usually same for all)
+        setting_path: Path = list(
+            Path(f'/sys/devices/system/cpu/cpu{icpu}/cpufreq/')
+            .glob(f'{setting}')
+        )[0]  # <- XXX must be single match!
+        with open(
+            setting_path,
+            'r',
+        ) as f:
+            return (
+                setting_path,
+                f.read().strip(),
+            )
+    except (FileNotFoundError, IndexError):
+        return None
+
+
+def cpu_scaling_factor() -> float:
+    '''
+    Return a latency-headroom multiplier (>= 1.0) reflecting how
+    much to inflate time-limits when CPU-freq scaling is active on
+    linux.
+
+    When no scaling info is available (non-linux, missing sysfs),
+    returns 1.0 (i.e. no headroom adjustment needed).
+
+    '''
+    if _non_linux:
+        return 1.
+
+    mx = get_cpu_state()
+    cur = get_cpu_state(setting='scaling_max_freq')
+    if mx is None or cur is None:
+        return 1.
+
+    _mx_pth, max_freq = mx
+    _cur_pth, cur_freq = cur
+    cpu_scaled: float = int(cur_freq) / int(max_freq)
+
+    if cpu_scaled != 1.:
+        return 1. / (
+            cpu_scaled * 2  # <- bc likely "dual threaded"
+        )
+
+    return 1.
 
 
 def pytest_addoption(

@@ -56,21 +130,64 @@ def pytest_addoption(
     "--ll",
     action="store",
     dest='loglevel',
-    default='ERROR', help="logging level to set when testing"
+    default=None,
+    help="logging level to set when testing",
 )
 
 
 @pytest.fixture(scope='session', autouse=True)
-def loglevel(request):
+def loglevel(
+    request: pytest.FixtureRequest,
+) -> str|None:
     import tractor
     orig = tractor.log._default_loglevel
-    level = tractor.log._default_loglevel = request.config.option.loglevel
-    tractor.log.get_console_log(level)
-    yield level
+    flag_level: str|None = request.config.option.loglevel
+
+    if flag_level is not None:
+        tractor.log._default_loglevel = flag_level
+
+    log = tractor.log.get_console_log(
+        level=flag_level,
+        name='tractor',  # <- enable root logger
+    )
+    log.info(
+        f'Test-harness set runtime loglevel: {flag_level!r}\n'
+    )
+    yield flag_level
     tractor.log._default_loglevel = orig
 
 
-_ci_env: bool = os.environ.get('CI', False)
+@pytest.fixture(scope='function')
+def test_log(
+    request: pytest.FixtureRequest,
+    loglevel: str,
+) -> tractor.log.StackLevelAdapter:
+    '''
+    Deliver a per test-module-fn logger instance for reporting from
+    within actual test bodies/fixtures.
+
+    For example this can be handy to report certain error cases from
+    exception handlers using `test_log.exception()`.
+
+    '''
+    modname: str = request.function.__module__
+    log = tractor.log.get_logger(
+        name=modname,  # <- enable root logger
+        # pkg_name='tests',
+    )
+    _log = tractor.log.get_console_log(
+        level=loglevel,
+        logger=log,
+        name=modname,
+        # pkg_name='tests',
+    )
+    _log.debug(
+        f'In-test-logging requested\n'
+        f'test_log.name: {log.name!r}\n'
+        f'level: {loglevel!r}\n'
+
+    )
+    yield _log
 
 
 @pytest.fixture(scope='session')

@@ -85,92 +202,51 @@ def ci_env() -> bool:
 def sig_prog(
     proc: subprocess.Popen,
     sig: int,
-    canc_timeout: float = 0.1,
+    canc_timeout: float = 0.2,
+    tries: int = 3,
 ) -> int:
-    "Kill the actor-process with ``sig``."
-    proc.send_signal(sig)
-    time.sleep(canc_timeout)
-    if not proc.poll():
+    '''
+    Kill the actor-process with `sig`.
+
+    Prefer to kill with the provided signal and
+    failing a `canc_timeout`, send a `SIKILL`-like
+    to ensure termination.
+
+    '''
+    for i in range(tries):
+        proc.send_signal(sig)
+        if proc.poll() is None:
+            print(
+                f'WARNING, proc still alive after,\n'
+                f'canc_timeout={canc_timeout!r}\n'
+                f'sig={sig!r}\n'
+                f'\n'
+                f'{proc.args!r}\n'
+            )
+            time.sleep(canc_timeout)
+    else:
         # TODO: why sometimes does SIGINT not work on teardown?
         # seems to happen only when trace logging enabled?
-        proc.send_signal(_KILL_SIGNAL)
+        if proc.poll() is None:
+            print(
+                f'XXX WARNING KILLING PROG WITH SIGINT XXX\n'
+                f'canc_timeout={canc_timeout!r}\n'
+                f'{proc.args!r}\n'
+            )
+            proc.send_signal(_KILL_SIGNAL)
 
     ret: int = proc.wait()
     assert ret
 
 
-# TODO: factor into @cm and move to `._testing`?
-@pytest.fixture
-def daemon(
-    debug_mode: bool,
-    loglevel: str,
-    testdir: pytest.Pytester,
-    reg_addr: tuple[str, int],
-    tpt_proto: str,
-
-) -> subprocess.Popen:
-    '''
-    Run a daemon root actor as a separate actor-process tree and
-    "remote registrar" for discovery-protocol related tests.
-
-    '''
-    if loglevel in ('trace', 'debug'):
-        # XXX: too much logging will lock up the subproc (smh)
-        loglevel: str = 'info'
-
-    code: str = (
-        "import tractor; "
-        "tractor.run_daemon([], "
-        "registry_addrs={reg_addrs}, "
-        "debug_mode={debug_mode}, "
-        "loglevel={ll})"
-    ).format(
-        reg_addrs=str([reg_addr]),
-        ll="'{}'".format(loglevel) if loglevel else None,
-        debug_mode=debug_mode,
-    )
-    cmd: list[str] = [
-        sys.executable,
-        '-c', code,
-    ]
-    # breakpoint()
-    kwargs = {}
-    if platform.system() == 'Windows':
-        # without this, tests hang on windows forever
-        kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
-
-    proc: subprocess.Popen = testdir.popen(
-        cmd,
-        **kwargs,
-    )
-
-    # UDS sockets are **really** fast to bind()/listen()/connect()
-    # so it's often required that we delay a bit more starting
-    # the first actor-tree..
-    if tpt_proto == 'uds':
-        global _PROC_SPAWN_WAIT
-        _PROC_SPAWN_WAIT = 0.6
-
-    time.sleep(_PROC_SPAWN_WAIT)
-
-    assert not proc.returncode
-    yield proc
-    sig_prog(proc, _INT_SIGNAL)
-
-    # XXX! yeah.. just be reaaal careful with this bc sometimes it
-    # can lock up on the `_io.BufferedReader` and hang..
-    stderr: str = proc.stderr.read().decode()
-    if stderr:
-        print(
-            f'Daemon actor tree produced STDERR:\n'
-            f'{proc.args}\n'
-            f'\n'
-            f'{stderr}\n'
-        )
-    if proc.returncode != -2:
-        raise RuntimeError(
-            'Daemon actor tree failed !?\n'
-            f'{proc.args}\n'
-        )
+# NOTE, the `daemon` fixture (+ its `_wait_for_daemon_ready`
+# helper + the post-yield teardown drain logic) has been
+# moved to `tests/discovery/conftest.py` since 100% of its
+# consumers are discovery-protocol tests now living under
+# that subdir. See:
+# - `tests/discovery/test_multi_program.py`
+# - `tests/discovery/test_registrar.py`
+# - `tests/discovery/test_tpt_bind_addrs.py`
 
 
 # @pytest.fixture(autouse=True)
@@ -3,6 +3,9 @@

'''
from __future__ import annotations
import platform
import os
import signal
import time
from typing import (
    Callable,

@@ -32,14 +35,29 @@ if TYPE_CHECKING:
    from pexpect import pty_spawn


_non_linux: bool = platform.system() != 'Linux'


def pytest_configure(config):
    # register custom marks to avoid warnings see,
    # https://docs.pytest.org/en/stable/how-to/writing_plugins.html#registering-custom-markers
    config.addinivalue_line(
        'markers',
        'ctlcs_bish: test will (likely) not behave under SIGINT..'
    )

# a fn that sub-instantiates a `pexpect.spawn()`
# and returns it.
type PexpectSpawner = Callable[[str], pty_spawn.spawn]
type PexpectSpawner = Callable[
    [str],
    pty_spawn.spawn,
]


@pytest.fixture
def spawn(
    start_method: str,
    loglevel: str,
    testdir: pytest.Pytester,
    reg_addr: tuple[str, int],

@@ -49,9 +67,19 @@ def spawn(
    run an `./examples/..` script by name.

    '''
    if start_method != 'trio':
    supported_spawners: set[str] = {
        'trio',
        # `examples/debugging/<script>.py` picks up the spawn
        # backend via the `TRACTOR_SPAWN_METHOD` env-var which
        # is honored inside `tractor._root.open_root_actor()`,
        # so no per-script edits are required.
        'main_thread_forkserver',
        'subint_forkserver',
    }
    if start_method not in supported_spawners:
        pytest.skip(
            '`pexpect` based tests only supported on `trio` backend'
            f'`pexpect` based tests NOT supported on spawning-backend: {start_method!r}\n'
            f'supported-spawners: {supported_spawners!r}'
        )

    def unset_colors():
@@ -63,27 +91,117 @@ def spawn(
        https://docs.python.org/3/using/cmdline.html#using-on-controlling-color

        '''
        import os
        # disable colored tbs
        os.environ['PYTHON_COLORS'] = '0'
        # disable all ANSI color output
        # os.environ['NO_COLOR'] = '1'
        # ?TODO, doesn't seem to disable prompt color
        # for `pdbp`?

    def set_spawn_method(
        start_method: str,
    ):
        '''
        Drive the actor-spawn backend inside the spawned
        `examples/debugging/<script>.py` subproc via env-var
        (consumed by `tractor._root.open_root_actor()`),
        without requiring per-script CLI plumbing.

        '''
        os.environ['TRACTOR_SPAWN_METHOD'] = start_method

    def set_loglevel(
        loglevel: str|None,
    ):
        '''
        Forward the test-suite parametrized `loglevel` into the
        spawned `examples/debugging/<script>.py` subproc via
        env-var (consumed by `tractor._root.open_root_actor()`),
        so console verbosity can be cranked or silenced from
        the test harness without per-script edits.

        '''
        if loglevel:
            os.environ['TRACTOR_LOGLEVEL'] = loglevel
        else:
            os.environ.pop('TRACTOR_LOGLEVEL', None)

    spawned: PexpectSpawner|None = None

    def _spawn(
        cmd: str,
        expect_timeout: float = 4,
        start_method: str = start_method,
        loglevel: str|None = None,
        **mkcmd_kwargs,
    ) -> pty_spawn.spawn:
        '''
        Inner closure handed to consumer tests to invoke
        `pytest.Pytester.spawn`

        '''
        nonlocal spawned
        unset_colors()
        return testdir.spawn(
        set_spawn_method(start_method=start_method)
        set_loglevel(
            loglevel=loglevel,
            # ?TODO^ when should this be set by `--ll <level>` ?
            # by default we apply 'error' but there should be a diff
            # vs. when the flag IS NOT passed?
        )
        spawned = testdir.spawn(
            cmd=mk_cmd(
                cmd,
                **mkcmd_kwargs,
            ),
            expect_timeout=3,
            expect_timeout=(timeout:=(
                expect_timeout + 6
                if _non_linux and _ci_env
                else expect_timeout
            )),
            # preexec_fn=unset_colors,
            # ^TODO? get `pytest` core to expose underlying
            # `pexpect.spawn()` stuff?
        )
        # sanity
        assert spawned.timeout == timeout
        return spawned

    # such that test-dep can pass input script name.
    return _spawn  # the `PexpectSpawner`, type alias.
    yield _spawn  # the `PexpectSpawner`, type alias.

    if (
        spawned
        and
        (ptyproc := spawned.ptyproc)
    ):
        start: float = time.time()
        timeout: float = 5
        while (
            ptyproc.isalive()
            and
            (
                (_time_took := (time.time() - start))
                <
                timeout
            )
        ):
            ptyproc.kill(signal.SIGINT)
            time.sleep(0.01)

        if ptyproc.isalive():
            ptyproc.kill(signal.SIGKILL)

    # Scope our env-var mutations to this single fixture invocation
    # — both `TRACTOR_SPAWN_METHOD` and `TRACTOR_LOGLEVEL` are
    # honored by `tractor._root.open_root_actor()` so leaking them
    # past this test could inadvertently re-route a later in-process
    # tractor test's spawn-backend / loglevel.
    os.environ.pop('TRACTOR_SPAWN_METHOD', None)
    os.environ.pop('TRACTOR_LOGLEVEL', None)

    # TODO? ensure we've cleaned up any UDS-paths?
    # breakpoint()

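# (Editor's note: a hedged sketch, NOT the actual `tractor._root`
# source, of roughly how `open_root_actor()` presumably consumes the
# two env-vars set by the helpers above; the helper name and the
# override keys here are illustrative assumptions.)
import os

def _env_overrides() -> dict:
    overrides: dict = {}
    # spawn-backend override, set per-test by `set_spawn_method()`
    if (method := os.environ.get('TRACTOR_SPAWN_METHOD')):
        overrides['start_method'] = method
    # console-verbosity override, set per-test by `set_loglevel()`
    if (ll := os.environ.get('TRACTOR_LOGLEVEL')):
        overrides['loglevel'] = ll
    return overrides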
@pytest.fixture(
@@ -91,25 +209,47 @@ def spawn(
    ids='ctl-c={}'.format,
)
def ctlc(
    request,
    request: pytest.FixtureRequest,
    ci_env: bool,

    start_method: str,
) -> bool:
    '''
    Parametrize and optionally skip tests which handle
    ctlc-in-`pdbp`-REPL testing scenarios; certain spawners and
    actor-tree depths cope very poorly with this..

    use_ctlc = request.param
    In particular the spawning backends from `multiprocessing` are
    fragile, as can be the default `trio` spawner under certain
    conditions where SIGINT is relayed down the entire subproc tree.

    '''
    use_ctlc: bool = request.param
    node = request.node
    markers = node.own_markers
    for mark in markers:
        if mark.name == 'has_nested_actors':
        if (
            mark.name == 'has_nested_actors'
            and
            start_method not in {
                # TODO, any spawners we should try again?
                # - [ ] 'trio' but WITHOUT the SIGINT handler setup
                #   per subproc?
                # 'main_thread_forkserver',
            }
        ):
            pytest.skip(
                f'Test {node} has nested actors and fails with Ctrl-C.\n'
                f'The test can sometimes run fine locally but until '
                'we solve this issue this CI test will be xfail:\n'
                'https://github.com/goodboy/tractor/issues/320'
            )

        if mark.name == 'ctlcs_bish':
        if (
            mark.name == 'ctlcs_bish'
            and
            use_ctlc
            and
            all(mark.args)
        ):
            pytest.skip(
                f'Test {node} prolly uses something from the stdlib (namely `asyncio`..)\n'
                f'The test and/or underlying example script can *sometimes* run fine '
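# (Editor's note: a sketch of how the `ctlcs_bish` gating above
# composes, grounded in the marked tests later in this diff: the skip
# only fires when the test is parametrized with `ctl-c=True` AND every
# positional mark-arg is truthy, e.g.
#
#   @pytest.mark.ctlcs_bish(
#       _non_linux,
#       _ci_env,
#   )
#   def test_something(ctlc: bool):
#       ...
#
# which skips only on non-Linux CI runs.)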
@@ -129,13 +269,10 @@ def ctlc(

def expect(
    child,

    # normally a `pdb` prompt by default
    patt: str,

    patt: str,  # often a `pdbp`-prompt
    **kwargs,

) -> None:
) -> str:
    '''
    Expect wrapper that prints last seen console
    data before failing.

@@ -146,6 +283,8 @@ def expect(
            patt,
            **kwargs,
        )
        before = str(child.before.decode())
        return before
    except TIMEOUT:
        before = str(child.before.decode())
        print(before)

@@ -200,10 +339,13 @@ def in_prompt_msg(
def assert_before(
    child: SpawnBase,
    patts: list[str],

    **kwargs,
) -> str:
    '''
    Assert a pattern is in `child.before.decode() -> str`,
    return the full `.before` output on success.

) -> None:
    '''
    __tracebackhide__: bool = False

    assert in_prompt_msg(

@@ -214,12 +356,14 @@ def assert_before(
        err_on_false=True,
        **kwargs
    )
    before: str = str(child.before.decode())
    return before


def do_ctlc(
    child,
    count: int = 3,
    delay: float = 0.1,
    delay: float|None = None,
    patt: str|None = None,

    # expect repl UX to reprint the prompt after every

@@ -231,6 +375,7 @@ def do_ctlc(
) -> str|None:

    before: str|None = None
    delay = delay or 0.1

    # make sure ctl-c sends don't do anything but repeat output
    for _ in range(count):

@@ -241,7 +386,10 @@ def do_ctlc(
        # if you run this test manually it works just fine..
        if expect_prompt:
            time.sleep(delay)
            child.expect(PROMPT)
            child.expect(
                PROMPT,
                timeout=(child.timeout * 2) if _ci_env else child.timeout,
            )
            before = str(child.before.decode())
            time.sleep(delay)
@@ -24,6 +24,7 @@ from pexpect.exceptions import (
    TIMEOUT,
    EOF,
)
import tractor

from .conftest import (
    do_ctlc,

@@ -37,6 +38,9 @@ from .conftest import (
    in_prompt_msg,
    assert_before,
)
from ..conftest import (
    _ci_env,
)

if TYPE_CHECKING:
    from ..conftest import PexpectSpawner

@@ -51,13 +55,14 @@ if TYPE_CHECKING:
# - recurrent root errors


_non_linux: bool = platform.system() != 'Linux'

if platform.system() == 'Windows':
    pytest.skip(
        'Debugger tests have no windows support (yet)',
        allow_module_level=True,
    )


# TODO: was trying to use this xfail style but some weird bug i see in CI
# that's happening at collect time.. pretty soon gonna dump actions i'm
# thinkin...

@@ -193,6 +198,11 @@ def test_root_actor_bp_forever(
    child.expect(EOF)


# skip on non-Linux CI
@pytest.mark.ctlcs_bish(
    _non_linux,
    _ci_env,
)
@pytest.mark.parametrize(
    'do_next',
    (True, False),

@@ -258,6 +268,11 @@ def test_subactor_error(
    child.expect(EOF)


# skip on non-Linux CI
@pytest.mark.ctlcs_bish(
    _non_linux,
    _ci_env,
)
def test_subactor_breakpoint(
    spawn,
    ctlc: bool,

@@ -329,6 +344,7 @@ def test_subactor_breakpoint(
def test_multi_subactors(
    spawn,
    ctlc: bool,
    set_fork_aware_capture,
):
    '''
    Multiple subactors, both erroring and
@@ -473,15 +489,32 @@ def test_multi_daemon_subactors(
def test_multi_daemon_subactors(
    spawn,
    loglevel: str,
    ctlc: bool
    ctlc: bool,
    set_fork_aware_capture,
):
    '''
    Multiple daemon subactors, both erroring and breakpointing within a
    stream.
    Multiple daemon subactors, both erroring and breakpointing within
    a stream.

    '''
    child = spawn('multi_daemon_subactors')
    non_linux = _non_linux
    if non_linux and ctlc:
        pytest.skip(
            'Ctl-c + MacOS is too unreliable/racy for this test..\n'
        )
    # !TODO, if someone with more patience than i wants to muck
    # with the timings on this please feel free to see all the
    # `non_linux` branching logic i added on my first attempt
    # below!
    #
    # my conclusion was that if i were to run the script
    # manually, and thus as slowly as a human would, the test
    # would and should pass as described in this test fn, however
    # after fighting with it for >= 1hr. i decided more than
    # likely the more extensive `linux` testing should cover most
    # regressions.

    child = spawn('multi_daemon_subactors')
    child.expect(PROMPT)

    # there can be a race for which subactor will acquire

@@ -511,8 +544,19 @@ def test_multi_daemon_subactors(
    else:
        raise ValueError('Neither log msg was found !?')

    non_linux_delay: float = 0.3
    if ctlc:
        do_ctlc(child)
        do_ctlc(
            child,
            delay=(
                non_linux_delay
                if non_linux
                else None
            ),
        )

    if non_linux:
        time.sleep(1)

    # NOTE: previously since we did not have clobber prevention
    # in the root actor this final resume could result in the debugger

@@ -543,33 +587,69 @@ def test_multi_daemon_subactors(
    # assert "in use by child ('bp_forever'," in before

    if ctlc:
        do_ctlc(child)
        do_ctlc(
            child,
            delay=(
                non_linux_delay
                if non_linux
                else None
            ),
        )

    if non_linux:
        time.sleep(1)

    # expect another breakpoint actor entry
    child.sendline('c')
    child.expect(PROMPT)

    try:
        assert_before(
        before: str = assert_before(
            child,
            bp_forev_parts,
        )
    except AssertionError:
        assert_before(
    except (
        # AssertionError,  # TODO? rm since never raised?
        ValueError,
    ):
        before: str = assert_before(
            child,
            name_error_parts,
        )

    else:
        if ctlc:
            do_ctlc(child)
            before: str = do_ctlc(
                child,
                delay=(
                    non_linux_delay
                    if non_linux
                    else None
                ),
            )

        if non_linux:
            time.sleep(1)

        # should crash with the 2nd name error (simulates
        # a retry) and then the root eventually (boxed) errors
        # after 1 or more further bp actor entries.

        child.sendline('c')
        child.expect(PROMPT)
        try:
            child.expect(
                PROMPT,
                timeout=3,
            )
        except EOF:
            before: str = child.before.decode()
            print(
                f'\n'
                f'??? NEVER RXED `pdb` PROMPT ???\n'
                f'\n'
                f'{before}\n'
            )
            raise

        assert_before(
            child,
            name_error_parts,
@@ -689,7 +769,10 @@ def test_multi_subactors_root_errors(

@has_nested_actors
def test_multi_nested_subactors_error_through_nurseries(
    spawn,
    ci_env: bool,
    spawn: PexpectSpawner,
    is_forking_spawner: bool,
    test_log: tractor.log.StackLevelAdapter,

    # TODO: address debugger issue for nested tree:
    # https://github.com/goodboy/tractor/issues/320

@@ -706,51 +789,105 @@ def test_multi_nested_subactors_error_through_nurseries(
    # A test (below) has now been added to explicitly verify this is
    # fixed.

    child = spawn('multi_nested_subactors_error_up_through_nurseries')
    child = spawn(
        'multi_nested_subactors_error_up_through_nurseries',
        loglevel='pdb',
    )
    last_send_char: str|None = None
    for (
        i,
        send_char,
    ) in enumerate(itertools.cycle(['c', 'q'])):

        # timed_out_early: bool = False
        timeout: float = child.timeout
        if (
            _non_linux
            and
            ci_env
        ):
            timeout: float = 6

        # XXX linux but the first crash sequence
        # can take longer to arrive at a prompt.
        elif i == 0:
            timeout = 5

        # XXX forking backends may take longer due to
        # deterministic IPC cancellation.
        if is_forking_spawner:
            timeout += 4

    for send_char in itertools.cycle(['c', 'q']):
        try:
            child.expect(PROMPT)
            child.expect(
                PROMPT,
                timeout=timeout,
            )
            delay: float = 0.1
            test_log.info(f'Sleeping {delay!r} before next send-char..')
            time.sleep(delay)
            last_send_char: str = send_char
            child.sendline(send_char)
            time.sleep(0.01)
            time.sleep(delay)

        # script finally exited with tb on console.
        except EOF:
            test_log.info(
                f'Breaking from send-char loop\n'
                f'last_send_char: {last_send_char!r}\n'
            )
            break

    # boxed source errors
    expect_patts: list[str] = [
        "NameError: name 'doggypants' is not defined",
        "tractor._exceptions.RemoteActorError:",
        "('name_error'",

        # first level subtrees
        # "tractor._exceptions.RemoteActorError: ('spawner0'",
        "src_uid=('spawner0'",

        # "tractor._exceptions.RemoteActorError: ('spawner1'",

        # propagation of errors up through nested subtrees
        # "tractor._exceptions.RemoteActorError: ('spawn_until_0'",
        # "tractor._exceptions.RemoteActorError: ('spawn_until_1'",
        # "tractor._exceptions.RemoteActorError: ('spawn_until_2'",
        # ^-NOTE-^ old RAE repr, new one is below with a field
        # showing the src actor's uid.
        "src_uid=('spawn_until_2'",
    ]
    # XXX, I HAVE NO IDEA why these patts only show on the
    # `trio`-spawner but it seems to have something to do with
    # what gets dumped in prior-prompt latches somehow??
    # TODO for claude, explain and or work through how this is
    # happening but ONLY WHEN RUN FROM THE TEST, bc when i try to
    # run the test script manually the correct output ALWAYS seems
    # to be in the last `str(child.before.decode())` output !?!?
    if (
        not is_forking_spawner
        and
        last_send_char == 'q'
    ):
        expect_patts += [
            # expect the pdb-quit exc.
            "bdb.BdbQuit",
            # BUT WHY these dude!?
            "src_uid=('spawn_until_0'",
            "relay_uid=('spawn_until_1'",
        ]

    assert_before(
        child,
        [  # boxed source errors
            "NameError: name 'doggypants' is not defined",
            "tractor._exceptions.RemoteActorError:",
            "('name_error'",
            "bdb.BdbQuit",

            # first level subtrees
            # "tractor._exceptions.RemoteActorError: ('spawner0'",
            "src_uid=('spawner0'",

            # "tractor._exceptions.RemoteActorError: ('spawner1'",

            # propagation of errors up through nested subtrees
            # "tractor._exceptions.RemoteActorError: ('spawn_until_0'",
            # "tractor._exceptions.RemoteActorError: ('spawn_until_1'",
            # "tractor._exceptions.RemoteActorError: ('spawn_until_2'",
            # ^-NOTE-^ old RAE repr, new one is below with a field
            # showing the src actor's uid.
            "src_uid=('spawn_until_0'",
            "relay_uid=('spawn_until_1'",
            "src_uid=('spawn_until_2'",
        ]
        expect_patts,
    )
    expect(child, EOF)

@pytest.mark.timeout(15)
# @pytest.mark.timeout(15)
@has_nested_actors
def test_root_nursery_cancels_before_child_releases_tty_lock(
    spawn,
    start_method,
    ctlc: bool,
):
    '''

@@ -889,6 +1026,11 @@ def test_different_debug_mode_per_actor(
    )


# skip on non-Linux CI
@pytest.mark.ctlcs_bish(
    _non_linux,
    _ci_env,
)
def test_post_mortem_api(
    spawn,
    ctlc: bool,

@@ -1087,7 +1229,11 @@ def test_ctxep_pauses_n_maybe_ipc_breaks(
    mashed and zombie reaper kills sub with no hangs.

    '''
    child = spawn('subactor_bp_in_ctx')
    child = spawn(
        'subactor_bp_in_ctx',
        loglevel='devx'
        # ^XXX REQUIRED for below patt matching!
    )
    child.expect(PROMPT)

    # 3 iters for the `gen()` pause-points

@@ -1133,12 +1279,21 @@ def test_ctxep_pauses_n_maybe_ipc_breaks(
    # closed so verify we see error reporting as well as
    # a failed crash-REPL request msg and can CTL-c our way
    # out.

    # ?TODO, match depending on `tpt_proto(s)`?
    # - [ ] how can we pass it into the script tho?
    tpt: str = 'UDS'
    if _non_linux:
        tpt: str = 'TCP'

    assert_before(
        child,
        ['peer IPC channel closed abruptly?',
        'another task closed this fd',
        'Debug lock request was CANCELLED?',
        "TransportClosed: 'MsgpackUDSStream' was already closed locally ?",]
        f"'Msgpack{tpt}Stream' was already closed locally?",
        f"TransportClosed: 'Msgpack{tpt}Stream' was already closed 'by peer'?",
        ]

        # XXX races on whether these show/hit?
        # 'Failed to REPl via `_pause()` You called `tractor.pause()` from an already cancelled scope!',

@@ -1168,7 +1323,11 @@ def test_crash_handling_within_cancelled_root_actor(
    call.

    '''
    child = spawn('root_self_cancelled_w_error')
    child = spawn(
        'root_self_cancelled_w_error',
        loglevel='cancel',
        # ^XXX REQUIRED for below patt matching!
    )
    child.expect(PROMPT)

    assert_before(
@@ -63,19 +63,31 @@ def test_pause_from_sync(
    `examples/debugging/sync_bp.py`

    '''
    child = spawn('sync_bp')
    # XXX required for `breakpoint()` overload and
    # thus `tractor.devx.pause_from_sync()`.
    pytest.importorskip('greenback')
    child = spawn(
        'sync_bp',
        loglevel='pdb',  # XXX pattern matching
    )

    # first `sync_pause()` after nurseries open
    child.expect(PROMPT)
    assert_before(
    _before: str = assert_before(
        child,
        [
            # pre-prompt line
            _pause_msg,
            "<Task '__main__.main'",
            # devx-loglevel
            # "imported <module 'greenback' from",
            # "successfully scheduled `._pause()` in `trio` thread on behalf of <Task",

            _pause_msg,  # pre-prompt line
            "('root'",
            "<Task '__main__.main'",
            "tractor.pause_from_sync()",
        ]
    )
    # XXX `enable_stack_on_sig=False` in script
    assert 'stackscope' not in _before
    if ctlc:
        do_ctlc(child)
        # ^NOTE^ subactor not spawned yet; don't need extra delay.

@@ -85,18 +97,18 @@ def test_pause_from_sync(
    # first `await tractor.pause()` inside `p.open_context()` body
    child.expect(PROMPT)

    # XXX shouldn't see gb loaded message with PDB loglevel!
    # assert not in_prompt_msg(
    #     child,
    #     ['`greenback` portal opened!'],
    # )
    # should be same root task
    assert_before(
        child,
        [
            # XXX should see gb loaded with devx-loglevel.
            # "`greenback` portal opened!",
            # "Activated `greenback` for `tractor.pause_from_sync()` support!",

            _pause_msg,
            "<Task '__main__.main'",
            "('root'",
            "<Task '__main__.main'",
            "tractor.pause()",
        ]
    )

@@ -127,17 +139,17 @@ def test_pause_from_sync(
    # `Lock.acquire()`-ed
    # (NOT both, which will result in REPL clobbering!)
    attach_patts: dict[str, list[str]] = {
        'subactor': [
            "'start_n_sync_pause'",
            "('subactor'",
        "|_<Task 'start_n_sync_pause'": [
            "|_('subactor'",
            "tractor.pause_from_sync()",
        ],
        'inline_root_bg_thread': [
            "<Thread(inline_root_bg_thread",
        "|_<Thread(inline_root_bg_thread": [
            "('root'",
            "breakpoint(hide_tb=hide_tb)",
        ],
        'start_soon_root_bg_thread': [
            "<Thread(start_soon_root_bg_thread",
            "('root'",
        "|_<Thread(start_soon_root_bg_thread": [
            "|_('root'",
            "tractor.pause_from_sync()",
        ],
    }
    conts: int = 0  # for debugging below matching logic on failure

@@ -260,6 +272,9 @@ def test_sync_pause_from_aio_task(
    `examples/debugging/asyncio_bp.py`

    '''
    # XXX required for `breakpoint()` overload and
    # thus `tractor.devx.pause_from_sync()`.
    pytest.importorskip('greenback')
    child = spawn('asyncio_bp')

    # RACE on whether trio/asyncio task bps first
@@ -0,0 +1,170 @@
'''
Tests for `tractor.devx._proctitle` (per-actor `setproctitle`)
and the intrinsic-signal sub-actor detection in
`tractor._testing._reap`.

The proctitle is set in `tractor._child._actor_child_main()`
after `Actor` construction, so any spawned sub-actor process
should:

- have `argv[0]` (== `/proc/<pid>/cmdline`) start with
  `tractor[<aid.reprol()>]`
- have `/proc/<pid>/comm` start with `tractor[` (kernel
  truncates to ~15 bytes)
- be detected as a tractor sub-actor by
  `_is_tractor_subactor(pid)` via the cmdline marker.

`set_actor_proctitle()` itself is also unit-tested in-process
to verify the format string.

'''
from __future__ import annotations
import platform

import psutil
import pytest
import trio
import tractor

from tractor.runtime._runtime import Actor
from tractor.devx._proctitle import set_actor_proctitle
from tractor._testing._reap import (
    _is_tractor_subactor,
    _read_cmdline,
    _read_comm,
)


_non_linux: bool = platform.system() != 'Linux'


def test_set_actor_proctitle_format():
    '''
    `set_actor_proctitle()` returns the canonical
    `tractor[<aid.reprol()>]` form and actually mutates
    the running proc's title.

    '''
    pytest.importorskip(
        'setproctitle',
        reason='`setproctitle` is an optional runtime dep',
    )
    import setproctitle

    # save + restore so we don't pollute pytest's own title
    saved: str = setproctitle.getproctitle()
    try:
        actor = Actor(
            name='unit_test_actor',
            uuid='1027301b-a0e3-430e-8806-a5279f21abe6',
        )
        title: str = set_actor_proctitle(actor)

        # canonical wrapping: `tractor[<aid.reprol()>]`. We
        # compare against the runtime-computed `reprol()`
        # rather than a hard-coded value so the test stays
        # decoupled from `Aid.reprol()`'s internal format
        # (currently `<name>@<pid>`, but could evolve).
        expected: str = f'tractor[{actor.aid.reprol()}]'
        assert title == expected
        # sanity: the actor's name must be in the title
        # somewhere (so a future `reprol()` change that
        # drops the name is also caught).
        assert 'unit_test_actor' in title

        # actually set on the running proc
        assert setproctitle.getproctitle() == title

    finally:
        setproctitle.setproctitle(saved)


@pytest.mark.skipif(
    _non_linux,
    reason=(
        'detection helpers read `/proc/<pid>/{cmdline,comm}` '
        'which is Linux-specific'
    ),
)
def test_subactor_proctitle_visible_via_proc():
    '''
    Spawn a sub-actor and verify its proc-title is visible
    via both `/proc/<pid>/cmdline` AND `/proc/<pid>/comm`,
    AND that `_is_tractor_subactor()` correctly identifies
    it.

    '''
    pytest.importorskip('setproctitle')

    async def main() -> dict:
        async with tractor.open_nursery() as an:
            portal = await an.start_actor('proctitle_boi')
            # let the child finish setproctitle in
            # `_actor_child_main`
            await trio.sleep(0.3)

            # the sub-actor's pid is on the portal's chan
            # repr; psutil-walk `me.children()` is simpler.
            me = psutil.Process()
            sub_pids: list[int] = [
                p.pid for p in me.children(recursive=True)
            ]
            assert sub_pids, (
                'expected at least one spawned sub-actor pid'
            )

            results: dict = {}
            for pid in sub_pids:
                results[pid] = {
                    'cmdline': _read_cmdline(pid),
                    'comm': _read_comm(pid),
                    'is_tractor': _is_tractor_subactor(pid),
                }

            await portal.cancel_actor()
            return results

    found: dict = trio.run(main)

    # at least one of the spawned procs should match the
    # `proctitle_boi` actor we started; assert the proc-
    # title shape on it specifically.
    matched: list[tuple[int, dict]] = [
        (pid, info)
        for pid, info in found.items()
        if 'proctitle_boi' in info['cmdline']
    ]
    assert matched, (
        f'no sub-actor pid had a `proctitle_boi` cmdline; '
        f'all={found}'
    )

    pid, info = matched[0]
    # canonical proctitle prefix in cmdline (full form)
    assert info['cmdline'].startswith('tractor[proctitle_boi@'), (
        f'cmdline missing `tractor[proctitle_boi@…]` prefix: '
        f'{info["cmdline"]!r}'
    )
    # comm is kernel-truncated to ~15 bytes — just check the
    # `tractor[` prefix made it.
    assert info['comm'].startswith('tractor['), (
        f'comm missing `tractor[` prefix: {info["comm"]!r}'
    )
    # intrinsic-signal detector should match.
    assert info['is_tractor'] is True


@pytest.mark.skipif(
    _non_linux,
    reason='reads /proc/<pid>/{cmdline,comm}',
)
def test_is_tractor_subactor_negative():
    '''
    `_is_tractor_subactor()` returns False for non-tractor
    procs (e.g. the pytest test-runner pid itself, which
    is `python -m pytest …` — no `tractor[` proctitle, no
    `tractor._child` cmdline).

    '''
    import os
    assert _is_tractor_subactor(os.getpid()) is False
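# (Editor's note: a hedged sketch, NOT the real
# `tractor.devx._proctitle` source, of what `set_actor_proctitle()`
# presumably boils down to per the canonical `tractor[<aid.reprol()>]`
# form asserted above.)
import setproctitle

def set_actor_proctitle_sketch(actor) -> str:
    # wrap the actor-id repr in the `tractor[..]` marker that
    # `_is_tractor_subactor()` greps for in `/proc/<pid>/cmdline`
    title: str = f'tractor[{actor.aid.reprol()}]'
    setproctitle.setproctitle(title)
    return title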
@@ -21,6 +21,7 @@ import os
import signal
import time
from typing import (
    Callable,
    TYPE_CHECKING,
)


@@ -31,6 +32,9 @@ from .conftest import (
    PROMPT,
    _pause_msg,
)
from ..conftest import (
    no_macos,
)

import pytest
from pexpect.exceptions import (

@@ -42,8 +46,14 @@ if TYPE_CHECKING:
    from ..conftest import PexpectSpawner


@no_macos
def test_shield_pause(
    spawn: PexpectSpawner,
    spawn: Callable[
        ...,
        PexpectSpawner,
    ],
    start_method: str,
    request: pytest.FixtureRequest,
):
    '''
    Verify the `tractor.pause()/.post_mortem()` API works inside an

@@ -51,12 +61,15 @@ def test_shield_pause(
    next checkpoint wherein the cancelled will get raised.

    '''
    child = spawn(
        'shield_hang_in_sub'
    child: PexpectSpawner = spawn(
        'shield_hang_in_sub',
        loglevel='devx',
        # ^XXX REQUIRED for below patt matching!
    )
    expect(
        child,
        'Yo my child hanging..?',
        timeout=3,
    )
    assert_before(
        child,

@@ -81,38 +94,82 @@ def test_shield_pause(
        # end-of-tree delimiter
        "end-of-\('root'",
    )
    assert_before(
    _before: str = assert_before(
        child,
        [
            # 'Trying to dump `stackscope` tree..',
            # 'Dumping `stackscope` tree for actor',
            "('root'",  # uid line

            # TODO!? this used to show?
            # TODO!? this in-task-code used to show??
            # -[ ] mk reproducible for @oremanj?
            # => SOLVED? by our `trio_token.run_sync_soon()`
            #    approach?
            #
            # parent block point (non-shielded)
            # 'await trio.sleep_forever() # in root',
        ]
    )
    expect(
        child,
        # end-of-tree delimiter
        "end-of-\('hanger'",
    )
    assert_before(
        child,
        [
            # relay to the sub should be reported
            'Relaying `SIGUSR1`[10] to sub-actor',

            "('hanger'",  # uid line
    # NOTE, hierarchical-ordering invariant restored by
    # `_dump_then_relay` (co-scheduled dump+relay on the
    # trio loop, see `tractor.devx._stackscope`): the
    # parent's full task-tree prints BEFORE the 'Relaying
    # `SIGUSR1`' log msg, which prints BEFORE any sub-
    # actor receives the signal and dumps its own tree.
    # So the relay log appears BETWEEN `end-of-('root'`
    # (above) and `end-of-('hanger'` (below).
    handle_out_of_order: bool = False

    # XXX, when capfd is NOT used we don't expect to
    # see the logging output from the subactor.
    if (no_capfd := (start_method in [
        'main_thread_forkserver',
        ])
    ):
        opts = request.config.option
        assert opts.spawn_backend == start_method
        # ?XXX? i guess the `testdir` fixture "pretends to" reset
        # this to the default 'fd'??
        # assert opts.capture in [
        #     'sys',
        #     'no',
        # ]

    if (
        handle_out_of_order
        and
        "end-of-('hanger'" in _before
    ):
        assert "('hanger'" in _before
        assert 'Relaying `SIGUSR1`[10] to sub-actor' in _before

    else:
        _before = expect(
            child,
            'Relaying `SIGUSR1`\\[10\\] to sub-actor',
        )
        # _before: str = assert_before(
        #     child,
        #     ["('hanger'",]  # uid line
        # )
        if not no_capfd:
            expect(
                child,
                # end-of-subactor's-tree delimiter
                "end-of-\('hanger'",
            )
            _before: str = assert_before(
                child,
                [
                    "('hanger'",  # uid line

                    # TODO!? SEE ABOVE
                    # hanger LOC where it's shield-halted
                    # 'await trio.sleep_forever() # in subactor',
                ]
            )

            # TODO!? SEE ABOVE
            # hanger LOC where it's shield-halted
            # 'await trio.sleep_forever() # in subactor',
        ]
    )

    # simulate the user sending a ctl-c to the hanging program.
    # this should result in the terminator kicking in since

@@ -121,21 +178,26 @@ def test_shield_pause(
        child.pid,
        signal.SIGINT,
    )
    from tractor._supervise import _shutdown_msg
    from tractor.runtime._supervise import _shutdown_msg
    expect(
        child,
        # 'Shutting down actor runtime',
        _shutdown_msg,
        timeout=6,
    )
    assert_before(
        child,
        [
            'raise KeyboardInterrupt',
    expect_on_teardown: list[str] = [
        'raise KeyboardInterrupt',
        'Root actor terminated',
    ]
    if not no_capfd:
        expect_on_teardown += [
            # 'Shutting down actor runtime',
            '#T-800 deployed to collect zombie B0',
            "'--uid', \"('hanger',",
        ]
    assert_before(
        child,
        expect_on_teardown,
    )


@@ -151,8 +213,10 @@ def test_breakpoint_hook_restored(
    calls used.

    '''
    # XXX required for `breakpoint()` overload and
    # thus `tractor.devx.pause_from_sync()`.
    pytest.importorskip('greenback')
    child = spawn('restore_builtin_breakpoint')

    child.expect(PROMPT)
    try:
        assert_before(
@@ -0,0 +1,223 @@
'''
Discovery-suite fixtures, including the `daemon`
remote-registrar subprocess used by the multi-program
discovery tests.

Lives here (vs. the parent `tests/conftest.py`)
because `daemon` is a discovery-protocol primitive —
boots a separate `tractor.run_daemon()` process whose
sole purpose is to serve as a registrar peer for
discovery-roundtrip tests. Pytest fixtures inherit
DOWNWARD through the conftest hierarchy, so anything
under `tests/discovery/` automatically picks this up.

'''
from __future__ import annotations
import os
import platform
import socket
import subprocess
import sys
import time

import pytest
import tractor

from ..conftest import (
    sig_prog,
    _INT_SIGNAL,
    _non_linux,
)


def _wait_for_daemon_ready(
    reg_addr: tuple,
    tpt_proto: str,
    *,
    deadline: float = 10.0,
    poll_interval: float = 0.05,
    proc: subprocess.Popen|None = None,
) -> None:
    '''
    Actively poll the daemon's bind address until it
    accepts a connection (proving it has called
    `bind() + listen()` and is ready to handle IPC).

    Replaces the historical blind `time.sleep()` in the
    `daemon` fixture which was racy under load — see
    `ai/conc-anal/test_register_duplicate_name_daemon_connect_race_issue.md`.

    Uses stdlib `socket` directly (no trio runtime
    bootstrap cost) — sufficient because
    `tractor.run_daemon()` doesn't return from
    bootstrap until the runtime is fully ready to
    accept IPC.

    Raises `TimeoutError` on `deadline` exceeded. If
    `proc` is given, ALSO raises early if the daemon
    process exits non-zero before the deadline (catches
    a daemon-startup-crash that the blind sleep used to
    silently mask).

    '''
    end: float = time.monotonic() + deadline
    last_exc: Exception|None = None
    while time.monotonic() < end:
        # Daemon-died-during-startup early-exit. Without
        # this, a crashed-on-import daemon would just
        # eat the full deadline before raising an opaque
        # TimeoutError.
        if proc is not None and proc.poll() is not None:
            raise RuntimeError(
                f'Daemon proc exited (rc={proc.returncode}) '
                f'before becoming ready to accept on '
                f'{reg_addr!r}'
            )
        try:
            if tpt_proto == 'tcp':
                # `socket.create_connection` does the
                # `socket() + connect()` dance with a
                # builtin timeout — perfect primitive
                # for a one-shot probe.
                with socket.create_connection(
                    reg_addr,
                    timeout=poll_interval,
                ):
                    return
            else:
                # UDS — `reg_addr` is a `(filedir, sockname)`
                # tuple per `tractor.ipc._uds.UDSAddress.unwrap`.
                sockpath: str = os.path.join(*reg_addr)
                sock = socket.socket(socket.AF_UNIX)
                try:
                    sock.settimeout(poll_interval)
                    sock.connect(sockpath)
                    return
                finally:
                    sock.close()
        except (
            ConnectionRefusedError,
            FileNotFoundError,
            OSError,
            socket.timeout,
        ) as exc:
            last_exc = exc
            time.sleep(poll_interval)
    raise TimeoutError(
        f'Daemon never accepted on {reg_addr!r} within '
        f'{deadline}s (last connect-attempt exc: '
        f'{last_exc!r})'
    )


# TODO: factor into @cm and move to `._testing`?
@pytest.fixture
def daemon(
    debug_mode: bool,
    loglevel: str,
    testdir: pytest.Pytester,
    reg_addr: tuple[str, int],
    tpt_proto: str,
    ci_env: bool,
    test_log: tractor.log.StackLevelAdapter,

) -> subprocess.Popen:
    '''
    Run a daemon root actor as a separate actor-process
    tree and "remote registrar" for discovery-protocol
    related tests.

    '''
    # XXX: too much logging will lock up the subproc (smh)
    if loglevel in ('trace', 'debug'):
        test_log.warning(
            f'Test harness log level is too verbose: {loglevel!r}\n'
            f'Reducing to INFO level..'
        )
        loglevel: str = 'info'

    code: str = (
        "import tractor; "
        "tractor.run_daemon([], "
        "registry_addrs={reg_addrs}, "
        "enable_transports={enable_tpts}, "
        "debug_mode={debug_mode}, "
        "loglevel={ll})"
    ).format(
        reg_addrs=str([reg_addr]),
        enable_tpts=str([tpt_proto]),
        ll="'{}'".format(loglevel) if loglevel else None,
        debug_mode=debug_mode,
    )
    cmd: list[str] = [
        sys.executable,
        '-c', code,
    ]
    kwargs = {}
    if platform.system() == 'Windows':
        # without this, tests hang on windows forever
        kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP

    proc: subprocess.Popen = testdir.popen(
        cmd,
        **kwargs,
    )

    # Actively poll the daemon's bind address until it's
    # ready to accept connections — replaces the legacy
    # blind `time.sleep(2.2)` which was racy under load
    # (see
    # `ai/conc-anal/test_register_duplicate_name_daemon_connect_race_issue.md`).
    #
    # Per-test deadline scales with platform: macOS/CI
    # gets extra headroom; Linux dev boxes need very
    # little.
    deadline: float = (
        15.0 if (_non_linux and ci_env)
        else 10.0
    )
    _wait_for_daemon_ready(
        reg_addr=reg_addr,
        tpt_proto=tpt_proto,
        deadline=deadline,
        proc=proc,
    )

    assert not proc.returncode
    yield proc
    sig_prog(proc, _INT_SIGNAL)

    # XXX! yeah.. just be reaaal careful with this bc
    # sometimes it can lock up on the `_io.BufferedReader`
    # and hang..
    #
    # NB, the drain happens at TEARDOWN (post-yield), so the
    # test body has its chance to read `proc.stderr`
    # FIRST. Draining any earlier would silently swallow
    # the daemon's stderr output and break tests that
    # assert on it (e.g. `test_abort_on_sigint`).
    stderr: str = proc.stderr.read().decode()
    stdout: str = proc.stdout.read().decode()
    if (
        stderr
        or
        stdout
    ):
        print(
            f'Daemon actor tree produced output:\n'
            f'{proc.args}\n'
            f'\n'
            f'stderr: {stderr!r}\n'
            f'stdout: {stdout!r}\n'
        )

    if (rc := proc.returncode) != -2:
        msg: str = (
            f'Daemon actor tree was not cancelled !?\n'
            f'proc.args: {proc.args!r}\n'
            f'proc.returncode: {rc!r}\n'
        )
        if rc < 0:
            raise RuntimeError(msg)

        test_log.error(msg)
@@ -0,0 +1,355 @@
"""
Multiple python programs invoking the runtime.
"""
from __future__ import annotations
import platform
import subprocess
import time
from typing import (
    TYPE_CHECKING,
)

import pytest
import trio
import tractor
from tractor._testing import (
    tractor_test,
)
from tractor import (
    current_actor,
    Actor,
    Context,
    Portal,
)
from tractor.runtime import _state
from ..conftest import (
    sig_prog,
    _INT_SIGNAL,
    _INT_RETURN_CODE,
)

if TYPE_CHECKING:
    from tractor.msg import Aid
    from tractor.discovery._addr import (
        UnwrappedAddress,
    )


_non_linux: bool = platform.system() != 'Linux'


# NOTE, multi-program tests historically triggered both
# UDS sock-file leaks (daemon-subproc SIGKILL paths) AND
# trio `WakeupSocketpair.drain()` busy-loops
# (`test_register_duplicate_name`). Track + detect
# per-test as a regression net.
pytestmark = pytest.mark.usefixtures(
    'track_orphaned_uds_per_test',
    'detect_runaway_subactors_per_test',
)


def test_abort_on_sigint(
    daemon: subprocess.Popen,
):
    assert daemon.returncode is None
    time.sleep(0.1)
    sig_prog(daemon, _INT_SIGNAL)
    assert daemon.returncode == _INT_RETURN_CODE

    # XXX: oddly, couldn't get capfd.readouterr() to work here?
    if platform.system() != 'Windows':
        # don't check stderr on windows as it's empty when sending CTRL_C_EVENT
        assert "KeyboardInterrupt" in str(daemon.stderr.read())


@tractor_test
async def test_cancel_remote_registrar(
    daemon: subprocess.Popen,
    reg_addr: UnwrappedAddress,
):
    assert not current_actor().is_registrar
    async with tractor.get_registry(reg_addr) as portal:
        await portal.cancel_actor()

    time.sleep(0.1)
    # the registrar channel server is cancelled but not its main task
    assert daemon.returncode is None

    # no registrar socket should exist
    with pytest.raises(OSError):
        async with tractor.get_registry(reg_addr) as portal:
            pass


def test_register_duplicate_name(
    daemon: subprocess.Popen,
    reg_addr: UnwrappedAddress,
):
    # bug-class-3 breadcrumbs: the *last* `[CANCEL]` line that
    # appears under `--ll cancel`/`TRACTOR_LOG_FILE=...` names the
    # cancel-cascade boundary that's parked. Pair with
    # `_trio_main` entry/exit breadcrumbs in
    # `tractor/spawn/_entry.py` to triangulate the swallow point.
    log = tractor.log.get_logger('tractor.tests.test_multi_program')

    async def main():
        log.cancel('test_register_duplicate_name: enter `main()`')
        try:
            async with tractor.open_nursery(
                registry_addrs=[reg_addr],
            ) as an:
                log.cancel(
                    'test_register_duplicate_name: '
                    'actor nursery opened'
                )

                assert not current_actor().is_registrar

                p1 = await an.start_actor('doggy')
                log.cancel(
                    'test_register_duplicate_name: '
                    'spawned doggy #1'
                )
                p2 = await an.start_actor('doggy')
                log.cancel(
                    'test_register_duplicate_name: '
                    'spawned doggy #2'
                )

                async with tractor.wait_for_actor('doggy') as portal:
                    log.cancel(
                        'test_register_duplicate_name: '
                        '`wait_for_actor` returned'
                    )
                    assert portal.channel.uid in (p2.channel.uid, p1.channel.uid)

                log.cancel(
                    'test_register_duplicate_name: '
                    'ABOUT TO CALL `an.cancel()`'
                )
                await an.cancel()
                log.cancel(
                    'test_register_duplicate_name: '
                    '`an.cancel()` returned'
                )
        finally:
            log.cancel(
                'test_register_duplicate_name: '
                '`open_nursery.__aexit__` returned, leaving `main()`'
            )

    # XXX, run manually since we want to start this root **after**
    # the other "daemon" program with its own root.
    trio.run(main)


# `n_dups` in {4, 8} both expose the SAME pre-existing race:
# under rapid same-name spawning against a forkserver +
# registrar, ONE of the spawned doggies `sys.exit(2)`s during
# boot before completing the parent-handshake. Surfaces now (post
# the spawn-time `wait_for_peer_or_proc_death` fix) as
# `ActorFailure rc=2`; previously it was silently masked by
# the handshake-wait parking forever.
#
# Larger `n_dups` widens the race window so the boot-race
# fires more often — n_dups=4 hits ~always, n_dups=8 hits
# occasionally. Both xfail(strict=False) so the cancel-cascade
# regression-check still passes when the boot-race happens
# NOT to fire.
#
# Tracked separately in,
# https://github.com/goodboy/tractor/issues/456
_DOGGY_BOOT_RACE_XFAIL = pytest.mark.xfail(
    strict=False,
    reason=(
        'doggy boot-race rc=2 under rapid same-name '
        'spawn — separate bug from cancel-cascade'
    ),
)


@pytest.mark.parametrize(
    'n_dups',
    [
        2,
        pytest.param(4, marks=_DOGGY_BOOT_RACE_XFAIL),
        pytest.param(8, marks=_DOGGY_BOOT_RACE_XFAIL),
    ],
    ids=lambda n: f'n_dups={n}',
)
def test_dup_name_cancel_cascade_escalates_to_hard_kill(
    daemon: subprocess.Popen,
    reg_addr: UnwrappedAddress,
    n_dups: int,
):
    '''
    Regression for the duplicate-name cancel-cascade hang under
    `tcp+main_thread_forkserver`.

    When N actors share a single name and the parent calls
    `an.cancel()`, the daemon registrar gets N `register_actor` RPCs
    in tight succession. Under TCP+MTF, kernel-level socket-buffer
    contention can push at least one sub-actor's cancel-RPC ack past
    `Portal.cancel_timeout` (default 0.5s).

    Pre-fix, `Portal.cancel_actor()` silently returned `False` on
    that timeout, the supervisor's outer `move_on_after(3)` never
    fired (each per-portal task always returned ≤0.5s, never
    exceeded 3s), and `soft_kill()`'s `await wait_func(proc)` parked
    forever — deadlocking nursery `__aexit__`.

    Post-fix, `Portal.cancel_actor()` raises `ActorTooSlowError` on
    the bounded-wait timeout, and `ActorNursery.cancel()`'s
    per-child wrapper escalates to `proc.terminate()` (hard-kill).
    The full nursery teardown therefore stays bounded even under
    pathological timing.

    `n_dups` is parametrized to widen the race window — more
    same-name siblings = more concurrent register-RPCs at the
    daemon = higher probability of hitting the contention path.

    '''
    log = tractor.log.get_logger(
        'tractor.tests.test_multi_program'
    )

    # outer hard ceiling: a regression should fail-fast, NOT hang
    # the test session for minutes. Budget scales with `n_dups`
    # since each extra same-name sibling adds ~spawn-cost +
    # potential cancel-ack-timeout escalation latency under
    # TCP+forkserver. ~5s/sibling + 15s baseline gives plenty of
    # headroom while still failing-loud on a real hang.
    fail_after_s: int = 15 + (5 * n_dups)

    async def main():
        log.cancel(
            f'enter `main()` n_dups={n_dups}'
        )
        with trio.fail_after(fail_after_s):
            async with tractor.open_nursery(
                registry_addrs=[reg_addr],
            ) as an:
                portals: list[Portal] = []
                for i in range(n_dups):
                    p: Portal = await an.start_actor('doggy')
                    portals.append(p)
                    log.cancel(
                        f'spawned doggy #{i + 1}/{n_dups}'
                    )

                # at least one of the N must be discoverable by
                # name; doesn't matter which one (registrar will
                # have last-wins semantics under same-name).
                async with tractor.wait_for_actor('doggy') as portal:
                    expected_uids = {p.channel.uid for p in portals}
                    assert portal.channel.uid in expected_uids

                # critical section: this MUST return within
                # `fail_after_s` even when one or more cancel-RPC
                # acks time out. Pre-fix, this hangs forever.
                log.cancel('about to call `an.cancel()`')
                await an.cancel()
                log.cancel('`an.cancel()` returned')

            # post-teardown sanity: every child proc must be reaped.
            # If escalation worked, even timed-out cancel-RPCs would
            # have triggered `proc.terminate()` and the procs are dead.
            for p in portals:
                # `Portal.channel.connected()` -> False once the
                # underlying chan disconnected (clean exit OR
                # hard-killed proc both produce disconnect).
                assert not p.channel.connected(), (
                    f'Portal chan still connected post-teardown?\n'
                    f'{p.channel}'
                )

    trio.run(main)


@tractor.context
async def get_root_portal(
    ctx: Context,
):
    '''
    Connect back to the root actor manually (using the `._discovery`
    API) and ensure its contact info is the same as our immediate
    parent's.

    '''
    sub: Actor = current_actor()
    rtvs: dict = _state._runtime_vars
    raddrs: list[UnwrappedAddress] = rtvs['_root_addrs']

    # await tractor.pause()
    # XXX, in case the sub->root discovery breaks you might need
    # this (i know i did Xp)!!
    # from tractor.devx import mk_pdb
    # mk_pdb().set_trace()

    assert (
        len(raddrs) == 1
        and
        list(sub._parent_chan.raddr.unwrap()) in raddrs
    )

    # connect back to our immediate parent which should also
    # be the actor-tree's root.
    from tractor.discovery._api import get_root
    ptl: Portal
    async with get_root() as ptl:
        root_aid: Aid = ptl.chan.aid
        parent_ptl: Portal = current_actor().get_parent()
        assert (
            root_aid.name == 'root'
            and
            parent_ptl.chan.aid == root_aid
        )
        await ctx.started()


def test_non_registrar_spawns_child(
    daemon: subprocess.Popen,
    reg_addr: UnwrappedAddress,
    loglevel: str,
    debug_mode: bool,
    ci_env: bool,
):
    '''
    Ensure a non-registrar (serving) root actor can spawn a sub and
    that sub can connect back (manually) to its parent, which is the
    root, without issue.

    More or less this audits the global contact info in
    `._state._runtime_vars`.

    '''
    async def main():

        # XXX, since apparently on macos in GH's CI it can be a race
        # with the `daemon` registrar on grabbing the socket-addr..
        if ci_env and _non_linux:
            await trio.sleep(.5)

        async with tractor.open_nursery(
            registry_addrs=[reg_addr],
            loglevel=loglevel,
            debug_mode=debug_mode,
        ) as an:

            actor: Actor = tractor.current_actor()
            assert not actor.is_registrar
            sub_ptl: Portal = await an.start_actor(
                name='sub',
                enable_modules=[__name__],
            )

            async with sub_ptl.open_context(
                get_root_portal,
            ) as (ctx, _):
                print('Waiting for `sub` to connect back to us..')

            await an.cancel()

    # XXX, run manually since we want to start this root **after**
    # the other "daemon" program with its own root.
    trio.run(main)
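# (Editor's note: a hedged sketch of the bounded-wait + escalation
# pattern described in the docstring of
# `test_dup_name_cancel_cascade_escalates_to_hard_kill` above; the
# helper name and wrapper shape are assumptions — the real logic
# lives in `ActorNursery.cancel()`/`Portal.cancel_actor()`.)
async def _cancel_child_or_hard_kill(portal, proc) -> None:
    try:
        # bounded-wait on the remote cancel ack
        # (`Portal.cancel_timeout`, default 0.5s per the test)
        await portal.cancel_actor()
    except Exception:  # e.g. the `ActorTooSlowError` named above
        # never acked in time: escalate to a hard-kill so nursery
        # teardown stays bounded instead of parking forever.
        proc.terminate()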
@ -0,0 +1,376 @@
|
|||
'''
|
||||
Multiaddr construction, parsing, and round-trip tests for
|
||||
`tractor.discovery._multiaddr.mk_maddr()` and
|
||||
`tractor.discovery._multiaddr.parse_maddr()`.
|
||||
|
||||
'''
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
|
||||
import pytest
|
||||
from multiaddr import Multiaddr
|
||||
|
||||
from tractor.ipc._tcp import TCPAddress
|
||||
from tractor.ipc._uds import UDSAddress
|
||||
from tractor.discovery._multiaddr import (
|
||||
mk_maddr,
|
||||
parse_maddr,
|
||||
parse_endpoints,
|
||||
_tpt_proto_to_maddr,
|
||||
_maddr_to_tpt_proto,
|
||||
)
|
||||
from tractor.discovery._addr import wrap_address
|
||||
|
||||
|
||||
def test_tpt_proto_to_maddr_mapping():
|
||||
'''
|
||||
`_tpt_proto_to_maddr` maps all supported `proto_key`
|
||||
values to their correct multiaddr protocol names.
|
||||
|
||||
'''
|
||||
assert _tpt_proto_to_maddr['tcp'] == 'tcp'
|
||||
assert _tpt_proto_to_maddr['uds'] == 'unix'
|
||||
assert len(_tpt_proto_to_maddr) == 2
|
||||
|
||||
|
||||
def test_mk_maddr_tcp_ipv4():
|
||||
'''
|
||||
`mk_maddr()` on a `TCPAddress` with an IPv4 host
|
||||
produces the correct `/ip4/<host>/tcp/<port>` multiaddr.
|
||||
|
||||
'''
|
||||
addr = TCPAddress('127.0.0.1', 1234)
|
||||
result: Multiaddr = mk_maddr(addr)
|
||||
|
||||
assert isinstance(result, Multiaddr)
|
||||
assert str(result) == '/ip4/127.0.0.1/tcp/1234'
|
||||
|
||||
protos = result.protocols()
|
||||
assert protos[0].name == 'ip4'
|
||||
assert protos[1].name == 'tcp'
|
||||
|
||||
assert result.value_for_protocol('ip4') == '127.0.0.1'
|
||||
assert result.value_for_protocol('tcp') == '1234'
|
||||
|
||||
|
||||
def test_mk_maddr_tcp_ipv6():
|
||||
'''
|
||||
`mk_maddr()` on a `TCPAddress` with an IPv6 host
|
||||
produces the correct `/ip6/<host>/tcp/<port>` multiaddr.
|
||||
|
||||
'''
|
||||
addr = TCPAddress('::1', 5678)
|
||||
result: Multiaddr = mk_maddr(addr)
|
||||
|
||||
assert str(result) == '/ip6/::1/tcp/5678'
|
||||
|
||||
protos = result.protocols()
|
||||
assert protos[0].name == 'ip6'
|
||||
assert protos[1].name == 'tcp'
|
||||
|
||||
|
||||
def test_mk_maddr_uds():
|
||||
'''
|
||||
`mk_maddr()` on a `UDSAddress` produces a `/unix/<path>`
|
||||
multiaddr containing the full socket path.
|
||||
|
||||
'''
|
||||
# NOTE, use an absolute `filedir` to match real runtime
|
||||
# UDS paths; `mk_maddr()` strips the leading `/` to avoid
|
||||
# the double-slash `/unix//run/..` that py-multiaddr
|
||||
# rejects as "empty protocol path".
|
||||
filedir = '/tmp/tractor_test'
|
||||
filename = 'test_sock.sock'
|
||||
addr = UDSAddress(
|
||||
filedir=filedir,
|
||||
filename=filename,
|
||||
)
|
||||
result: Multiaddr = mk_maddr(addr)
|
||||
|
||||
assert isinstance(result, Multiaddr)
|
||||
|
||||
result_str: str = str(result)
|
||||
assert result_str.startswith('/unix/')
|
||||
# verify the leading `/` was stripped to avoid double-slash
|
||||
assert '/unix/tmp/tractor_test/' in result_str
|
||||
|
||||
sockpath_rel: str = str(
|
||||
Path(filedir) / filename
|
||||
).lstrip('/')
|
||||
unix_val: str = result.value_for_protocol('unix')
|
||||
assert unix_val.endswith(sockpath_rel)
|
||||
|
||||
|
||||
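# Editor's note (illustrative, not asserted by this suite): the
# naive path join would yield a form py-multiaddr refuses outright,
#
#   Multiaddr('/unix//tmp/tractor_test/test_sock.sock')
#   # -> rejected as an "empty protocol path"
#
# hence the wire form is `/unix/tmp/..` and `parse_maddr()` (see
# below) re-adds the leading slash to restore absolute-path
# semantics.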
def test_mk_maddr_unsupported_proto_key():
    '''
    `mk_maddr()` raises `ValueError` for an unsupported
    `proto_key`.

    '''
    fake_addr = SimpleNamespace(proto_key='quic')
    with pytest.raises(
        ValueError,
        match='Unsupported proto_key',
    ):
        mk_maddr(fake_addr)


@pytest.mark.parametrize(
    'addr',
    [
        pytest.param(
            TCPAddress('127.0.0.1', 9999),
            id='tcp-ipv4',
        ),
        pytest.param(
            UDSAddress(
                filedir='/tmp/tractor_rt',
                filename='roundtrip.sock',
            ),
            id='uds',
        ),
    ],
)
def test_mk_maddr_roundtrip(addr):
    '''
    `mk_maddr()` output is valid multiaddr syntax that the
    library can re-parse back into an equivalent `Multiaddr`.

    '''
    maddr: Multiaddr = mk_maddr(addr)
    reparsed = Multiaddr(str(maddr))

    assert reparsed == maddr
    assert str(reparsed) == str(maddr)
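# A minimal usage sketch (editor's illustration; it only uses the
# APIs already imported and asserted above, so nothing here is
# new behavior):
def _example_maddr_roundtrip() -> None:
    addr = TCPAddress('127.0.0.1', 1234)
    maddr: Multiaddr = mk_maddr(addr)
    assert str(maddr) == '/ip4/127.0.0.1/tcp/1234'
    # and back from the string form to an equivalent `Address`
    assert parse_maddr(str(maddr)).unwrap() == addr.unwrap()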
# ------ parse_maddr() tests ------

def test_maddr_to_tpt_proto_mapping():
    '''
    `_maddr_to_tpt_proto` is the exact inverse of
    `_tpt_proto_to_maddr`.

    '''
    assert _maddr_to_tpt_proto == {
        'tcp': 'tcp',
        'unix': 'uds',
    }


def test_parse_maddr_tcp_ipv4():
    '''
    `parse_maddr()` on an IPv4 TCP multiaddr string
    produces a `TCPAddress` with the correct host and port.

    '''
    result = parse_maddr('/ip4/127.0.0.1/tcp/1234')

    assert isinstance(result, TCPAddress)
    assert result.unwrap() == ('127.0.0.1', 1234)


def test_parse_maddr_tcp_ipv6():
    '''
    `parse_maddr()` on an IPv6 TCP multiaddr string
    produces a `TCPAddress` with the correct host and port.

    '''
    result = parse_maddr('/ip6/::1/tcp/5678')

    assert isinstance(result, TCPAddress)
    assert result.unwrap() == ('::1', 5678)


def test_parse_maddr_uds():
    '''
    `parse_maddr()` on a `/unix/...` multiaddr string
    produces a `UDSAddress` with the correct dir and filename,
    preserving absolute path semantics.

    '''
    result = parse_maddr('/unix/tmp/tractor_test/test.sock')

    assert isinstance(result, UDSAddress)
    filedir, filename = result.unwrap()
    assert filename == 'test.sock'
    assert str(filedir) == '/tmp/tractor_test'


def test_parse_maddr_unsupported():
    '''
    `parse_maddr()` raises `ValueError` for an unsupported
    protocol combination like UDP.

    '''
    with pytest.raises(
        ValueError,
        match='Unsupported multiaddr protocol combo',
    ):
        parse_maddr('/ip4/127.0.0.1/udp/1234')


@pytest.mark.parametrize(
    'addr',
    [
        pytest.param(
            TCPAddress('127.0.0.1', 9999),
            id='tcp-ipv4',
        ),
        pytest.param(
            UDSAddress(
                filedir='/tmp/tractor_rt',
                filename='roundtrip.sock',
            ),
            id='uds',
        ),
    ],
)
def test_parse_maddr_roundtrip(addr):
    '''
    Full round-trip: `addr -> mk_maddr -> str -> parse_maddr`
    produces an `Address` whose `.unwrap()` matches the original.

    '''
    maddr: Multiaddr = mk_maddr(addr)
    maddr_str: str = str(maddr)
    parsed = parse_maddr(maddr_str)

    assert type(parsed) is type(addr)
    assert parsed.unwrap() == addr.unwrap()


def test_wrap_address_maddr_str():
    '''
    `wrap_address()` accepts a multiaddr-format string and
    returns the correct `Address` type.

    '''
    result = wrap_address('/ip4/127.0.0.1/tcp/9999')

    assert isinstance(result, TCPAddress)
    assert result.unwrap() == ('127.0.0.1', 9999)
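# Editor's sketch (an inference from the cases in this file plus
# the registrar suite's use of `wrap_address(reg_addr)` on raw
# tuples): both input forms normalize to the same `Address`.
def _example_wrap_both_forms() -> None:
    a1 = wrap_address(('127.0.0.1', 9999))
    a2 = wrap_address('/ip4/127.0.0.1/tcp/9999')
    assert a1.unwrap() == a2.unwrap() == ('127.0.0.1', 9999)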
# ------ parse_endpoints() tests ------

def test_parse_endpoints_tcp_only():
    '''
    `parse_endpoints()` with a single TCP maddr per actor
    produces the correct `TCPAddress` instances.

    '''
    table = {
        'registry': ['/ip4/127.0.0.1/tcp/1616'],
        'data_feed': ['/ip4/0.0.0.0/tcp/5555'],
    }
    result = parse_endpoints(table)

    assert set(result.keys()) == {'registry', 'data_feed'}

    reg_addr = result['registry'][0]
    assert isinstance(reg_addr, TCPAddress)
    assert reg_addr.unwrap() == ('127.0.0.1', 1616)

    feed_addr = result['data_feed'][0]
    assert isinstance(feed_addr, TCPAddress)
    assert feed_addr.unwrap() == ('0.0.0.0', 5555)


def test_parse_endpoints_mixed_tpts():
    '''
    `parse_endpoints()` with both TCP and UDS maddrs for
    the same actor produces the correct mixed `Address` list.

    '''
    table = {
        'broker': [
            '/ip4/127.0.0.1/tcp/4040',
            '/unix/tmp/tractor/broker.sock',
        ],
    }
    result = parse_endpoints(table)
    addrs = result['broker']

    assert len(addrs) == 2
    assert isinstance(addrs[0], TCPAddress)
    assert addrs[0].unwrap() == ('127.0.0.1', 4040)

    assert isinstance(addrs[1], UDSAddress)
    filedir, filename = addrs[1].unwrap()
    assert filename == 'broker.sock'
    assert str(filedir) == '/tmp/tractor'


def test_parse_endpoints_unwrapped_tuples():
    '''
    `parse_endpoints()` accepts raw `(host, port)` tuples
    and wraps them as `TCPAddress`.

    '''
    table = {
        'ems': [('127.0.0.1', 6666)],
    }
    result = parse_endpoints(table)

    addr = result['ems'][0]
    assert isinstance(addr, TCPAddress)
    assert addr.unwrap() == ('127.0.0.1', 6666)


def test_parse_endpoints_mixed_str_and_tuple():
    '''
    `parse_endpoints()` accepts a mix of maddr strings and
    raw tuples in the same actor entry list.

    '''
    table = {
        'quoter': [
            '/ip4/127.0.0.1/tcp/7777',
            ('127.0.0.1', 8888),
        ],
    }
    result = parse_endpoints(table)
    addrs = result['quoter']

    assert len(addrs) == 2
    assert isinstance(addrs[0], TCPAddress)
    assert addrs[0].unwrap() == ('127.0.0.1', 7777)

    assert isinstance(addrs[1], TCPAddress)
    assert addrs[1].unwrap() == ('127.0.0.1', 8888)


def test_parse_endpoints_unsupported_proto():
    '''
    `parse_endpoints()` raises `ValueError` when a maddr
    string uses an unsupported protocol like `/udp/`.

    '''
    table = {
        'bad_actor': ['/ip4/127.0.0.1/udp/9999'],
    }
    with pytest.raises(
        ValueError,
        match='Unsupported multiaddr protocol combo',
    ):
        parse_endpoints(table)


def test_parse_endpoints_empty_table():
    '''
    `parse_endpoints()` on an empty table returns an empty
    dict.

    '''
    assert parse_endpoints({}) == {}


def test_parse_endpoints_empty_actor_list():
    '''
    `parse_endpoints()` with an actor mapped to an empty
    list preserves the key with an empty list value.

    '''
    result = parse_endpoints({'x': []})
    assert result == {'x': []}
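# Editor's sketch of the combined table shape the suite above
# exercises piecewise (actor names illustrative only):
def _example_endpoints_table() -> None:
    table = {
        'registry': ['/ip4/127.0.0.1/tcp/1616'],
        'broker': [
            '/unix/tmp/tractor/broker.sock',
            ('127.0.0.1', 4040),  # raw tuples wrap as `TCPAddress`
        ],
    }
    result = parse_endpoints(table)
    assert isinstance(result['broker'][0], UDSAddress)
    assert isinstance(result['broker'][1], TCPAddress)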

@@ -0,0 +1,673 @@
'''
Discovery-subsystem scenarios driven through a "registrar" actor.

'''
import os
import signal
import platform
from functools import partial
import itertools
import time
from typing import Callable

import psutil
import pytest
import subprocess
import tractor
from tractor.devx import dump_on_hang
from tractor.trionics import collapse_eg
from tractor._testing import tractor_test
from tractor.discovery._addr import wrap_address
from tractor.discovery._multiaddr import mk_maddr
import trio


pytestmark = pytest.mark.usefixtures(
    'reap_subactors_per_test',
    # NOTE, registrar tests stress the discovery
    # roundtrip (find_actor / wait_for_actor) which
    # historically left orphaned UDS sock-files when
    # subactor `hard_kill` SIGKILL'd, and which
    # exercises the same trio `WakeupSocketpair`
    # peer-disconnect path that triggered the
    # busy-loop bug class.
    'track_orphaned_uds_per_test',
    'detect_runaway_subactors_per_test',
)


@tractor_test
async def test_reg_then_unreg(
    reg_addr: tuple,
):
    actor = tractor.current_actor()
    assert actor.is_registrar
    assert len(actor._registry) == 1  # only self is registered

    async with tractor.open_nursery(
        registry_addrs=[reg_addr],
    ) as n:

        portal = await n.start_actor('actor', enable_modules=[__name__])
        uid = portal.channel.aid.uid

        async with tractor.get_registry(reg_addr) as aportal:
            # this local actor should be the registrar
            assert actor is aportal.actor

            async with tractor.wait_for_actor('actor'):
                # sub-actor uid should be in the registry
                assert uid in aportal.actor._registry
                sockaddrs = actor._registry[uid]
                # XXX: can we figure out what the listen addr will be?
                assert sockaddrs

        await n.cancel()  # tear down nursery

        await trio.sleep(0.1)
        assert uid not in aportal.actor._registry
        sockaddrs = actor._registry.get(uid)
        assert not sockaddrs


@tractor_test
async def test_reg_then_unreg_maddr(
    reg_addr: tuple,
):
    '''
    Same as `test_reg_then_unreg` but pass the registry
    address as a multiaddr string to verify `wrap_address()`
    multiaddr parsing end-to-end through the runtime.

    '''
    # tuple -> Address -> multiaddr string
    addr_obj = wrap_address(reg_addr)
    maddr_str: str = str(mk_maddr(addr_obj))

    actor = tractor.current_actor()
    assert actor.is_registrar

    async with tractor.open_nursery(
        registry_addrs=[maddr_str],
    ) as n:

        portal = await n.start_actor(
            'actor_maddr',
            enable_modules=[__name__],
        )
        uid = portal.channel.aid.uid

        async with tractor.get_registry(maddr_str) as aportal:
            assert actor is aportal.actor

            async with tractor.wait_for_actor('actor_maddr'):
                assert uid in aportal.actor._registry
                sockaddrs = actor._registry[uid]
                assert sockaddrs

        await n.cancel()

        await trio.sleep(0.1)
        assert uid not in aportal.actor._registry
        sockaddrs = actor._registry.get(uid)
        assert not sockaddrs


the_line = 'Hi my name is {}'


async def hi():
    return the_line.format(tractor.current_actor().name)


async def say_hello_use_wait(
    other_actor: str,
    reg_addr: tuple[str, int],
):
    async with tractor.wait_for_actor(
        other_actor,
        registry_addr=reg_addr,
    ) as portal:
        assert portal is not None
        result = await portal.run(__name__, 'hi')
        return result


@tractor_test(
    timeout=7,
)
@pytest.mark.parametrize(
    'ria_fn',
    [
        say_hello_use_wait,
    ]
)
async def test_trynamic_trio(
    ria_fn: Callable,
    start_method: str,
    reg_addr: tuple,
):
    '''
    Root actor acting as the "director" and running one-shot-task-actors
    for the directed subs.

    '''
    async with tractor.open_nursery() as n:
        print("Alright... Action!")

        donny = await n.run_in_actor(
            ria_fn,
            other_actor='gretchen',
            reg_addr=reg_addr,
            name='donny',
        )
        gretchen = await n.run_in_actor(
            ria_fn,
            other_actor='donny',
            reg_addr=reg_addr,
            name='gretchen',
        )
        print(await gretchen.result())
        print(await donny.result())
        print("CUTTTT CUUTT CUT!!?! Donny!! You're supposed to say...")


async def stream_forever():
    for i in itertools.count():
        yield i
        await trio.sleep(0.01)


async def cancel(
    use_signal: bool,
    delay: float = 0,
):
    # hold on there sally
    await trio.sleep(delay)

    # trigger cancel
    if use_signal:
        if platform.system() == 'Windows':
            pytest.skip("SIGINT not supported on windows")
        os.kill(os.getpid(), signal.SIGINT)
    else:
        raise KeyboardInterrupt


async def stream_from(portal: tractor.Portal):
    async with portal.open_stream_from(stream_forever) as stream:
        async for value in stream:
            print(value)


async def unpack_reg(
    actor_or_portal: tractor.Portal|tractor.Actor,
):
    '''
    Get and unpack a "registry" RPC request from the registrar
    system.

    '''
    if getattr(actor_or_portal, 'get_registry', None):
        msg = await actor_or_portal.get_registry()
    else:
        msg = await actor_or_portal.run_from_ns('self', 'get_registry')

    return {
        tuple(key.split('.')): val
        for key, val in msg.items()
    }
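# NOTE (editor, inferred from the comprehension above and the
# uid-keyed asserts throughout this file): the raw registry msg
# maps flat `'name.uuid'` string keys to addr lists; splitting on
# `'.'` restores the `(name, uuid)` tuple-key form that
# `Actor._registry` and `Channel.aid.uid` use.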
async def spawn_and_check_registry(
    reg_addr: tuple,
    use_signal: bool,
    debug_mode: bool = False,
    remote_arbiter: bool = False,
    with_streaming: bool = False,
    maybe_daemon: tuple[
        subprocess.Popen,
        psutil.Process,
    ]|None = None,

) -> None:

    if maybe_daemon:
        popen, proc = maybe_daemon
        # breakpoint()

    async with tractor.open_root_actor(
        registry_addrs=[reg_addr],
        debug_mode=debug_mode,
    ):
        async with tractor.get_registry(
            addr=reg_addr,
        ) as portal:
            # runtime needs to be up to call this
            actor = tractor.current_actor()

            if remote_arbiter:
                assert not actor.is_registrar

            if actor.is_registrar:
                extra = 1  # registrar is local root actor
                get_reg = partial(unpack_reg, actor)

            else:
                get_reg = partial(unpack_reg, portal)
                extra = 2  # local root actor + remote registrar

            # ensure current actor is registered
            registry: dict = await get_reg()
            assert actor.aid.uid in registry

            try:
                async with tractor.open_nursery() as an:
                    async with (
                        collapse_eg(),
                        trio.open_nursery() as trion,
                    ):
                        portals = {}
                        for i in range(3):
                            name = f'a{i}'
                            if with_streaming:
                                portals[name] = await an.start_actor(
                                    name=name, enable_modules=[__name__])

                            else:  # no streaming
                                portals[name] = await an.run_in_actor(
                                    trio.sleep_forever, name=name)

                        # wait on last actor to come up
                        async with tractor.wait_for_actor(name):
                            registry = await get_reg()
                            for uid in an._children:
                                assert uid in registry

                        assert len(portals) + extra == len(registry)

                        if with_streaming:
                            await trio.sleep(0.1)

                            pts = list(portals.values())
                            for p in pts[:-1]:
                                trion.start_soon(stream_from, p)

                            # stream for 1 sec
                            trion.start_soon(cancel, use_signal, 1)

                            last_p = pts[-1]
                            await stream_from(last_p)

                        else:
                            await cancel(use_signal)

            finally:
                await trio.sleep(0.5)

                # all subactors should have de-registered
                registry = await get_reg()
                start: float = time.time()
                while (
                    not (len(registry) == extra)
                    and
                    (time.time() - start) < 5
                ):
                    print(
                        f'Waiting for remaining subs to dereg..\n'
                        f'{registry!r}\n'
                    )
                    await trio.sleep(0.3)
                    # refresh, else the loop can never observe the
                    # dereg and spins until the 5s deadline
                    registry = await get_reg()
                else:
                    assert len(registry) == extra

                assert actor.aid.uid in registry


async def with_timeout(
    main: Callable,
    timeout: float = 6,
):
    with trio.fail_after(timeout):
        await main()


@pytest.mark.parametrize('use_signal', [False, True])
@pytest.mark.parametrize('with_streaming', [False, True])
def test_subactors_unregister_on_cancel(
    debug_mode: bool,
    start_method: str,
    use_signal: bool,
    reg_addr: tuple,
    with_streaming: bool,
):
    '''
    Verify that cancelling a nursery results in all subactors
    deregistering themselves with the registrar.

    '''
    with pytest.raises(KeyboardInterrupt):
        trio.run(
            # with_timeout,
            partial(
                spawn_and_check_registry,
                reg_addr,
                use_signal,
                debug_mode=debug_mode,
                remote_arbiter=False,
                with_streaming=with_streaming,
            ),
        )


@pytest.mark.parametrize('use_signal', [False, True])
@pytest.mark.parametrize('with_streaming', [False, True])
def test_subactors_unregister_on_cancel_remote_daemon(
    daemon: subprocess.Popen,
    debug_mode: bool,
    start_method: str,
    use_signal: bool,
    reg_addr: tuple,
    with_streaming: bool,
):
    '''
    Verify that cancelling a nursery results in all subactors
    deregistering themselves with a **remote** (not in the local
    process tree) registrar.

    '''
    with pytest.raises(KeyboardInterrupt):
        trio.run(
            with_timeout,
            partial(
                spawn_and_check_registry,
                reg_addr,
                use_signal,
                debug_mode=debug_mode,
                remote_arbiter=True,
                with_streaming=with_streaming,
                maybe_daemon=(
                    daemon,
                    psutil.Process(daemon.pid)
                ),
            ),
        )


async def streamer(agen):
    async for item in agen:
        print(item)


async def close_chans_before_nursery(
    reg_addr: tuple,
    use_signal: bool,
    remote_arbiter: bool = False,
) -> None:

    # logic for how many actors should still be
    # in the registry at teardown.
    if remote_arbiter:
        entries_at_end = 2
    else:
        entries_at_end = 1

    async with tractor.open_root_actor(
        registry_addrs=[reg_addr],
    ):
        async with tractor.get_registry(reg_addr) as aportal:
            try:
                get_reg = partial(unpack_reg, aportal)

                async with tractor.open_nursery() as an:
                    portal1 = await an.start_actor(
                        name='consumer1',
                        enable_modules=[__name__],
                    )
                    portal2 = await an.start_actor(
                        'consumer2',
                        enable_modules=[__name__],
                    )

                    async with (
                        portal1.open_stream_from(
                            stream_forever
                        ) as agen1,
                        portal2.open_stream_from(
                            stream_forever
                        ) as agen2,
                    ):
                        async with (
                            collapse_eg(),
                            trio.open_nursery() as tn,
                        ):
                            tn.start_soon(streamer, agen1)
                            tn.start_soon(cancel, use_signal, .5)
                            try:
                                await streamer(agen2)
                            finally:
                                # Kill the root nursery thus resulting in
                                # normal registrar channel ops to fail during
                                # teardown. It doesn't seem like this is
                                # reliably triggered by an external SIGINT.
                                # tractor.current_actor()._root_nursery.cancel_scope.cancel()

                                # XXX: THIS IS THE KEY THING that
                                # happens **before** exiting the
                                # actor nursery block

                                # also kill off channels cuz why not
                                await agen1.aclose()
                                await agen2.aclose()

            finally:
                with trio.CancelScope(shield=True):
                    await trio.sleep(1)

                    # all subactors should have de-registered
                    registry = await get_reg()
                    assert portal1.channel.aid.uid not in registry
                    assert portal2.channel.aid.uid not in registry
                    assert len(registry) == entries_at_end


@pytest.mark.parametrize('use_signal', [False, True])
def test_close_channel_explicit(
    start_method: str,
    use_signal: bool,
    reg_addr: tuple,
):
    '''
    Verify that closing a stream explicitly and killing the actor's
    "root nursery" **before** the containing nursery tears down also
    results in subactor(s) deregistering from the registrar.

    '''
    with pytest.raises(KeyboardInterrupt):
        trio.run(
            partial(
                close_chans_before_nursery,
                reg_addr,
                use_signal,
                remote_arbiter=False,
            ),
        )


@pytest.mark.parametrize('use_signal', [False, True])
def test_close_channel_explicit_remote_registrar(
    daemon: subprocess.Popen,
    start_method: str,
    use_signal: bool,
    reg_addr: tuple,
):
    '''
    Verify that closing a stream explicitly and killing the actor's
    "root nursery" **before** the containing nursery tears down also
    results in subactor(s) deregistering from the registrar.

    '''
    with pytest.raises(KeyboardInterrupt):
        trio.run(
            partial(
                close_chans_before_nursery,
                reg_addr,
                use_signal,
                remote_arbiter=True,
            ),
        )


@tractor.context
async def kill_transport(
    ctx: tractor.Context,
) -> None:

    await ctx.started()
    actor: tractor.Actor = tractor.current_actor()
    actor.ipc_server.cancel()
    await trio.sleep_forever()


# ?TODO, do an OSc-style signalling test on this?
# -[ ] doesn't work for fork backends
# @pytest.mark.parametrize('use_signal', [False, True])
#
# Wall-clock bound via `pytest-timeout` (`method='thread'`).
# Under `--spawn-backend=subint` this test can wedge in an
# un-Ctrl-C-able state (abandoned-subint + shared-GIL
# starvation → signal-wakeup-fd pipe fills → SIGINT silently
# dropped; see `ai/conc-anal/subint_sigint_starvation_issue.md`).
# `method='thread'` is specifically required because `signal`-
# method SIGALRM suffers the same GIL-starvation path and
# wouldn't fire the Python-level handler.
# At timeout the plugin hard-kills the pytest process — that's
# the intended behavior here; the alternative is an unattended
# suite run that never returns.
# @pytest.mark.timeout(
#     30,
#     # NOTE should be a 2.1s happy path.
#     # XXX for `main_thread_forkserver` this is SUPER SENSITIVE
#     # so keep it higher to avoid flaky runs..
#     method='thread',
# )
@pytest.mark.skipon_spawn_backend(
    'subint',
    # 'main_thread_forkserver',
    reason=(
        'XXX SUBINT HANGING TEST XXX\n'
        'See outstanding issue(s)\n'
        # TODO, put issue link!
    )
)
def test_stale_entry_is_deleted(
    debug_mode: bool,
    daemon: subprocess.Popen,
    start_method: str,
    reg_addr: tuple,
    # set_fork_aware_capture,
):
    '''
    Ensure that when a stale entry is detected in the registrar's
    table, the `find_actor()` API takes care of deleting it rather
    than delivering a bad portal.

    '''
    async def main():
        name: str = 'transport_fails_actor'
        _reg_ptl: tractor.Portal
        an: tractor.ActorNursery
        async with (
            tractor.open_nursery(
                debug_mode=debug_mode,
                registry_addrs=[reg_addr],
            ) as an,
            tractor.get_registry(reg_addr) as _reg_ptl,
        ):
            ptl: tractor.Portal = await an.start_actor(
                name,
                enable_modules=[__name__],
            )
            async with ptl.open_context(
                kill_transport,
            ) as (first, ctx):
                async with tractor.find_actor(
                    name,
                    registry_addrs=[reg_addr],
                ) as maybe_portal:
                    # because the transitive
                    # `._api.maybe_open_portal()` call should
                    # fail and implicitly call `.delete_addr()`
                    assert maybe_portal is None
                    registry: dict = await unpack_reg(_reg_ptl)
                    assert ptl.chan.aid.uid not in registry

                # should fail since we knocked out the IPC tpt XD
                await ptl.cancel_actor()
            await an.cancel()

    # XXX, for tracing if this starts being flaky again..
    #
    timeout: float = 4

    async def _timeout_main():
        with trio.move_on_after(timeout) as cs:
            await main()

        if (
            cs.cancel_called
            and
            debug_mode
        ):
            await tractor.pause()

    # TODO, remove once the `[subint]` variant no longer hangs.
    #
    # Status (as of Phase B hard-kill landing):
    #
    # - `[trio]`/`[mp_*]` variants: completes normally; `dump_on_hang`
    #   is a no-op safety net here.
    #
    # - `[subint]` variant: hangs indefinitely AND is un-Ctrl-C-able.
    #   `strace -p <pytest_pid>` while in the hang reveals a silently-
    #   dropped SIGINT — the C signal handler tries to write the
    #   signum byte to Python's signal-wakeup fd and gets `EAGAIN`,
    #   meaning the pipe is full (nobody's draining it).
    #
    # Root-cause chain: our hard-kill in `spawn._subint` abandoned
    # the driver OS-thread (which is `daemon=True`) after the soft-
    # kill timeout, but the *sub-interpreter* inside that thread is
    # still running `trio.run()` — `_interpreters.destroy()` can't
    # force-stop a running subint (raises `InterpreterError`), and
    # legacy-config subints share the main GIL. The abandoned subint
    # starves the parent's trio event loop from iterating often
    # enough to drain its wakeup pipe → SIGINT silently drops.
    #
    # This is structurally a CPython-level limitation: there's no
    # public force-destroy primitive for a running subint. We
    # escape on the harness side via a SIGINT-loop in the `daemon`
    # fixture teardown (killing the bg registrar subproc closes its
    # end of the IPC, which eventually unblocks a recv in main trio,
    # which lets the loop drain the wakeup pipe). Long-term fix path:
    # msgspec PEP 684 support (jcrist/msgspec#563) → isolated-mode
    # subints with per-interp GIL.
    #
    # Full analysis:
    # `ai/conc-anal/subint_sigint_starvation_issue.md`
    #
    # See also the *sibling* hang class documented in
    # `ai/conc-anal/subint_cancel_delivery_hang_issue.md` — same
    # subint backend, different root cause (Ctrl-C-able hang, main
    # trio loop iterating fine; ours to fix, not CPython's).
    # Reproduced by `tests/test_subint_cancellation.py
    # ::test_subint_non_checkpointing_child`.
    #
    # Kept here (and not behind a `pytestmark.skip`) so we can still
    # inspect the dump file if the hang ever returns after a refactor.
    # `pytest`'s stderr capture eats `faulthandler` output otherwise,
    # so we route `dump_on_hang` to a file.
    with dump_on_hang(
        seconds=timeout*2,
        path=f'/tmp/test_stale_entry_is_deleted_{start_method}.dump',
    ):
        trio.run(_timeout_main)

@@ -0,0 +1,345 @@
'''
`open_root_actor(tpt_bind_addrs=...)` test suite.

Verify all three runtime code paths for explicit IPC-server
bind-address selection in `_root.py`:

1. Non-registrar, no explicit bind -> random addrs from registry proto
2. Registrar, no explicit bind -> binds to registry_addrs
3. Explicit bind given -> wraps via `wrap_address()` and uses them

(see the sketch just below for the shape of case 3)

'''
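# Editor's sketch of path (3) above (addresses illustrative;
# `tpt_bind_addrs` also accepts multiaddr strings per
# `test_tpt_bind_addrs_as_maddr_str` below):
#
#   async with tractor.open_root_actor(
#       registry_addrs=[('127.0.0.1', 1616)],
#       tpt_bind_addrs=[('127.0.0.1', 0)],  # port 0 -> random port
#   ):
#       ...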
import pytest
import trio
import tractor
from tractor.discovery._addr import (
    wrap_address,
)
from tractor.discovery._multiaddr import mk_maddr
from tractor._testing.addr import get_rando_addr


# ------------------------------------------------------------------
# helpers
# ------------------------------------------------------------------
def _bound_bindspaces(
    actor: tractor.Actor,
) -> set[str]:
    '''
    Collect the set of bindspace strings from the actor's
    currently bound IPC-server accept addresses.

    '''
    return {
        wrap_address(a).bindspace
        for a in actor.accept_addrs
    }


def _bound_wrapped(
    actor: tractor.Actor,
) -> list:
    '''
    Return the actor's accept addrs as wrapped `Address` objects.

    '''
    return [
        wrap_address(a)
        for a in actor.accept_addrs
    ]


# ------------------------------------------------------------------
# 1) Registrar + explicit tpt_bind_addrs
# ------------------------------------------------------------------
@pytest.mark.parametrize(
    'addr_combo',
    [
        'bind-eq-reg',
        'bind-subset-reg',
        'bind-disjoint-reg',
    ],
    ids=lambda v: v,
)
def test_registrar_root_tpt_bind_addrs(
    reg_addr: tuple,
    tpt_proto: str,
    debug_mode: bool,
    addr_combo: str,
):
    '''
    Registrar root-actor with explicit `tpt_bind_addrs`:
    bound set must include all registry + all bind addr bindspaces
    (merge behavior).

    '''
    reg_wrapped = wrap_address(reg_addr)

    if addr_combo == 'bind-eq-reg':
        bind_addrs = [reg_addr]
        # extra secondary reg addr for subset test
        extra_reg = []

    elif addr_combo == 'bind-subset-reg':
        second_reg = get_rando_addr(tpt_proto)
        bind_addrs = [reg_addr]
        extra_reg = [second_reg]

    elif addr_combo == 'bind-disjoint-reg':
        # port=0 on same host -> completely different addr
        rando = wrap_address(reg_addr).get_random(
            bindspace=reg_wrapped.bindspace,
        )
        bind_addrs = [rando.unwrap()]
        extra_reg = []

    all_reg = [reg_addr] + extra_reg

    async def _main():
        async with tractor.open_root_actor(
            registry_addrs=all_reg,
            tpt_bind_addrs=bind_addrs,
            debug_mode=debug_mode,
        ):
            actor = tractor.current_actor()
            assert actor.is_registrar

            bound = actor.accept_addrs
            bound_bs = _bound_bindspaces(actor)

            # all registry bindspaces must appear in bound set
            for ra in all_reg:
                assert wrap_address(ra).bindspace in bound_bs

            # all bind-addr bindspaces must appear
            for ba in bind_addrs:
                assert wrap_address(ba).bindspace in bound_bs

            # registry addr must appear verbatim in bound
            # (after wrapping both sides for comparison)
            bound_w = _bound_wrapped(actor)
            assert reg_wrapped in bound_w

            if addr_combo == 'bind-disjoint-reg':
                assert len(bound) >= 2

    trio.run(_main)


# ------------------------------------------------------------------
# 2) Non-registrar + explicit tpt_bind_addrs
# ------------------------------------------------------------------
@pytest.mark.parametrize(
    'addr_combo',
    [
        'bind-same-bindspace',
        'bind-disjoint',
    ],
    ids=lambda v: v,
)
def test_non_registrar_root_tpt_bind_addrs(
    daemon,
    reg_addr: tuple,
    tpt_proto: str,
    debug_mode: bool,
    addr_combo: str,
):
    '''
    Non-registrar root with explicit `tpt_bind_addrs`:
    bound set must exactly match the requested bind addrs
    (no merge with registry).

    '''
    reg_wrapped = wrap_address(reg_addr)

    if addr_combo == 'bind-same-bindspace':
        # same bindspace as reg but port=0 so we get a random port
        rando = reg_wrapped.get_random(
            bindspace=reg_wrapped.bindspace,
        )
        bind_addrs = [rando.unwrap()]

    elif addr_combo == 'bind-disjoint':
        rando = reg_wrapped.get_random(
            bindspace=reg_wrapped.bindspace,
        )
        bind_addrs = [rando.unwrap()]

    async def _main():
        async with tractor.open_root_actor(
            registry_addrs=[reg_addr],
            tpt_bind_addrs=bind_addrs,
            debug_mode=debug_mode,
        ):
            actor = tractor.current_actor()
            assert not actor.is_registrar

            bound = actor.accept_addrs
            assert len(bound) == len(bind_addrs)

            # bindspaces must match
            bound_bs = _bound_bindspaces(actor)
            for ba in bind_addrs:
                assert wrap_address(ba).bindspace in bound_bs

            # TCP port=0 should resolve to a real port
            for uw_addr in bound:
                w = wrap_address(uw_addr)
                if w.proto_key == 'tcp':
                    _host, port = uw_addr
                    assert port > 0

    trio.run(_main)


# ------------------------------------------------------------------
# 3) Non-registrar, default random bind (baseline)
# ------------------------------------------------------------------
def test_non_registrar_default_random_bind(
    daemon,
    reg_addr: tuple,
    debug_mode: bool,
):
    '''
    Baseline: no `tpt_bind_addrs`, daemon running.
    Bound bindspace matches registry bindspace,
    but bound addr differs from reg_addr (random).

    '''
    reg_wrapped = wrap_address(reg_addr)

    async def _main():
        async with tractor.open_root_actor(
            registry_addrs=[reg_addr],
            debug_mode=debug_mode,
        ):
            actor = tractor.current_actor()
            assert not actor.is_registrar

            bound_bs = _bound_bindspaces(actor)
            assert reg_wrapped.bindspace in bound_bs

            # bound addr should differ from the registry addr
            # (the runtime picks a random port/path)
            bound_w = _bound_wrapped(actor)
            assert reg_wrapped not in bound_w

    trio.run(_main)


# ------------------------------------------------------------------
# 4) Multiaddr string input
# ------------------------------------------------------------------
def test_tpt_bind_addrs_as_maddr_str(
    reg_addr: tuple,
    debug_mode: bool,
):
    '''
    Pass multiaddr strings as `tpt_bind_addrs`.
    Runtime should parse and bind successfully.

    '''
    reg_wrapped = wrap_address(reg_addr)
    # build a port-0 / random maddr string for binding
    rando = reg_wrapped.get_random(
        bindspace=reg_wrapped.bindspace,
    )
    maddr_str: str = str(mk_maddr(rando))

    async def _main():
        async with tractor.open_root_actor(
            registry_addrs=[reg_addr],
            tpt_bind_addrs=[maddr_str],
            debug_mode=debug_mode,
        ):
            actor = tractor.current_actor()
            assert actor.is_registrar

            for uw_addr in actor.accept_addrs:
                w = wrap_address(uw_addr)
                if w.proto_key == 'tcp':
                    _host, port = uw_addr
                    assert port > 0

    trio.run(_main)


# ------------------------------------------------------------------
# 5) Registrar merge produces union of binds
# ------------------------------------------------------------------
def test_registrar_merge_binds_union(
    tpt_proto: str,
    debug_mode: bool,
):
    '''
    Registrar + disjoint bind addr: bound set must include
    both registry and explicit bind addresses.

    '''
    reg_addr = get_rando_addr(tpt_proto)
    reg_wrapped = wrap_address(reg_addr)

    rando = reg_wrapped.get_random(
        bindspace=reg_wrapped.bindspace,
    )
    bind_addrs = [rando.unwrap()]

    # NOTE: for UDS, `get_random()` produces the same
    # filename for the same pid+actor-state, so the
    # "disjoint" premise only holds when the addrs
    # actually differ (always true for TCP, may
    # collide for UDS).
    expect_disjoint: bool = (
        tuple(reg_addr) != rando.unwrap()
    )

    async def _main():
        async with tractor.open_root_actor(
            registry_addrs=[reg_addr],
            tpt_bind_addrs=bind_addrs,
            debug_mode=debug_mode,
        ):
            actor = tractor.current_actor()
            assert actor.is_registrar

            bound = actor.accept_addrs
            bound_w = _bound_wrapped(actor)

            if expect_disjoint:
                # must have at least 2 (registry + bind)
                assert len(bound) >= 2

            # registry addr must appear in bound set
            assert reg_wrapped in bound_w

    trio.run(_main)


# ------------------------------------------------------------------
# 6) open_nursery forwards tpt_bind_addrs
# ------------------------------------------------------------------
def test_open_nursery_forwards_tpt_bind_addrs(
    reg_addr: tuple,
    debug_mode: bool,
):
    '''
    `open_nursery(tpt_bind_addrs=...)` forwards through
    `**kwargs` to `open_root_actor()`.

    '''
    reg_wrapped = wrap_address(reg_addr)
    rando = reg_wrapped.get_random(
        bindspace=reg_wrapped.bindspace,
    )
    bind_addrs = [rando.unwrap()]

    async def _main():
        async with tractor.open_nursery(
            registry_addrs=[reg_addr],
            tpt_bind_addrs=bind_addrs,
            debug_mode=debug_mode,
        ):
            actor = tractor.current_actor()
            bound_bs = _bound_bindspaces(actor)

            for ba in bind_addrs:
                assert wrap_address(ba).bindspace in bound_bs

    trio.run(_main)

@@ -8,17 +8,16 @@ from pathlib import Path
 import pytest
 import trio
 import tractor
-from tractor import (
-    Actor,
-    _state,
-    _addr,
-)
+from tractor import Actor
+from tractor.runtime import _state
+from tractor.discovery import _addr


 @pytest.fixture
 def bindspace_dir_str() -> str:

-    rt_dir: Path = tractor._state.get_rt_dir()
+    from tractor.runtime._state import get_rt_dir
+    rt_dir: Path = get_rt_dir()
     bs_dir: Path = rt_dir / 'doggy'
     bs_dir_str: str = str(bs_dir)
     assert not bs_dir.is_dir()

@@ -13,9 +13,9 @@ from tractor import (
     Portal,
     ipc,
     msg,
-    _state,
-    _addr,
 )
+from tractor.runtime import _state
+from tractor.discovery import _addr

 @tractor.context
 async def chk_tpts(

@@ -59,9 +59,19 @@ async def chk_tpts(
 )
 def test_root_passes_tpt_to_sub(
     tpt_proto_key: str,
     tpt_proto: str,
     reg_addr: tuple,
     debug_mode: bool,
 ):
+    # `reg_addr` is sourced from the CLI `--tpt-proto={tpt_proto}`,
+    # so when the parametrized `tpt_proto_key` differs, the test
+    # asks the runtime to `enable_transports=[<other_proto>]` while
+    # pointing `registry_addrs` at a `reg_addr` of the wrong proto.
+    # The layer-2 guard in `open_root_actor` is expected to fail
+    # fast with `ValueError` on this mismatch (rather than the prior
+    # silent hang during the registrar handshake).
+    proto_mismatch: bool = (tpt_proto_key != tpt_proto)
+
     async def main():
         async with tractor.open_nursery(
             enable_transports=[tpt_proto_key],

@@ -92,4 +102,14 @@ def test_root_passes_tpt_to_sub(
             # shutdown sub-actor(s)
             await an.cancel()

-    trio.run(main)
+    if proto_mismatch:
+        # mismatched proto must raise `ValueError` from the
+        # `open_root_actor` runtime guard before any subactor spawn.
+        with pytest.raises(ValueError) as excinfo:
+            trio.run(main)
+        msg: str = str(excinfo.value)
+        assert 'enable_transports' in msg
+        assert 'registry_addrs' in msg
+        assert tpt_proto_key in msg or tpt_proto in msg
+    else:
+        trio.run(main)

@@ -0,0 +1,4 @@
'''
`tractor.msg.*` sub-sys test suite.

'''

@@ -0,0 +1,4 @@
'''
`tractor.msg.*` test sub-pkg conf.

'''

@@ -57,6 +57,7 @@ from tractor.msg._ops import (
     limit_plds,
 )

+
 def enc_nsp(obj: Any) -> Any:
     actor: Actor = tractor.current_actor(
         err_on_no_runtime=False,

@@ -617,6 +618,17 @@ def test_ext_types_over_ipc(
     debug_mode: bool,
     pld_spec: Union[Type],
     add_hooks: bool,
+
+    set_fork_aware_capture,
+    # ^^XXX? for forking spawners
+
+    # capfd: pytest.CaptureFixture,
+    # ^^NOTE, super interesting that if
+    # we disable this below then the tpt-layer
+    # suffers as an "unclean EOF"??
+    # ?TODO, determine why/how that mks sense when addressing,
+    # https://github.com/pytest-dev/pytest/issues/14444
+    #
 ):
     '''
     Ensure we can support extension types converted using

@@ -725,18 +737,26 @@ def test_ext_types_over_ipc(

         await p.cancel_actor()

+    async def fa_main():
+        with (
+            trio.fail_after(2),
+            # ?TODO, investigate? see NOTE above..
+            # capfd.disabled(),
+        ):
+            await main()
+
     if (
         NamespacePath in pld_types
         and
         add_hooks
     ):
-        trio.run(main)
+        trio.run(fa_main)

     else:
         with pytest.raises(
             expected_exception=tractor.RemoteActorError,
         ) as excinfo:
-            trio.run(main)
+            trio.run(fa_main)

         exc = excinfo.value
         # bc `.started(nsp: NamespacePath)` will raise

@@ -55,13 +55,38 @@ async def maybe_expect_raises(
     raises: BaseException|None = None,
     ensure_in_message: list[str]|None = None,
     post_mortem: bool = False,
-    timeout: int = 3,
+    # NOTE, `None` selects a backend-aware default below —
+    # see `_BACKEND_TIMEOUT_DEFAULTS` for rationale. Caller
+    # can override with an explicit value to opt out.
+    timeout: int|None = None,
 ) -> None:
     '''
     Async wrapper for ensuring errors propagate from the inner scope.

     '''
-    if tractor._state.debug_mode():
+    if timeout is None:
+        # Pick a backend-aware default. Fork-based backends
+        # (`main_thread_forkserver`) need much more headroom
+        # because actor spawn + IPC ctx-exit + msg-validation
+        # error path takes longer than under `trio` backend
+        # — especially under cross-pytest-stream contention
+        # (#451). `test_basic_payload_spec` empirically:
+        # - 3s flaked all-valid variant (`TooSlowError`)
+        # - 8s flaked `invalid-return` variant
+        #   (`Cancelled` surfaced instead of `MsgTypeError`
+        #   because `fail_after` fired mid-error-path)
+        # - 15s flaked under cross-stream contention
+        # 30s for fork-based gives plenty of headroom while
+        # still failing-loud on a genuine hang. Other
+        # backends keep the original 3s.
+        from tractor.spawn import _spawn as _spawn_mod
+        timeout = (
+            30
+            if _spawn_mod._spawn_method == 'main_thread_forkserver'
+            else 3
+        )
+
+    if tractor.debug_mode():
         timeout += 999

     with trio.fail_after(timeout):

@@ -259,6 +284,11 @@ def test_basic_payload_spec(
     return_value: str|None,
     started_value: int|PldMsg,
     pld_check_started_value: bool,
+
+    set_fork_aware_capture,
+    # ^XXX TODO? for forking spawners: seems to prevent hangs when
+    # --capture=sys is not set, but only for a while; then the
+    # problem accumulates?
 ):
     '''
     Validate the most basic `PldRx` msg-type-spec semantics around

@@ -0,0 +1,240 @@
'''
Unit tests for `tractor.msg.pretty_struct`
private-field filtering in `pformat()`.

'''
import pytest

from tractor.msg.pretty_struct import (
    Struct,
    pformat,
    iter_struct_ppfmt_lines,
)
from tractor.msg._codec import (
    MsgDec,
    mk_dec,
)


# ------ test struct definitions ------ #

class PublicOnly(Struct):
    '''
    All-public fields for baseline testing.

    '''
    name: str = 'alice'
    age: int = 30


class PrivateOnly(Struct):
    '''
    Only underscore-prefixed (private) fields.

    '''
    _secret: str = 'hidden'
    _internal: int = 99


class MixedFields(Struct):
    '''
    Mix of public and private fields.

    '''
    name: str = 'bob'
    _hidden: int = 42
    value: float = 3.14
    _meta: str = 'internal'


class Inner(
    Struct,
    frozen=True,
):
    '''
    Frozen inner struct with a private field,
    for nesting tests.

    '''
    x: int = 1
    _secret: str = 'nope'


class Outer(Struct):
    '''
    Outer struct nesting an `Inner`.

    '''
    label: str = 'outer'
    inner: Inner = Inner()


class EmptyStruct(Struct):
    '''
    Struct with zero fields.

    '''
    pass
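# Editor's illustration of the output shape being asserted on
# below; the exact indentation/layout is an assumption since the
# tests only check field presence/absence:
#
#   MixedFields(
#     name: str = 'bob',
#     value: float = 3.14,
#   )
#
# note `_hidden`/`_meta` are filtered out entirely.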
# ------ tests ------ #

@pytest.mark.parametrize(
    'struct_and_expected',
    [
        (
            PublicOnly(),
            {
                'shown': ['name', 'age'],
                'hidden': [],
            },
        ),
        (
            MixedFields(),
            {
                'shown': ['name', 'value'],
                'hidden': ['_hidden', '_meta'],
            },
        ),
        (
            PrivateOnly(),
            {
                'shown': [],
                'hidden': ['_secret', '_internal'],
            },
        ),
    ],
    ids=[
        'all-public',
        'mixed-pub-priv',
        'all-private',
    ],
)
def test_field_visibility_in_pformat(
    struct_and_expected: tuple[
        Struct,
        dict[str, list[str]],
    ],
):
    '''
    Verify `pformat()` shows public fields
    and hides `_`-prefixed private fields.

    '''
    (
        struct,
        expected,
    ) = struct_and_expected
    output: str = pformat(struct)

    for field_name in expected['shown']:
        assert field_name in output, (
            f'{field_name!r} should appear in:\n'
            f'{output}'
        )

    for field_name in expected['hidden']:
        assert field_name not in output, (
            f'{field_name!r} should NOT appear in:\n'
            f'{output}'
        )


def test_iter_ppfmt_lines_skips_private():
    '''
    Directly verify `iter_struct_ppfmt_lines()`
    never yields tuples with `_`-prefixed field
    names.

    '''
    struct = MixedFields()
    lines: list[tuple[str, str]] = list(
        iter_struct_ppfmt_lines(
            struct,
            field_indent=2,
        )
    )
    # should have lines for public fields only
    assert len(lines) == 2

    for _prefix, line_content in lines:
        field_name: str = (
            line_content.split(':')[0].strip()
        )
        assert not field_name.startswith('_'), (
            f'private field leaked: {field_name!r}'
        )


def test_nested_struct_filters_inner_private():
    '''
    Verify that a nested struct's private fields
    are also filtered out during recursion.

    '''
    outer = Outer()
    output: str = pformat(outer)

    # outer's public field
    assert 'label' in output

    # inner's public field (recursed into)
    assert 'x' in output

    # inner's private field must be hidden
    assert '_secret' not in output


def test_empty_struct_pformat():
    '''
    An empty struct should produce a valid
    `pformat()` result with no field lines.

    '''
    output: str = pformat(EmptyStruct())
    assert 'EmptyStruct(' in output
    assert output.rstrip().endswith(')')

    # no field lines => only struct header+footer
    lines: list[tuple[str, str]] = list(
        iter_struct_ppfmt_lines(
            EmptyStruct(),
            field_indent=2,
        )
    )
    assert lines == []


def test_real_msgdec_pformat_hides_private():
    '''
    Verify `pformat()` on a real `MsgDec`
    hides the `_dec` internal field.

    NOTE: `MsgDec.__repr__` is custom and does
    NOT call `pformat()`, so we call it directly.

    '''
    dec: MsgDec = mk_dec(spec=int)
    output: str = pformat(dec)

    # the private `_dec` field should be filtered
    assert '_dec' not in output

    # but the struct type name should be present
    assert 'MsgDec(' in output


def test_pformat_repr_integration():
    '''
    Verify that `Struct.__repr__()` (which calls
    `pformat()`) also hides private fields for
    custom structs that do NOT override `__repr__`.

    '''
    mixed = MixedFields()
    output: str = repr(mixed)

    assert 'name' in output
    assert 'value' in output
    assert '_hidden' not in output
    assert '_meta' not in output

@@ -0,0 +1,652 @@
'''
Integration exercises for the `tractor.spawn._main_thread_forkserver`
submodule at three tiers:

1. the low-level primitives
   (`fork_from_worker_thread()` from `_main_thread_forkserver`
   + `run_subint_in_worker_thread()` from
   `_subint_forkserver`) driven from inside a real
   `trio.run()` in the parent process,

2. the full `main_thread_forkserver_proc` spawn backend wired
   through tractor's normal actor-nursery + portal-RPC
   machinery — i.e. `open_root_actor` + `open_nursery` +
   `run_in_actor` against a subactor spawned via fork from a
   main-interp worker thread.

Background
----------
`ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md`
establishes that `os.fork()` from a non-main sub-interpreter
aborts the child at the CPython level. The sibling
`subint_fork_from_main_thread_smoketest.py` proves the escape
hatch: fork from a main-interp *worker thread* (one that has
never entered a subint) works, and the forked child can then
host its own `trio.run()` inside a fresh subint.

Those smoke-test scenarios are standalone — no trio runtime
in the *parent*. Tiers (1)+(2) here cover the primitives
driven from inside `trio.run()` in the parent, and tier (3)
(the `*_spawn_basic` test) drives the registered
`main_thread_forkserver` spawn backend end-to-end against
the tractor runtime.

Gating
------
- py3.14+ (via `concurrent.interpreters` presence)
- no `--spawn-backend` restriction — the backend-level test
  flips `tractor.spawn._spawn._spawn_method` programmatically
  (via `try_set_start_method('main_thread_forkserver')`) and
  restores it on teardown, so these tests are independent of
  the session-level CLI backend choice.

'''
from __future__ import annotations
from functools import partial
import os
from pathlib import Path
import platform
import select
import signal
import subprocess
import sys
import time

import pytest
import trio

import tractor
from tractor.devx import dump_on_hang


# Gate: subint forkserver primitives require py3.14+. Check
# the public stdlib wrapper's presence (added in 3.14) rather
# than `_interpreters` directly — see
# `tractor.spawn._subint` for why.
pytest.importorskip('concurrent.interpreters')

from tractor.spawn._main_thread_forkserver import (  # noqa: E402
    fork_from_worker_thread,
    wait_child,
)
from tractor.spawn._subint_forkserver import (  # noqa: E402
    run_subint_in_worker_thread,
)
from tractor.spawn import _spawn as _spawn_mod  # noqa: E402
from tractor.spawn._spawn import try_set_start_method  # noqa: E402


# ----------------------------------------------------------------
# child-side callables (passed via `child_target=` across fork)
# ----------------------------------------------------------------


_CHILD_TRIO_BOOTSTRAP: str = (
    'import trio\n'
    'async def _main():\n'
    '    await trio.sleep(0.05)\n'
    '    return 42\n'
    'result = trio.run(_main)\n'
    'assert result == 42, f"trio.run returned {result}"\n'
)


def _child_trio_in_subint() -> int:
    '''
    `child_target` for the trio-in-child scenario: drive a
    trivial `trio.run()` inside a fresh legacy-config subint
    on a worker thread.

    Returns an exit code suitable for `os._exit()`:
    - 0: subint-hosted `trio.run()` succeeded
    - 3: driver thread hang (timeout inside `run_subint_in_worker_thread`)
    - 4: subint bootstrap raised some other exception

    '''
    try:
        run_subint_in_worker_thread(
            _CHILD_TRIO_BOOTSTRAP,
            thread_name='child-subint-trio-thread',
        )
    except RuntimeError:
        # timeout / thread-never-returned
        return 3
    except BaseException:
        return 4
    return 0
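# Editor's sketch of the `child_target` protocol implied above
# (an inference from `_child_trio_in_subint`'s signature and the
# `wait_child(expect_exit_ok=True)` assertion below): any zero-arg
# callable returning an `int` exit code works, e.g. a trivial
# no-op child,
#
#   def _child_noop() -> int:
#       return 0  # clean exit -> parent's `wait_child()` ok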
# ----------------------------------------------------------------
|
||||
# parent-side harnesses (run inside `trio.run()`)
|
||||
# ----------------------------------------------------------------
|
||||
|
||||
|
||||
async def run_fork_in_non_trio_thread(
|
||||
deadline: float,
|
||||
*,
|
||||
child_target=None,
|
||||
) -> int:
|
||||
'''
|
||||
From inside a parent `trio.run()`, off-load the
|
||||
forkserver primitive to a main-interp worker thread via
|
||||
`trio.to_thread.run_sync()` and return the forked child's
|
||||
pid.
|
||||
|
||||
Then `wait_child()` on that pid (also off-loaded so we
|
||||
don't block trio's event loop on `waitpid()`) and assert
|
||||
the child exited cleanly.
|
||||
|
||||
'''
|
||||
with trio.fail_after(deadline):
|
||||
# NOTE: `fork_from_worker_thread` internally spawns its
|
||||
# own dedicated `threading.Thread` (not from trio's
|
||||
# cache) and joins it before returning — so we can
|
||||
# safely off-load via `to_thread.run_sync` without
|
||||
# worrying about the trio-thread-cache recycling the
|
||||
# runner. Pass `abandon_on_cancel=False` for the
|
||||
# same "bounded + clean" rationale we use in
|
||||
# `_subint.subint_proc`.
|
||||
pid: int = await trio.to_thread.run_sync(
|
||||
partial(
|
||||
fork_from_worker_thread,
|
||||
child_target,
|
||||
thread_name='test-subint-forkserver',
|
||||
),
|
||||
abandon_on_cancel=False,
|
||||
)
|
||||
assert pid > 0
|
||||
|
||||
ok, status_str = await trio.to_thread.run_sync(
|
||||
partial(
|
||||
wait_child,
|
||||
pid,
|
||||
expect_exit_ok=True,
|
||||
),
|
||||
abandon_on_cancel=False,
|
||||
)
|
||||
assert ok, (
|
||||
f'forked child did not exit cleanly: '
|
||||
f'{status_str}'
|
||||
)
|
||||
return pid
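

# Illustrative sketch (an assumption, NOT the actual
# `fork_from_worker_thread()` source): the dedicated-thread fork
# pattern the NOTE above describes — a fresh `threading.Thread`
# performs the `os.fork()`, hands the pid back, and is joined
# before returning to the caller. The real primitive additionally
# handles multithreaded-fork caveats not shown here.
def _example_fork_via_dedicated_thread() -> int:
    import threading

    result: dict = {}

    def _runner() -> None:
        pid: int = os.fork()
        if pid == 0:
            # child: do its work then hard-exit; never return
            # into the inherited parent interpreter state.
            os._exit(0)
        # parent side of the fork: report the child's pid.
        result['pid'] = pid

    t = threading.Thread(
        target=_runner,
        name='example-forkserver-thread',
    )
    t.start()
    t.join()  # bounded in the real primitive; unbounded here
    return result['pid']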


# ----------------------------------------------------------------
# tests
# ----------------------------------------------------------------


# Bounded wall-clock via `pytest-timeout` (`method='thread'`)
# for the usual GIL-hostage safety reason documented in the
# sibling `test_subint_cancellation.py` / the class-A
# `subint_sigint_starvation_issue.md`. Each test also has an
# inner `trio.fail_after()` so assertion failures fire fast
# under normal conditions.
# @pytest.mark.timeout(30, method='thread')
def test_fork_from_worker_thread_via_trio(
) -> None:
    '''
    Baseline: inside `trio.run()`, call
    `fork_from_worker_thread()` via `trio.to_thread.run_sync()`,
    get a child pid back, reap the child cleanly.

    No trio-in-child. If this regresses we know the parent-
    side trio↔worker-thread plumbing is broken independent
    of any child-side subint machinery.

    '''
    deadline: float = 10.0
    with dump_on_hang(
        seconds=deadline,
        path='/tmp/main_thread_forkserver_baseline.dump',
    ):
        pid: int = trio.run(
            partial(run_fork_in_non_trio_thread, deadline),
        )
    # parent-side sanity — we got a real pid back.
    assert isinstance(pid, int) and pid > 0
    # by now the child has been waited on; it shouldn't be
    # reap-able again.
    with pytest.raises((ChildProcessError, OSError)):
        os.waitpid(pid, os.WNOHANG)


@pytest.mark.timeout(30, method='thread')
def test_fork_and_run_trio_in_child() -> None:
    '''
    End-to-end: inside the parent's `trio.run()`, off-load
    `fork_from_worker_thread()` to a worker thread, have the
    forked child then create a fresh subint and run
    `trio.run()` inside it on yet another worker thread.

    This is the full "forkserver + trio-in-subint-in-child"
    pattern the proposed `main_thread_forkserver` spawn backend
    would rest on.

    '''
    deadline: float = 15.0
    with dump_on_hang(
        seconds=deadline,
        path='/tmp/main_thread_forkserver_trio_in_child.dump',
    ):
        pid: int = trio.run(
            partial(
                run_fork_in_non_trio_thread,
                deadline,
                child_target=_child_trio_in_subint,
            ),
        )
    assert isinstance(pid, int) and pid > 0


# ----------------------------------------------------------------
# tier-3 backend test: drive the registered `main_thread_forkserver`
# spawn backend end-to-end through tractor's actor-nursery +
# portal-RPC machinery.
# ----------------------------------------------------------------


async def _trivial_rpc() -> str:
    '''
    Minimal subactor-side RPC body: just return a sentinel
    string the parent can assert on.

    '''
    return 'hello from subint-forkserver child'


async def _happy_path_forkserver(
    reg_addr: tuple[str, int | str],
    deadline: float,
) -> None:
    '''
    Parent-side harness: stand up a root actor, open an actor
    nursery, spawn one subactor via the currently-selected
    spawn backend (which this test will have flipped to
    `main_thread_forkserver`), run a trivial RPC through its
    portal, assert the round-trip result.

    '''
    with trio.fail_after(deadline):
        async with (
            tractor.open_root_actor(
                registry_addrs=[reg_addr],
            ),
            tractor.open_nursery() as an,
        ):
            portal: tractor.Portal = await an.run_in_actor(
                _trivial_rpc,
                name='subint-forkserver-child',
            )
            result: str = await portal.wait_for_result()
            assert result == 'hello from subint-forkserver child'


@pytest.fixture
def forkserver_spawn_method():
    '''
    Flip `tractor.spawn._spawn._spawn_method` to
    `'main_thread_forkserver'` for the duration of a test,
    then restore whatever was in place before (usually the
    session-level CLI choice, typically `'trio'`).

    Without this, other tests in the same session would
    observe the global flip and start spawning via fork —
    which is almost certainly NOT what their assertions were
    written against.

    '''
    prev_method: str = _spawn_mod._spawn_method
    prev_ctx = _spawn_mod._ctx
    try_set_start_method('main_thread_forkserver')
    try:
        yield
    finally:
        _spawn_mod._spawn_method = prev_method
        _spawn_mod._ctx = prev_ctx


@pytest.mark.timeout(60, method='thread')
def test_main_thread_forkserver_spawn_basic(
    reg_addr: tuple[str, int | str],
    forkserver_spawn_method,
) -> None:
    '''
    Happy-path: spawn ONE subactor via the
    `main_thread_forkserver` backend (parent-side fork from a
    main-interp worker thread), do a trivial portal-RPC
    round-trip, tear the nursery down cleanly.

    If this passes, the "forkserver + tractor runtime" arch
    is proven end-to-end: the registered
    `main_thread_forkserver_proc` spawn target successfully
    forks a child, the child runs `_actor_child_main()` +
    completes IPC handshake + serves an RPC, and the parent
    reaps via `_ForkedProc.wait()` without regressing any of
    the normal nursery teardown invariants.

    '''
    deadline: float = 20.0
    with dump_on_hang(
        seconds=deadline,
        path='/tmp/main_thread_forkserver_spawn_basic.dump',
    ):
        trio.run(
            partial(
                _happy_path_forkserver,
                reg_addr,
                deadline,
            ),
        )


# ----------------------------------------------------------------
# tier-4 DRAFT: orphaned-subactor SIGINT survivability
#
# Motivating question: with `main_thread_forkserver`, the child's
# `trio.run()` lives on the fork-inherited worker thread which
# is NOT `threading.main_thread()` — so trio cannot install its
# `signal.set_wakeup_fd`-based SIGINT handler. If the parent
# goes away via `SIGKILL` (no IPC `Portal.cancel_actor()`
# possible), does SIGINT on the orphan child cleanly tear it
# down via CPython's default `KeyboardInterrupt` delivery, or
# does it hang?
#
# Working hypothesis (unverified pre-this-test): post-fork the
# child is effectively single-threaded (only the fork-worker
# tstate survived), so SIGINT → default handler → raises
# `KeyboardInterrupt` on the only thread — which happens to be
# the one driving trio's event loop — so trio observes it at
# the next checkpoint. If so, we're "fine" on this backend
# despite the missing trio SIGINT handler.
#
# Cross-backend generalization (decide after this passes):
# - applicable to any backend whose subactors are separate OS
#   processes: `trio`, `mp_spawn`, `mp_forkserver`,
#   `main_thread_forkserver`.
# - NOT applicable to plain `subint` (subactors are in-process
#   subinterpreters, no orphan child process to SIGINT).
# - move path: lift the harness script into
#   `tests/_orphan_harness.py`, parametrize on the session's
#   `_spawn_method`, add `skipif _spawn_method == 'subint'`.
# ----------------------------------------------------------------


_ORPHAN_HARNESS_SCRIPT: str = '''
import os
import sys
import trio
import tractor
from tractor.spawn._spawn import try_set_start_method

async def _sleep_forever() -> None:
    print(f"CHILD_PID={os.getpid()}", flush=True)
    await trio.sleep_forever()

async def _main(reg_addr):
    async with (
        tractor.open_root_actor(registry_addrs=[reg_addr]),
        tractor.open_nursery() as an,
    ):
        portal = await an.run_in_actor(
            _sleep_forever,
            name="orphan-test-child",
        )
        print(f"PARENT_READY={os.getpid()}", flush=True)
        await trio.sleep_forever()

if __name__ == "__main__":
    backend = sys.argv[1]
    host = sys.argv[2]
    port = int(sys.argv[3])
    try_set_start_method(backend)
    trio.run(_main, (host, port))
'''
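

# The harness above is driven as a plain subprocess, e.g. (argv
# values illustrative):
#
#   python _orphan_harness.py main_thread_forkserver 127.0.0.1 6180
#
# `test_orphaned_subactor_sigint_cleanup_DRAFT` below runs exactly
# this invocation via `subprocess.Popen`.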


def _read_marker(
    proc: subprocess.Popen,
    marker: str,
    timeout: float,
    _buf: dict,
) -> str:
    '''
    Block until `<marker>=<value>\\n` appears on `proc.stdout`
    and return `<value>`. Uses a per-proc byte buffer (`_buf`)
    to carry partial lines across calls.

    '''
    deadline: float = time.monotonic() + timeout
    remainder: bytes = _buf.get('remainder', b'')
    prefix: bytes = f'{marker}='.encode()
    while time.monotonic() < deadline:
        # drain any complete lines already buffered
        while b'\n' in remainder:
            line, remainder = remainder.split(b'\n', 1)
            if line.startswith(prefix):
                _buf['remainder'] = remainder
                return line[len(prefix):].decode().strip()
        ready, _, _ = select.select([proc.stdout], [], [], 0.2)
        if not ready:
            continue
        chunk: bytes = os.read(proc.stdout.fileno(), 4096)
        if not chunk:
            break
        remainder += chunk
        _buf['remainder'] = remainder
    raise TimeoutError(
        f'Never observed marker {marker!r} on harness stdout '
        f'within {timeout}s'
    )
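

# Usage sketch for `_read_marker()`'s `_buf` carry-over protocol:
# the SAME dict must be threaded through every call against one
# `proc` so a partial line buffered while scanning for the first
# marker isn't lost before the second (values illustrative):
#
#   buf: dict = {}
#   child = int(_read_marker(proc, 'CHILD_PID', 15.0, buf))
#   parent = int(_read_marker(proc, 'PARENT_READY', 15.0, buf))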


def _process_alive(pid: int) -> bool:
    '''Liveness probe for a pid we do NOT parent (post-orphan).'''
    try:
        os.kill(pid, 0)
        return True
    except ProcessLookupError:
        return False
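    # NOTE (caveat, an assumption about call-sites): `os.kill(pid, 0)`
    # can also raise `PermissionError` for a live pid owned by another
    # user; that case is ignored here since this harness only probes
    # pids it spawned under the same uid.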


# Known-gap test — `main_thread_forkserver` orphan-SIGINT
# handling. See
# `ai/conc-anal/subint_forkserver_orphan_sigint_hang_issue.md`.
# `strict=True` so if a future fix closes the gap the
# XPASS surfaces as a FAIL and forces us to drop the
# mark intentionally.
@pytest.mark.xfail(
    strict=True,
    reason=(
        'Orphan subactor SIGINT delivery: trio event loop '
        'on non-main thread post-fork doesn\'t see the '
        'external SIGINT → KBI path. See tracker doc.\n'
        'ai/conc-anal/subint_forkserver_orphan_sigint_hang_issue.md'
    ),
)
@pytest.mark.timeout(
    30,
    method='thread',
)
def test_orphaned_subactor_sigint_cleanup_DRAFT(
    reg_addr: tuple[str, int | str],
    tmp_path: Path,
) -> None:
    '''
    DRAFT — orphaned-subactor SIGINT survivability under the
    `main_thread_forkserver` backend.

    Sequence:
    1. Spawn a harness subprocess that brings up a root
       actor + one `sleep_forever` subactor via
       `main_thread_forkserver`.
    2. Read the harness's stdout for `PARENT_READY=<pid>`
       and `CHILD_PID=<pid>` markers (confirms the
       parent→child IPC handshake completed).
    3. `SIGKILL` the parent (no IPC cancel possible — the
       whole point of this test).
    4. `SIGINT` the orphan child.
    5. Poll `os.kill(child_pid, 0)` for up to ~6s — assert
       the child exits.

    Empirical result (2026-04, py3.14): currently **FAILS** —
    SIGINT on the orphan child doesn't unwind the trio loop,
    despite trio's `KIManager` handler being correctly
    installed in the subactor (the post-fork thread IS
    `threading.main_thread()` on py3.14). `faulthandler` dump
    shows the subactor wedged in `trio/_core/_io_epoll.py::
    get_events` — the signal's supposed wakeup of the event
    loop isn't firing. Full analysis + diagnostic evidence
    in `ai/conc-anal/
    subint_forkserver_orphan_sigint_hang_issue.md`.

    The runtime's *intentional* "KBI-as-OS-cancel" path at
    `tractor/spawn/_entry.py::_trio_main:164` is therefore
    unreachable under this backend+config. Closing the gap is
    aligned with existing design intent (make the already-
    designed behavior actually fire), not a new feature.
    Marked `xfail(strict=True)` so the XPASS surfaces as a
    FAIL once the gap is closed and we'll know to drop the
    mark.

    '''
    if platform.system() != 'Linux':
        pytest.skip(
            'orphan-reparenting semantics only exercised on Linux'
        )

    script_path = tmp_path / '_orphan_harness.py'
    script_path.write_text(_ORPHAN_HARNESS_SCRIPT)

    # Offset the port so we don't race the session reg_addr with
    # any concurrently-running backend test's listener.
    host: str = reg_addr[0]
    port: int = int(reg_addr[1]) + 17

    proc: subprocess.Popen = subprocess.Popen(
        [
            sys.executable,
            str(script_path),
            'main_thread_forkserver',
            host,
            str(port),
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    parent_pid: int | None = None
    child_pid: int | None = None
    buf: dict = {}
    try:
        child_pid = int(_read_marker(proc, 'CHILD_PID', 15.0, buf))
        parent_pid = int(_read_marker(proc, 'PARENT_READY', 15.0, buf))

        # sanity: both alive before we start killing stuff
        assert _process_alive(parent_pid), (
            f'harness parent pid={parent_pid} gone before '
            f'SIGKILL — test premise broken'
        )
        assert _process_alive(child_pid), (
            f'orphan-candidate child pid={child_pid} gone '
            f'before test started'
        )

        # step 3: kill parent — no IPC cancel arrives at child.
        # `proc.wait()` reaps the zombie so it truly disappears
        # from the process table (otherwise `os.kill(pid, 0)`
        # keeps reporting it as alive).
        os.kill(parent_pid, signal.SIGKILL)
        try:
            proc.wait(timeout=3.0)
        except subprocess.TimeoutExpired:
            pytest.fail(
                f'harness parent pid={parent_pid} did not die '
                f'after SIGKILL — test premise broken'
            )
        assert _process_alive(child_pid), (
            f'child pid={child_pid} died along with parent — '
            f'did the parent reap it before SIGKILL took? '
            f'test premise requires an orphan.'
        )

        # step 4+5: SIGINT the orphan, poll for exit.
        os.kill(child_pid, signal.SIGINT)
        timeout: float = 6.0
        cleanup_deadline: float = time.monotonic() + timeout
        while time.monotonic() < cleanup_deadline:
            if not _process_alive(child_pid):
                return  # <- success path
            time.sleep(0.1)

        pytest.fail(
            f'Orphan subactor (pid={child_pid}) did NOT exit '
            f'within {timeout}s of SIGINT under `main_thread_forkserver` '
            f'→ trio on non-main thread did not observe the '
            f'default CPython KeyboardInterrupt; backend needs '
            f'explicit SIGINT plumbing.'
        )
    finally:
        # best-effort cleanup to avoid leaking orphans across
        # the test session regardless of outcome.
        for pid in (parent_pid, child_pid):
            if pid is None:
                continue
            try:
                os.kill(pid, signal.SIGKILL)
            except ProcessLookupError:
                pass
        try:
            proc.kill()
        except OSError:
            pass
        try:
            proc.wait(timeout=2.0)
        except subprocess.TimeoutExpired:
            pass


# ----------------------------------------------------------------
# regression guard: variant-2 (`subint_forkserver`) placeholder
# MUST raise `NotImplementedError` today — guards against future
# commits accidentally re-aliasing the key to the variant-1
# coroutine (which was a transient state during the rename).
# ----------------------------------------------------------------
def test_subint_forkserver_key_errors_cleanly() -> None:
    '''
    `--spawn-backend=subint_forkserver` is reserved for the
    eventual variant-2 (subint-isolated child runtime)
    backend, gated on jcrist/msgspec#1026 unblocking PEP 684
    isolated-mode subints upstream.

    Until that lands, the dispatch entry MUST raise
    `NotImplementedError` immediately rather than silently
    aliasing to `main_thread_forkserver_proc`. Verify the
    error message also surfaces both the working-backend
    pointer and the upstream-blocker ref so an operator
    arriving at the error has somewhere to go.

    '''
    import asyncio
    from tractor.spawn._spawn import _methods

    proc = _methods['subint_forkserver']
    with pytest.raises(NotImplementedError) as ei:
        # signature args match `main_thread_forkserver_proc`'s
        # — the stub raises before touching them so dummy
        # values are fine.
        asyncio.run(
            proc(
                'x', None, None, {}, [],
                ('127.0.0.1', 0), {},
            )
        )

    msg: str = str(ei.value)
    assert 'main_thread_forkserver' in msg, (
        f'stub error msg should redirect to the working '
        f'variant-1 backend; got: {msg!r}'
    )
    assert 'msgspec#1026' in msg or '1026' in msg, (
        f'stub error msg should reference the upstream '
        f'blocker (jcrist/msgspec#1026); got: {msg!r}'
    )

@@ -0,0 +1,245 @@
'''
Cancellation + hard-kill semantics audit for the `subint` spawn
backend.

Exercises the escape-hatch machinery added to
`tractor.spawn._subint` (module-level `_HARD_KILL_TIMEOUT`,
bounded shields around the soft-kill / thread-join sites, daemon
driver-thread abandonment) so that future stdlib regressions or
our own refactors don't silently re-introduce the hangs first
diagnosed during the Phase B.2/B.3 bringup (issue #379).

Every test in this module:
- is wrapped in `trio.fail_after()` for a deterministic per-test
  wall-clock ceiling (the whole point of these tests is to fail
  fast when our escape hatches regress; an unbounded test would
  defeat itself),
- arms `tractor.devx.dump_on_hang()` to capture a stack dump on
  failure — without it, a hang here is opaque because pytest's
  stderr capture swallows `faulthandler` output by default
  (hard-won lesson from the original diagnosis),
- skips on py<3.14 (no `concurrent.interpreters`) and on any
  `--spawn-backend` other than `'subint'` (these tests are
  subint-specific by design — they'd be nonsense under `trio` or
  `mp_*`).

'''
from __future__ import annotations
from functools import partial

import pytest
import trio
import tractor
from tractor.devx import dump_on_hang


# Gate: the `subint` backend requires py3.14+. Check the
# public stdlib wrapper's presence (added in 3.14) rather than
# the private `_interpreters` module (which exists on 3.13 but
# wedges under tractor's usage — see `tractor.spawn._subint`).
pytest.importorskip('concurrent.interpreters')

# Subint-only: read the spawn method that `pytest_configure`
# committed via `try_set_start_method()`. By the time this module
# imports, the CLI backend choice has been applied.
from tractor.spawn._spawn import _spawn_method  # noqa: E402

if _spawn_method != 'subint':
    pytestmark = pytest.mark.skip(
        reason=(
            "subint-specific cancellation audit — "
            "pass `--spawn-backend=subint` to run."
        ),
    )


# ----------------------------------------------------------------
# child-side task bodies (run inside the spawned subint)
# ----------------------------------------------------------------


async def _trivial_rpc() -> str:
    '''
    Minimal RPC body for the baseline happy-teardown test.
    '''
    return 'hello from subint'


async def _spin_without_trio_checkpoints() -> None:
    '''
    Block the main task with NO trio-visible checkpoints so any
    `Portal.cancel_actor()` arriving over IPC has nothing to hand
    off to.

    `threading.Event.wait(timeout)` releases the GIL (so other
    threads — including trio's IO/RPC tasks — can progress) but
    does NOT insert a trio checkpoint, so the subactor's main
    task never notices cancellation.

    This is the exact "stuck subint" scenario the hard-kill
    shields exist to survive.
    '''
    import threading
    never_set = threading.Event()
    while not never_set.is_set():
        # 1s re-check granularity; low enough not to waste CPU,
        # high enough that even a pathologically slow
        # `_HARD_KILL_TIMEOUT` won't accidentally align with a
        # wake.
        never_set.wait(timeout=1.0)
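

# Contrast sketch (illustrative only): the trio-visible variant of
# the loop above. Swapping the `Event.wait()` for `trio.sleep()`
# inserts a checkpoint every iteration, so a remote
# `Portal.cancel_actor()` WOULD unwind this task gracefully and the
# hard-kill escape hatches would never need to fire.
async def _example_spin_with_checkpoints() -> None:
    while True:
        # `trio.sleep()` is a checkpoint: `trio.Cancelled` is
        # raised here as soon as the surrounding scope cancels.
        await trio.sleep(1.0)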


# ----------------------------------------------------------------
# parent-side harnesses (driven inside `trio.run(...)`)
# ----------------------------------------------------------------


async def _happy_path(
    reg_addr: tuple[str, int|str],
    deadline: float,
) -> None:
    with trio.fail_after(deadline):
        async with (
            tractor.open_root_actor(
                registry_addrs=[reg_addr],
            ),
            tractor.open_nursery() as an,
        ):
            portal: tractor.Portal = await an.run_in_actor(
                _trivial_rpc,
                name='subint-happy',
            )
            result: str = await portal.wait_for_result()
            assert result == 'hello from subint'


async def _spawn_stuck_then_cancel(
    reg_addr: tuple[str, int|str],
    deadline: float,
) -> None:
    with trio.fail_after(deadline):
        async with (
            tractor.open_root_actor(
                registry_addrs=[reg_addr],
            ),
            tractor.open_nursery() as an,
        ):
            await an.run_in_actor(
                _spin_without_trio_checkpoints,
                name='subint-stuck',
            )
            # Give the child time to reach its non-checkpointing
            # loop before we cancel; the precise value doesn't
            # matter as long as it's a handful of trio schedule
            # ticks.
            await trio.sleep(0.5)
            an.cancel_scope.cancel()


# ----------------------------------------------------------------
# tests
# ----------------------------------------------------------------


def test_subint_happy_teardown(
    reg_addr: tuple[str, int|str],
) -> None:
    '''
    Baseline: spawn a subactor, do one portal RPC, close nursery
    cleanly. No cancel, no faults.

    If this regresses we know something's wrong at the
    spawn/teardown layer unrelated to the hard-kill escape
    hatches.

    '''
    deadline: float = 10.0
    with dump_on_hang(
        seconds=deadline,
        path='/tmp/subint_cancellation_happy.dump',
    ):
        trio.run(partial(_happy_path, reg_addr, deadline))


@pytest.mark.skipon_spawn_backend(
    'subint',
    reason=(
        'XXX SUBINT HANGING TEST XXX\n'
        'See outstanding issue(s)\n'
        # TODO, put issue link!
    )
)
# Wall-clock bound via `pytest-timeout` (`method='thread'`)
# as defense-in-depth over the inner `trio.fail_after(15)`.
# Under the orphaned-channel hang class described in
# `ai/conc-anal/subint_cancel_delivery_hang_issue.md`, SIGINT
# is still deliverable and this test *should* be unwedgeable
# by the inner trio timeout — but sibling subint-backend
# tests in this repo have also exhibited the
# `subint_sigint_starvation_issue.md` GIL-starvation flavor,
# so `method='thread'` keeps us safe in case ordering or
# load shifts the failure mode.
# @pytest.mark.timeout(
#     3,  # NOTE never passes pre-3.14+ subints support.
#     method='thread',
# )
def test_subint_non_checkpointing_child(
    reg_addr: tuple[str, int|str],
) -> None:
    '''
    Cancel a subactor whose main task is stuck in a non-
    checkpointing Python loop.

    `Portal.cancel_actor()` may be delivered over IPC but the
    main task never checkpoints to observe the Cancelled —
    so the subint's `trio.run()` can't exit gracefully.

    The parent `subint_proc` bounded-shield + daemon-driver-
    thread combo should abandon the thread after
    `_HARD_KILL_TIMEOUT` and let the parent return cleanly.

    Wall-clock budget:
    - ~0.5s: settle time for child to enter the stuck loop
    - ~3s: `_HARD_KILL_TIMEOUT` (soft-kill wait)
    - ~3s: `_HARD_KILL_TIMEOUT` (thread-join wait)
    - margin

    KNOWN ISSUE (Ctrl-C-able hang):
    -------------------------------
    This test currently hangs past the hard-kill timeout for
    reasons unrelated to the subint teardown itself — after
    the subint is destroyed, a parent-side trio task appears
    to park on an orphaned IPC channel (no clean EOF
    delivered to a waiting receive). Unlike the
    SIGINT-starvation sibling case in
    `test_stale_entry_is_deleted`, this hang IS Ctrl-C-able
    (`strace` shows SIGINT wakeup-fd `write() = 1`, not
    `EAGAIN`) — i.e. the main trio loop is still iterating
    normally. That makes this *our* bug to fix, not a
    CPython-level limitation.

    See `ai/conc-anal/subint_cancel_delivery_hang_issue.md`
    for the full analysis + candidate fix directions
    (explicit parent-side channel abort in `subint_proc`
    teardown being the most likely surgical fix).

    The sibling `ai/conc-anal/subint_sigint_starvation_issue.md`
    documents the *other* hang class (abandoned-legacy-subint
    thread + shared-GIL starvation → signal-wakeup-fd pipe
    fills → SIGINT silently dropped) — that one is
    structurally blocked on msgspec PEP 684 adoption and is
    NOT what this test is hitting.

    '''
    deadline: float = 15.0
    with dump_on_hang(
        seconds=deadline,
        path='/tmp/subint_cancellation_stuck.dump',
    ):
        trio.run(
            partial(
                _spawn_stuck_then_cancel,
                reg_addr,
                deadline,
            ),
        )
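

# Illustrative sketch (an assumption — the real logic lives in
# `tractor.spawn._subint.subint_proc`): the bounded-shield
# escape-hatch shape these tests audit. The shield keeps outer
# cancellation from interrupting teardown, while the scope's own
# deadline caps the wait at the hard-kill budget; on timeout the
# (daemon) driver thread is simply abandoned.
async def _example_bounded_shield_join(
    join_thread,  # sync callable blocking until the driver thread exits
    hard_kill_timeout: float = 3.0,
) -> None:
    with trio.move_on_after(hard_kill_timeout, shield=True):
        await trio.to_thread.run_sync(
            join_thread,
            # on (deadline) cancel, drop the waiting worker rather
            # than blocking teardown on the wedged join.
            abandon_on_cancel=True,
        )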

@@ -1,7 +1,12 @@
"""
|
||||
Bidirectional streaming.
|
||||
'''
|
||||
Audit the simplest inter-actor bidirectional (streaming)
|
||||
msg patterns.
|
||||
|
||||
"""
|
||||
'''
|
||||
from __future__ import annotations
|
||||
from typing import (
|
||||
Callable,
|
||||
)
|
||||
import pytest
|
||||
import trio
|
||||
import tractor
|
||||
|
|
@ -9,10 +14,8 @@ import tractor
|
|||
|
||||
@tractor.context
|
||||
async def simple_rpc(
|
||||
|
||||
ctx: tractor.Context,
|
||||
data: int,
|
||||
|
||||
) -> None:
|
||||
'''
|
||||
Test a small ping-pong server.
|
||||
|
|
@ -39,15 +42,13 @@ async def simple_rpc(
|
|||
|
||||
@tractor.context
|
||||
async def simple_rpc_with_forloop(
|
||||
|
||||
ctx: tractor.Context,
|
||||
data: int,
|
||||
|
||||
) -> None:
|
||||
"""Same as previous test but using ``async for`` syntax/api.
|
||||
|
||||
"""
|
||||
'''
|
||||
Same as previous test but using `async for` syntax/api.
|
||||
|
||||
'''
|
||||
# signal to parent that we're up
|
||||
await ctx.started(data + 1)
|
||||
|
||||
|
|
@ -68,62 +69,78 @@ async def simple_rpc_with_forloop(
|
|||
|
||||
@pytest.mark.parametrize(
|
||||
'use_async_for',
|
||||
[True, False],
|
||||
[
|
||||
True,
|
||||
False,
|
||||
],
|
||||
ids='use_async_for={}'.format,
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
'server_func',
|
||||
[simple_rpc, simple_rpc_with_forloop],
|
||||
[
|
||||
simple_rpc,
|
||||
simple_rpc_with_forloop,
|
||||
],
|
||||
ids='server_func={}'.format,
|
||||
)
|
||||
def test_simple_rpc(server_func, use_async_for):
|
||||
def test_simple_rpc(
|
||||
server_func: Callable,
|
||||
use_async_for: bool,
|
||||
loglevel: str,
|
||||
debug_mode: bool,
|
||||
):
|
||||
'''
|
||||
The simplest request response pattern.
|
||||
|
||||
'''
|
||||
async def main():
|
||||
async with tractor.open_nursery() as n:
|
||||
with trio.fail_after(6):
|
||||
async with tractor.open_nursery(
|
||||
loglevel=loglevel,
|
||||
debug_mode=debug_mode,
|
||||
) as an:
|
||||
portal: tractor.Portal = await an.start_actor(
|
||||
'rpc_server',
|
||||
enable_modules=[__name__],
|
||||
)
|
||||
|
||||
portal = await n.start_actor(
|
||||
'rpc_server',
|
||||
enable_modules=[__name__],
|
||||
)
|
||||
async with portal.open_context(
|
||||
server_func, # taken from pytest parameterization
|
||||
data=10,
|
||||
) as (ctx, sent):
|
||||
|
||||
async with portal.open_context(
|
||||
server_func, # taken from pytest parameterization
|
||||
data=10,
|
||||
) as (ctx, sent):
|
||||
assert sent == 11
|
||||
|
||||
assert sent == 11
|
||||
async with ctx.open_stream() as stream:
|
||||
|
||||
async with ctx.open_stream() as stream:
|
||||
if use_async_for:
|
||||
|
||||
if use_async_for:
|
||||
|
||||
count = 0
|
||||
# receive msgs using async for style
|
||||
print('ping')
|
||||
await stream.send('ping')
|
||||
|
||||
async for msg in stream:
|
||||
assert msg == 'pong'
|
||||
count = 0
|
||||
# receive msgs using async for style
|
||||
print('ping')
|
||||
await stream.send('ping')
|
||||
count += 1
|
||||
|
||||
if count >= 9:
|
||||
break
|
||||
async for msg in stream:
|
||||
assert msg == 'pong'
|
||||
print('ping')
|
||||
await stream.send('ping')
|
||||
count += 1
|
||||
|
||||
else:
|
||||
# classic send/receive style
|
||||
for _ in range(10):
|
||||
if count >= 9:
|
||||
break
|
||||
|
||||
print('ping')
|
||||
await stream.send('ping')
|
||||
assert await stream.receive() == 'pong'
|
||||
else:
|
||||
# classic send/receive style
|
||||
for _ in range(10):
|
||||
|
||||
# stream should terminate here
|
||||
print('ping')
|
||||
await stream.send('ping')
|
||||
assert await stream.receive() == 'pong'
|
||||
|
||||
# final context result(s) should be consumed here in __aexit__()
|
||||
# stream should terminate here
|
||||
|
||||
await portal.cancel_actor()
|
||||
# final context result(s) should be consumed here in __aexit__()
|
||||
|
||||
await portal.cancel_actor()
|
||||
|
||||
trio.run(main)
|
||||
|
|
|
|||
|
|
@@ -98,7 +98,8 @@ def test_ipc_channel_break_during_stream(
         expect_final_exc = TransportClosed

     mod: ModuleType = import_path(
-        examples_dir() / 'advanced_faults'
+        examples_dir()
+        / 'advanced_faults'
         / 'ipc_failure_during_stream.py',
         root=examples_dir(),
         consider_namespace_packages=False,

@@ -113,8 +114,9 @@ def test_ipc_channel_break_during_stream(
     if (
         # only expect EoC if trans is broken on the child side,
         ipc_break['break_child_ipc_after'] is not False
+        and
         # AND we tell the child to call `MsgStream.aclose()`.
-        and pre_aclose_msgstream
+        pre_aclose_msgstream
     ):
         # expect_final_exc = trio.EndOfChannel
         # ^XXX NOPE! XXX^ since now `.open_stream()` absorbs this

@@ -144,9 +146,6 @@ def test_ipc_channel_break_during_stream(
         # a user sending ctl-c by raising a KBI.
         if pre_aclose_msgstream:
             expect_final_exc = KeyboardInterrupt
-            if tpt_proto == 'uds':
-                expect_final_exc = TransportClosed
-                expect_final_cause = trio.BrokenResourceError

     # XXX OLD XXX
     # if child calls `MsgStream.aclose()` then expect EoC.

@@ -160,16 +159,13 @@ def test_ipc_channel_break_during_stream(
         ipc_break['break_child_ipc_after'] is not False
         and (
             ipc_break['break_parent_ipc_after']
-            > ipc_break['break_child_ipc_after']
+            >
+            ipc_break['break_child_ipc_after']
         )
     ):
         if pre_aclose_msgstream:
             expect_final_exc = KeyboardInterrupt

         if tpt_proto == 'uds':
             expect_final_exc = TransportClosed
             expect_final_cause = trio.BrokenResourceError

     # NOTE when the parent IPC side dies (even if the child does as well
     # but the child fails BEFORE the parent) we always expect the
     # IPC layer to raise a closed-resource, NEVER do we expect

@@ -248,8 +244,15 @@ def test_ipc_channel_break_during_stream(
     # get raw instance from pytest wrapper
     value = excinfo.value
     if isinstance(value, ExceptionGroup):
-        excs = value.exceptions
-        assert len(excs) == 1
+        excs: tuple[Exception] = value.exceptions
+        assert (
+            len(excs) <= 2
+            and
+            all(
+                isinstance(exc, TransportClosed)
+                for exc in excs
+            )
+        )
         final_exc = excs[0]
         assert isinstance(final_exc, expect_final_exc)
@@ -5,6 +5,7 @@ Advanced streaming patterns using bidirectional streams and contexts.
 from collections import Counter
 import itertools
 import platform
+from typing import Type

 import pytest
 import trio
@@ -76,9 +77,7 @@ async def subscribe(

 async def consumer(
-
     subs: list[str],
-
 ) -> None:

     uid = tractor.current_actor().uid
@@ -108,59 +107,196 @@ async def consumer(
     print(f'{uid} got: {value}')


-def test_dynamic_pub_sub():
+# NOTE: deliberately NOT using `@pytest.mark.timeout(...)` —
+# both pytest-timeout enforcement modes break trio under
+# fork-based backends:
+#
+# - `method='signal'` (SIGALRM): the handler synchronously
+#   raises `Failed` in trio's main thread mid-`epoll.poll()`,
+#   leaves `GLOBAL_RUN_CONTEXT` half-installed ("Trio guest
+#   run got abandoned"), and EVERY subsequent `trio.run()`
+#   in the same pytest process bails with
+#   `RuntimeError: Attempted to call run() from inside a
+#   run()` — session-wide poison.
+#
+# - `method='thread'`: calls `_thread.interrupt_main()`
+#   raising `KeyboardInterrupt` into the main thread. Under
+#   fork-based backends with mid-cascade fd-juggling the KBI
+#   can escape trio's `KIManager` and bubble out of pytest
+#   itself — kills the WHOLE session.
+#
+# Instead we use `trio.fail_after()` INSIDE `main()` below:
+# trio's own `Cancelled`/`TooSlowError` machinery handles the
+# timeout, cleanly unwinds the actor nursery's cancel
+# cascade, and only fails the single test (no cross-test
+# state corruption either way).
+#
+# `pyproject.toml`'s default `timeout = 200` is still a
+# last-resort safety net.
+@pytest.mark.parametrize(
+    'expect_cancel_exc', [
+        KeyboardInterrupt,
+        trio.TooSlowError,
+    ],
+    ids=lambda item:
+        f'expect_user_exc_raised={item.__name__}'
+)
+def test_dynamic_pub_sub(
+    reg_addr: tuple,
+    debug_mode: bool,
+    test_log: tractor.log.StackLevelAdapter,
+    reap_subactors_per_test: int,
+    expect_cancel_exc: Type[BaseException],
+
+    is_forking_spawner: bool,
+    set_fork_aware_capture,
+):
+    failed_to_raise_report: str = (
+        f'Never got a {expect_cancel_exc!r} ??'
+    )
+
     global _registry

     from multiprocessing import cpu_count
     cpus = cpu_count()

+    # Hard safety cap via trio's own cancellation. NOTE see the
+    # module-level note on why we avoid `pytest-timeout` for this
+    # test. Picked backend-aware: under the `trio` backend spawn is
+    # cheap (~1s for `cpus` actors) but fork-based backends pay
+    # a per-spawn cost (forkserver round-trip + IPC peer-handshake)
+    # that can stack up over `cpus - 1` sequential `n.run_in_actor()`
+    # calls — especially on UDS under cross-pytest contention
+    # (#451 / #452). Empirically a flat 15s flakes on
+    # `main_thread_forkserver` for many-cpu hosts (a single bad
+    # spawn-stack puts total run-time at ~15.5s, just over);
+    # 30s gives plenty of headroom while still failing-loud on
+    # a real hang.
+    #
+    # XXX caveat: this is an *inner* `trio.fail_after` — its
+    # `Cancelled` cannot reach a task parked in a shielded `await`
+    # (e.g. inside actor-nursery teardown). When the in-band cancel
+    # path is itself buggy (the bug-class-3 `raise KBI` swallow we're
+    # currently chasing) this guard does NOT fire and the test sits
+    # forever until external SIGINT. The `_DIAG_CAP_S` outer guard
+    # below is the AFK-safety counterpart.
+    fail_after_s: int = (
+        4
+        if is_forking_spawner
+        else 12
+    )
+
+    # outer guard: when the inner fail_after fails to fire because of
+    # a shielded-await deadlock, this cap *aborts the trio run via
+    # signal.alarm → KBI* so AFK runs don't sit for >20min on the
+    # bug-class-3 hang. Slightly larger than `fail_after_s` so the
+    # trio-native path always wins when it works.
+    _DIAG_CAP_S: int = fail_after_s + 5
+
     async def main():
-        async with tractor.open_nursery() as n:
-
-            # name of this actor will be same as target func
-            await n.run_in_actor(publisher)
-
-            for i, sub in zip(
-                range(cpus - 2),
-                itertools.cycle(_registry.keys())
-            ):
-                await n.run_in_actor(
-                    consumer,
-                    name=f'consumer_{sub}',
-                    subs=[sub],
-                )
-
-            # make one dynamic subscriber
-            await n.run_in_actor(
-                consumer,
-                name='consumer_dynamic',
-                subs=list(_registry.keys()),
-            )
-
-            # block until cancelled by user
-            with trio.fail_after(3):
-                await trio.sleep_forever()
-
+        # bug-class-3 breadcrumb: tag each level of the cancel path
+        # so when the run hangs and we capture cancel-level logs, the
+        # *last* breadcrumb that fired names the swallow point.
+        test_log.cancel('test_dynamic_pub_sub: enter main()')
+        try:
+            with trio.fail_after(fail_after_s):
+                test_log.cancel(
+                    f'test_dynamic_pub_sub: '
+                    f'enter `trio.fail_after({fail_after_s})` scope'
+                )
+                try:
+                    async with tractor.open_nursery(
+                        registry_addrs=[reg_addr],
+                        debug_mode=debug_mode,
+                    ) as n:
+                        test_log.cancel(
+                            'test_dynamic_pub_sub: '
+                            'actor nursery opened'
+                        )
+
+                        # name of this actor will be same as target func
+                        await n.run_in_actor(publisher)
+
+                        for i, sub in zip(
+                            range(cpus - 2),
+                            itertools.cycle(_registry.keys())
+                        ):
+                            await n.run_in_actor(
+                                consumer,
+                                name=f'consumer_{sub}',
+                                subs=[sub],
+                            )
+
+                        # make one dynamic subscriber
+                        await n.run_in_actor(
+                            consumer,
+                            name='consumer_dynamic',
+                            subs=list(_registry.keys()),
+                        )
+
+                        # block until "cancelled by user"
+                        await trio.sleep(3)
+                        test_log.warning(
+                            f'Raising user cancel exc: '
+                            f'{expect_cancel_exc!r}'
+                        )
+                        test_log.cancel(
+                            f'test_dynamic_pub_sub: '
+                            f'ABOUT TO RAISE {expect_cancel_exc!r}'
+                        )
+                        raise expect_cancel_exc('simulate user cancel!')
+                finally:
+                    test_log.cancel(
+                        'test_dynamic_pub_sub: '
+                        'actor nursery `__aexit__` returned'
+                    )
+                test_log.cancel(
+                    'test_dynamic_pub_sub: `fail_after` scope exited'
+                )
+        finally:
+            test_log.cancel(
+                'test_dynamic_pub_sub: leaving `main()`'
+            )

+    # outer signal-based guard — survives a shielded-await deadlock
+    # since `signal.alarm` raises in the main thread regardless of
+    # trio's scope state. ONLY armed under fork-based backends since
+    # the bug we're chasing is MTF-specific.
+    import signal
+    armed_alarm: bool = bool(is_forking_spawner)
+    if armed_alarm:
+        signal.alarm(_DIAG_CAP_S)
     try:
         trio.run(main)
-    except (
-        trio.TooSlowError,
-        ExceptionGroup,
-    ) as err:
-        if isinstance(err, ExceptionGroup):
-            for suberr in err.exceptions:
-                if isinstance(suberr, trio.TooSlowError):
-                    break
-            else:
-                pytest.fail('Never got a `TooSlowError` ?')
+        pytest.fail(failed_to_raise_report)
+    except expect_cancel_exc:
+        # parent-side raised the user-cancel exc directly and
+        # it propagated unwrapped; clean path.
+        test_log.exception('Got user-cancel exc AS EXPECTED')
+    except BaseExceptionGroup as err:
+        # under fork-based backends the user-raised cancel
+        # can race with subactor-side stream teardown
+        # (`trio.EndOfChannel` from a publisher's `send()`
+        # whose remote half got cut). The expected exc may
+        # then be nested deeper in the group rather than at
+        # the top level. `BaseExceptionGroup.split()` walks
+        # the exc tree recursively (Python 3.11+).
+        matched, _ = err.split(expect_cancel_exc)
+        if matched is None:
+            pytest.fail(failed_to_raise_report)
+
+        test_log.exception('Got user-cancel exc AS EXPECTED')
+    finally:
+        # always disarm so a passing test doesn't get killed
+        # post-trio.run by a stale alarm.
+        if armed_alarm:
+            signal.alarm(0)


 @tractor.context
 async def one_task_streams_and_one_handles_reqresp(
-
     ctx: tractor.Context,
-
 ) -> None:
-
     await ctx.started()
@@ -257,7 +393,8 @@ async def echo_ctx_stream(

 def test_sigint_both_stream_types():
-    '''Verify that running a bi-directional and recv only stream
+    '''
+    Verify that running a bi-directional and recv only stream
     side-by-side will cancel correctly from SIGINT.

     '''
@@ -287,9 +424,11 @@ def test_sigint_both_stream_types():
             assert resp == msg
             raise KeyboardInterrupt

+    # TODO, use pytest.raises() here instead?
+    # (why weren't we originally?)
     try:
         trio.run(main)
-        assert 0, "Didn't receive KBI!?"
+        pytest.fail("Didn't receive KBI!?")
     except KeyboardInterrupt:
         pass
@@ -356,7 +495,12 @@ async def inf_streamer(
     print('streamer exited .open_streamer() block')


+# @pytest.mark.timeout(
+#     6,
+#     method='signal',
+# )
 def test_local_task_fanout_from_stream(
+    reg_addr: tuple,
+    debug_mode: bool,
+):
     '''
@@ -421,4 +565,9 @@ def test_local_task_fanout_from_stream(

             await p.cancel_actor()

-    trio.run(main)
+    async def w_timeout():
+        with trio.fail_after(6):
+            await main()
+
+    # trio.run(main)
+    trio.run(w_timeout)
@@ -17,8 +17,18 @@ from tractor._testing import (
 from .conftest import no_windows


-def is_win():
-    return platform.system() == 'Windows'
+_non_linux: bool = platform.system() != 'Linux'
+_friggin_windows: bool = platform.system() == 'Windows'
+
+
+pytestmark = pytest.mark.skipon_spawn_backend(
+    'subint',
+    reason=(
+        'XXX SUBINT HANGING TEST XXX\n'
+        'See outstanding issue(s)\n'
+        # TODO, put issue link!
+    )
+)


 async def assert_err(delay=0):
@@ -110,8 +120,17 @@ def test_remote_error(reg_addr, args_err):
     assert exc.boxed_type == errtype


+# @pytest.mark.skipon_spawn_backend(
+#     'subint',
+#     reason=(
+#         'XXX SUBINT HANGING TEST XXX\n'
+#         'See outstanding issue(s)\n'
+#         # TODO, put issue link!
+#     )
+# )
 def test_multierror(
     reg_addr: tuple[str, int],
     start_method: str,
 ):
     '''
     Verify we raise a ``BaseExceptionGroup`` out of a nursery where
@@ -141,15 +160,28 @@ def test_multierror(
     trio.run(main)


-@pytest.mark.parametrize('delay', (0, 0.5))
 @pytest.mark.parametrize(
-    'num_subactors', range(25, 26),
+    'delay',
+    (0, 0.5),
+    ids='delays={}'.format,
+)
+@pytest.mark.parametrize(
+    'num_subactors',
+    range(25, 26),
+    ids='num_subs={}'.format,
 )
-def test_multierror_fast_nursery(reg_addr, start_method, num_subactors, delay):
-    """Verify we raise a ``BaseExceptionGroup`` out of a nursery where
+def test_multierror_fast_nursery(
+    reg_addr: tuple,
+    start_method: str,
+    num_subactors: int,
+    delay: float,
+):
+    '''
+    Verify we raise a ``BaseExceptionGroup`` out of a nursery where
     more than one actor errors and also with a delay before failure
     to test failure during an ongoing spawning.
-    """
+
+    '''
     async def main():
         async with tractor.open_nursery(
             registry_addrs=[reg_addr],
@@ -189,8 +221,15 @@ async def do_nothing():
     pass


-@pytest.mark.parametrize('mechanism', ['nursery_cancel', KeyboardInterrupt])
-def test_cancel_single_subactor(reg_addr, mechanism):
+@pytest.mark.parametrize(
+    'mechanism', [
+        'nursery_cancel',
+        KeyboardInterrupt,
+    ])
+def test_cancel_single_subactor(
+    reg_addr: tuple,
+    mechanism: str|KeyboardInterrupt,
+):
     '''
     Ensure an ``ActorNursery.start_actor()`` spawned subactor
     cancels when the nursery is cancelled.
@@ -232,9 +271,13 @@ async def stream_forever():
         await trio.sleep(0.01)


-@tractor_test
-async def test_cancel_infinite_streamer(start_method):
-
+@tractor_test(
+    timeout=6,
+)
+async def test_cancel_infinite_streamer(
+    reg_addr: tuple,
+    start_method: str,
+):
     # stream for at most 1 second
     with (
         trio.fail_after(4),
@@ -257,6 +300,14 @@ async def test_cancel_infinite_streamer(start_method):
         assert n.cancelled


+# @pytest.mark.skipon_spawn_backend(
+#     'subint',
+#     reason=(
+#         'XXX SUBINT HANGING TEST XXX\n'
+#         'See outstanding issue(s)\n'
+#         # TODO, put issue link!
+#     )
+# )
 @pytest.mark.parametrize(
     'num_actors_and_errs',
     [
@@ -286,9 +337,12 @@ async def test_cancel_infinite_streamer(start_method):
         'no_daemon_actors_fail_all_run_in_actors_sleep_then_fail',
     ],
 )
-@tractor_test
+@tractor_test(
+    timeout=10,
+)
 async def test_some_cancels_all(
     num_actors_and_errs: tuple,
     reg_addr: tuple,
     start_method: str,
     loglevel: str,
 ):
@@ -370,7 +424,10 @@ async def test_some_cancels_all(
     pytest.fail("Should have gotten a remote assertion error?")


-async def spawn_and_error(breadth, depth) -> None:
+async def spawn_and_error(
+    breadth: int,
+    depth: int,
+) -> None:
     name = tractor.current_actor().name
     async with tractor.open_nursery() as nursery:
         for i in range(breadth):
@@ -395,8 +452,22 @@ async def spawn_and_error(breadth, depth) -> None:
             await nursery.run_in_actor(*args, **kwargs)


+# NOTE: `main_thread_forkserver` capture-fd hang class is no
+# longer skipped here — `--capture=sys` (the new `pyproject.toml`
+# default) sidesteps the pipe-buffer-fill deadlock for
+# `test_nested_multierrors`. See
+# `ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md`
+# / #449 for the post-mortem.
+# @pytest.mark.timeout(
+#     10,
+#     method='thread',
+# )
 @tractor_test
-async def test_nested_multierrors(loglevel, start_method):
+async def test_nested_multierrors(
+    reg_addr: tuple,
+    loglevel: str,
+    start_method: str,
+):
     '''
     Test that failed actor sets are wrapped in `BaseExceptionGroup`s. This
     test goes only 2 nurseries deep but we should eventually have tests
@@ -431,7 +502,7 @@ async def test_nested_multierrors(loglevel, start_method):
         for subexc in err.exceptions:

             # verify first level actor errors are wrapped as remote
-            if is_win():
+            if _friggin_windows:

                 # windows is often too slow and cancellation seems
                 # to happen before an actor is spawned
@@ -464,7 +535,7 @@ async def test_nested_multierrors(loglevel, start_method):
                 # XXX not sure what's up with this..
                 # on windows sometimes spawning is just too slow and
                 # we get back the (sent) cancel signal instead
-                if is_win():
+                if _friggin_windows:
                     if isinstance(subexc, tractor.RemoteActorError):
                         assert subexc.boxed_type in (
                             BaseExceptionGroup,
@@ -483,20 +554,24 @@ async def test_nested_multierrors(loglevel, start_method):

 @no_windows
 def test_cancel_via_SIGINT(
-    loglevel,
-    start_method,
-    spawn_backend,
+    reg_addr: tuple,
+    loglevel: str,
+    start_method: str,
 ):
-    """Ensure that a control-C (SIGINT) signal cancels both the parent and
+    '''
+    Ensure that a control-C (SIGINT) signal cancels both the parent and
     child processes in trionic fashion
-    """
-    pid = os.getpid()
+
+    '''
+    pid: int = os.getpid()

     async def main():
         with trio.fail_after(2):
-            async with tractor.open_nursery() as tn:
+            async with tractor.open_nursery(
+                registry_addrs=[reg_addr],
+            ) as tn:
                 await tn.start_actor('sucka')
-                if 'mp' in spawn_backend:
+                if 'mp' in start_method:
                     time.sleep(0.1)
                 os.kill(pid, signal.SIGINT)
                 await trio.sleep_forever()
@@ -507,23 +582,38 @@ def test_cancel_via_SIGINT(

 @no_windows
 def test_cancel_via_SIGINT_other_task(
-    loglevel,
-    start_method,
-    spawn_backend,
+    reg_addr: tuple,
+    loglevel: str,
+    start_method: str,
+    spawn_backend: str,
 ):
-    """Ensure that a control-C (SIGINT) signal cancels both the parent
-    and child processes in trionic fashion even a subprocess is started
-    from a seperate ``trio`` child task.
-    """
-    pid = os.getpid()
-    timeout: float = 2
-    if is_win():  # smh
+    '''
+    Ensure that a control-C (SIGINT) signal cancels both the parent
+    and child processes in trionic fashion even when a subprocess is
+    started from a separate ``trio`` child task.
+
+    '''
+    from .conftest import cpu_scaling_factor
+
+    pid: int = os.getpid()
+    timeout: float = (
+        4 if _non_linux
+        else 2
+    )
+    if _friggin_windows:  # smh
         timeout += 1

+    # add latency headroom for CPU freq scaling (auto-cpufreq et al.)
+    headroom: float = cpu_scaling_factor()
+    if headroom != 1.:
+        timeout *= headroom
+
     async def spawn_and_sleep_forever(
         task_status=trio.TASK_STATUS_IGNORED
     ):
-        async with tractor.open_nursery() as tn:
+        async with tractor.open_nursery(
+            registry_addrs=[reg_addr],
+        ) as tn:
             for i in range(3):
                 await tn.run_in_actor(
                     sleep_forever,
@@ -568,6 +658,14 @@ async def spawn_sub_with_sync_blocking_task():
     print('exiting first subactor layer..\n')


+# @pytest.mark.skipon_spawn_backend(
+#     'subint',
+#     reason=(
+#         'XXX SUBINT HANGING TEST XXX\n'
+#         'See outstanding issue(s)\n'
+#         # TODO, put issue link!
+#     )
+# )
 @pytest.mark.parametrize(
     'man_cancel_outer',
     [
@@ -644,7 +742,11 @@ def test_cancel_while_childs_child_in_sync_sleep(
     #
     # delay = 1 # no AssertionError in eg, TooSlowError raised.
     # delay = 2 # is AssertionError in eg AND no TooSlowError !?
-    delay = 4 # is AssertionError in eg AND no _cs cancellation.
+    # is AssertionError in eg AND no _cs cancellation.
+    delay = (
+        6 if _non_linux
+        else 4
+    )

     with trio.fail_after(delay) as _cs:
         # with trio.CancelScope() as cs:
@@ -678,7 +780,7 @@ def test_cancel_while_childs_child_in_sync_sleep(

 def test_fast_graceful_cancel_when_spawn_task_in_soft_proc_wait_for_daemon(
-    start_method,
+    start_method: str,
 ):
     '''
     This is a very subtle test which demonstrates how cancellation
@@ -696,7 +798,7 @@ def test_fast_graceful_cancel_when_spawn_task_in_soft_proc_wait_for_daemon(
     kbi_delay = 0.5
     timeout: float = 2.9

-    if is_win():  # smh
+    if _friggin_windows:  # smh
         timeout += 1

     async def main():
@@ -18,16 +18,15 @@ from tractor import RemoteActorError

 async def aio_streamer(
-    from_trio: asyncio.Queue,
-    to_trio: trio.abc.SendChannel,
+    chan: tractor.to_asyncio.LinkedTaskChannel,
 ) -> trio.abc.ReceiveChannel:

     # required first msg to sync caller
-    to_trio.send_nowait(None)
+    chan.started_nowait(None)

     from itertools import cycle
     for i in cycle(range(10)):
-        to_trio.send_nowait(i)
+        chan.send_nowait(i)
         await asyncio.sleep(0.01)

@@ -69,7 +68,7 @@ async def wrapper_mngr(
     else:
         async with tractor.to_asyncio.open_channel_from(
             aio_streamer,
-        ) as (first, from_aio):
+        ) as (from_aio, first):
             assert not first

             # cache it so next task uses broadcast receiver
@ -10,7 +10,19 @@ from tractor._testing import tractor_test
|
|||
MESSAGE = 'tractoring at full speed'
|
||||
|
||||
|
||||
def test_empty_mngrs_input_raises() -> None:
|
||||
def test_empty_mngrs_input_raises(
|
||||
tpt_proto: str,
|
||||
) -> None:
|
||||
# TODO, the `open_actor_cluster()` teardown hangs
|
||||
# intermittently on UDS when `gather_contexts(mngrs=())`
|
||||
# raises `ValueError` mid-setup; likely a race in the
|
||||
# actor-nursery cleanup vs UDS socket shutdown. Needs
|
||||
# a deeper look at `._clustering`/`._supervise` teardown
|
||||
# paths with the UDS transport.
|
||||
if tpt_proto == 'uds':
|
||||
pytest.skip(
|
||||
'actor-cluster teardown hangs intermittently on UDS'
|
||||
)
|
||||
|
||||
async def main():
|
||||
with trio.fail_after(3):
|
||||
|
|
@@ -56,25 +68,44 @@ async def worker(
     print(msg)
     assert msg == MESSAGE

-    # TODO: does this ever cause a hang
+    # ?TODO, does this ever cause a hang?
     # assert 0


+# ?TODO, but needs a fn-scoped tpt_proto fixture..
+# @pytest.mark.no_tpt('uds')
 @tractor_test
-async def test_streaming_to_actor_cluster() -> None:
+async def test_streaming_to_actor_cluster(
+    tpt_proto: str,
+    is_forking_spawner: bool,
+):
+    '''
+    Open an actor "cluster" using the (experimental) `._clustering`
+    API and conduct standard inter-task-ctx streaming.

-    async with (
-        open_actor_cluster(modules=[__name__]) as portals,
+    '''
+    if tpt_proto == 'uds':
+        pytest.skip(
+            f'Test currently fails with tpt-proto={tpt_proto!r}\n'
+        )

-        gather_contexts(
-            mngrs=[p.open_context(worker) for p in portals.values()],
-        ) as contexts,
+    delay: float = (
+        10 if is_forking_spawner
+        else 6
+    )
+    with trio.fail_after(delay):
+        async with (
+            open_actor_cluster(modules=[__name__]) as portals,

-        gather_contexts(
-            mngrs=[ctx[0].open_stream() for ctx in contexts],
-        ) as streams,
+            gather_contexts(
+                mngrs=[p.open_context(worker) for p in portals.values()],
+            ) as contexts,

-    ):
-        with trio.move_on_after(1):
-            for stream in itertools.cycle(streams):
-                await stream.send(MESSAGE)
+            gather_contexts(
+                mngrs=[ctx[0].open_stream() for ctx in contexts],
+            ) as streams,
+
+        ):
+            with trio.move_on_after(1):
+                for stream in itertools.cycle(streams):
+                    await stream.send(MESSAGE)
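The restructure above wraps the entire cluster setup/stream/teardown stack in one `trio.fail_after()` budget, sized up for forking spawners which pay extra per-subproc startup cost. A runnable toy sketch of the deadline-budgeting pattern (the sleep stands in for the real cluster work):

```python
import trio


async def run_with_deadline(
    is_forking_spawner: bool,
) -> None:
    # hard deadline around the whole async-with stack; raises
    # `trio.TooSlowError` if setup or teardown wedges.
    delay: float = (
        10 if is_forking_spawner
        else 6
    )
    with trio.fail_after(delay):
        await trio.sleep(0.1)  # stand-in for the cluster work


trio.run(run_with_deadline, False)
```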
@@ -9,6 +9,7 @@ from itertools import count
 import math
 import platform
 from pprint import pformat
+import sys
 from typing import (
     Callable,
 )
@@ -25,7 +26,7 @@ from tractor._exceptions import (
     StreamOverrun,
     ContextCancelled,
 )
-from tractor._state import current_ipc_ctx
+from tractor.runtime._state import current_ipc_ctx

 from tractor._testing import (
     tractor_test,
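For out-of-tree code that must straddle this module move (`tractor._state` -> `tractor.runtime._state`), a compatibility import is one option; a hedged sketch:

```python
try:
    # layout on this branch
    from tractor.runtime._state import current_ipc_ctx
except ImportError:
    # pre-`runtime`-subpkg layout on `main`
    from tractor._state import current_ipc_ctx
```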
@@ -114,10 +115,12 @@ async def not_started_but_stream_opened(
 )
 def test_started_misuse(
     target: Callable,
+    reg_addr: tuple,
     debug_mode: bool,
 ):
     async def main():
         async with tractor.open_nursery(
+            registry_addrs=[reg_addr],
             debug_mode=debug_mode,
         ) as an:
             portal = await an.start_actor(
@@ -183,15 +186,24 @@ def test_simple_context(
     error_parent,
     child_blocks_forever,
     pointlessly_open_stream,
+    reg_addr: tuple,
     debug_mode: bool,
+    is_forking_spawner: bool,
 ):

-    timeout = 1.5 if not platform.system() == 'Windows' else 4
+    timeout: float = 1.5
+    # windows and forking-spawner both have "slower but more
+    # deterministic" cancel teardown.
+    if platform.system() == 'Windows':
+        timeout = 4
+    elif is_forking_spawner:
+        timeout = 3

     async def main():

         with trio.fail_after(timeout):
             async with tractor.open_nursery(
+                registry_addrs=[reg_addr],
                 debug_mode=debug_mode,
             ) as an:
                 portal = await an.start_actor(
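The old one-liner timeout is expanded above into an explicit ladder. If the same windows/forking-spawner sizing keeps recurring across tests, it could live in a helper; a sketch (the helper is hypothetical, values taken from the hunk above):

```python
import platform


def cancel_teardown_timeout(
    is_forking_spawner: bool,
) -> float:
    # windows and forking spawners both need a longer, but still
    # deterministic, cancel-teardown window.
    if platform.system() == 'Windows':
        return 4
    if is_forking_spawner:
        return 3
    return 1.5
```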
@@ -277,6 +289,7 @@ def test_parent_cancels(
     cancel_method: str,
     chk_ctx_result_before_exit: bool,
     child_returns_early: bool,
+    reg_addr: tuple,
     debug_mode: bool,
 ):
     '''
@@ -354,6 +367,7 @@ def test_parent_cancels(
     async def main():

         async with tractor.open_nursery(
+            registry_addrs=[reg_addr],
             debug_mode=debug_mode,
         ) as an:
             portal = await an.start_actor(
@@ -930,6 +944,7 @@ async def keep_sending_from_child(
 )
 def test_one_end_stream_not_opened(
     overrun_by: tuple[str, int, Callable],
+    reg_addr: tuple,
     debug_mode: bool,
 ):
     '''
@@ -938,11 +953,17 @@ def test_one_end_stream_not_opened(

     '''
     overrunner, buf_size_increase, entrypoint = overrun_by
-    from tractor._runtime import Actor
+    from tractor.runtime._runtime import Actor
     buf_size = buf_size_increase + Actor.msg_buffer_size

+    timeout: float = (
+        1 if sys.platform == 'linux'
+        else 3
+    )
+
     async def main():
         async with tractor.open_nursery(
+            registry_addrs=[reg_addr],
             debug_mode=debug_mode,
         ) as an:
             portal = await an.start_actor(
@@ -950,7 +971,7 @@ def test_one_end_stream_not_opened(
                 enable_modules=[__name__],
             )

-            with trio.fail_after(1):
+            with trio.fail_after(timeout):
                 async with portal.open_context(
                     entrypoint,
                 ) as (ctx, sent):
@@ -1107,6 +1128,7 @@ def test_maybe_allow_overruns_stream(

     # conftest wide
     loglevel: str,
+    reg_addr: tuple,
     debug_mode: bool,
 ):
     '''
@@ -1127,6 +1149,7 @@ def test_maybe_allow_overruns_stream(
     '''
     async def main():
         async with tractor.open_nursery(
+            registry_addrs=[reg_addr],
             debug_mode=debug_mode,
         ) as an:
             portal = await an.start_actor(
@@ -1243,6 +1266,7 @@ def test_maybe_allow_overruns_stream(

 def test_ctx_with_self_actor(
     loglevel: str,
+    reg_addr: tuple,
     debug_mode: bool,
 ):
     '''
@@ -1257,6 +1281,7 @@ def test_ctx_with_self_actor(
     '''
     async def main():
         async with tractor.open_nursery(
+            registry_addrs=[reg_addr],
             debug_mode=debug_mode,
             enable_modules=[__name__],
         ) as an:
Some files were not shown because too many files have changed in this diff.