Add timeout + shielding to `NoBsWs` reconnect logic

Add timeout param to `.reset()` and `.send_msg()` to prevent
indefinite blocking on reconnect attempts. Shield reconnect
sleeps from cancellation to ensure we avoid any "finally footgun" type
scenarios where `trio.Cancelled` masks an underlying exc per,
- https://github.com/goodboy/tractor/pull/387
- https://github.com/goodboy/tractor/pull/391

Deats,
- add `timeout` param to `.reset()`, return `bool` for success
- add `timeout=3` default to `.send_msg()` for reconnect wait
- shield `.reset()` call in `.send_msg()` error handler
- log warning when reconnect timeout exceeded
- shield throttled sleeps in `_reconnect_forever()` error paths

(this commit msg was generated in some part by [`claude-code`][claude-code-gh])
[claude-code-gh]: https://github.com/anthropics/claude-code
tpt_closed_and_finally_footguns
Gud Boi 2026-02-04 20:09:35 -05:00
parent ac6ab3791e
commit ef3309adf9
1 changed files with 24 additions and 5 deletions

View File

@ -105,7 +105,10 @@ class NoBsWs:
def connected(self) -> bool: def connected(self) -> bool:
return self._connected.is_set() return self._connected.is_set()
async def reset(self) -> None: async def reset(
self,
timeout: float,
) -> bool:
''' '''
Reset the underlying ws connection by cancelling Reset the underlying ws connection by cancelling
the bg relay task and waiting for it to signal the bg relay task and waiting for it to signal
@ -114,18 +117,31 @@ class NoBsWs:
''' '''
self._connected = trio.Event() self._connected = trio.Event()
self._cs.cancel() self._cs.cancel()
with trio.move_on_after(timeout) as cs:
await self._connected.wait() await self._connected.wait()
return True
assert cs.cancelled_caught
return False
async def send_msg( async def send_msg(
self, self,
data: Any, data: Any,
timeout: float = 3,
) -> None: ) -> None:
while True: while True:
try: try:
msg: Any = self._dumps(data) msg: Any = self._dumps(data)
return await self._ws.send_message(msg) return await self._ws.send_message(msg)
except self.recon_errors: except self.recon_errors:
await self.reset() with trio.CancelScope(shield=True):
reconnected: bool = await self.reset(
timeout=timeout,
)
if not reconnected:
log.warning(
'Failed to reconnect after {timeout!r}s ??'
)
async def recv_msg(self) -> Any: async def recv_msg(self) -> Any:
msg: Any = await self._rx.receive() msg: Any = await self._rx.receive()
@ -191,7 +207,9 @@ async def _reconnect_forever(
f'{src_mod}\n' f'{src_mod}\n'
f'{url} connection bail with:' f'{url} connection bail with:'
) )
with trio.CancelScope(shield=True):
await trio.sleep(0.5) await trio.sleep(0.5)
rent_cs.cancel() rent_cs.cancel()
# go back to reonnect loop in parent task # go back to reonnect loop in parent task
@ -291,6 +309,7 @@ async def _reconnect_forever(
log.exception( log.exception(
'Reconnect-attempt failed ??\n' 'Reconnect-attempt failed ??\n'
) )
with trio.CancelScope(shield=True):
await trio.sleep(0.2) # throttle await trio.sleep(0.2) # throttle
raise berr raise berr