Rejig scan loop for flaky TCP connects, better caching

`ib-gw` seems particularly fragile to connections from clients with the
same id (which can result in weird connect hangs and even crashes) and
`ib_insync` doesn't handle intermittent TCP disconnects that well
(especially on dockerized IBC setups). This adds a bunch of changes to
our client caching and scan loop as well as proper
task-locking-to-cache-proxies so that,

- `asyncio`-side clients aren't double-loaded/connected even when
  explicitly trying to reconnect repeatedly with a given client to work
  around the unreliability of the `asyncio.Transport` design in
  `ib_insync`.
- we can use `tractor.trionics.maybe_open_context()` to lock the `trio`
  side from loading more than one `Client` on the `asyncio` side and
  instead, on cache hits, only make a new `MethodProxy` around the
  reused `asyncio`-side client (since each `trio` task needs its own
  inter-task msg channel); see the sketch after this list.
- a `finally:` block now tears down all clients loaded in the scan loop
  so we don't leave stale connections around.
- the connect params are now exposed as named args to
  `load_aio_clients()` so they can be easily controlled from caller
  code.
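
(A rough sketch of that task-locking, condensed from the
`open_client_proxies()` changes further down in this diff; the
`load_clients_for_trio()` target, `open_client_proxy()` acm and `log`
are this module's helpers, so treat it as illustrative rather than the
exact committed code:)

    from contextlib import AsyncExitStack, asynccontextmanager as acm
    import tractor

    @acm
    async def open_client_proxies():
        proxies = {}
        async with (
            # per-actor lock: only the first `trio` task actually loads
            # the `asyncio`-side clients, later tasks get a cache hit
            # and re-use them.
            tractor.trionics.maybe_open_context(
                acm_func=tractor.to_asyncio.open_channel_from,
                kwargs={'target': load_clients_for_trio},
                key=tractor.current_actor().uid,
            ) as (cache_hit, (clients, from_aio)),
            AsyncExitStack() as stack,
        ):
            if cache_hit:
                log.info(f'Re-using cached clients: {clients}')

            # every task still gets its own `MethodProxy` per client
            # since each `trio` task needs a dedicated inter-task msg
            # channel.
            for acct_name, client in clients.items():
                proxies[acct_name] = await stack.enter_async_context(
                    open_client_proxy(client),
                )

            yield proxies, clients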

Oh, and we properly hooked up the internal `ib_insync` logging to our
own internal schema, which makes it a lot easier to debug wtf is going
on XD
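
Roughly, that amounts to swapping `ib_insync`'s stdlib loggers for our
own inside the `NonShittyIB` subclass (sketch only, mirroring the hunk
below and assuming this module's `get_logger()` helper and
`NonShittyWrapper`):

    import ib_insync as ibis
    from ib_insync.client import Client as ib_Client

    class NonShittyIB(ibis.IB):
        def __init__(self):
            # route `ib_insync`'s internal chatter through our own
            # logging schema instead of the stdlib default.
            self._logger = get_logger('ib_insync.ib')
            self._createEvents()
            self.wrapper = NonShittyWrapper(self)
            self.client = ib_Client(self.wrapper)
            self.client._logger = get_logger('ib_insync.client')
            self.client.apiEnd += self.disconnectedEvent
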
ib_dedicated_data_client
Tyler Goodlet 2022-05-21 10:59:34 -04:00
parent 26f47227d2
commit a5389beccd
1 changed file with 132 additions and 121 deletions

@@ -38,8 +38,6 @@ from typing import (
import asyncio
from pprint import pformat
import inspect
import logging
from random import randint
import time
from types import SimpleNamespace
@@ -164,13 +162,23 @@ class NonShittyIB(ibis.IB):
- Don't use named tuples
"""
def __init__(self):
# override `ib_insync` internal loggers so we can see wtf
# it's doing..
self._logger = get_logger(
'ib_insync.ib',
)
self._createEvents()
# XXX: just to override this wrapper
self.wrapper = NonShittyWrapper(self)
self.client = ib_Client(self.wrapper)
self.client._logger = get_logger(
'ib_insync.client',
)
# self.errorEvent += self._onError
self.client.apiEnd += self.disconnectedEvent
self._logger = logging.getLogger('ib_insync.ib')
# map of symbols to contract ids
@@ -883,9 +891,6 @@ _try_ports = [
_gw_port,
_tws_port
]
# TODO: remove the randint stuff and use proper error checking in client
# factor below..
_client_ids = itertools.count(randint(1, 100))
_client_cache: dict[tuple[str, int], Client] = {}
_scan_ignore: set[tuple[str, int]] = set()
@@ -911,8 +916,12 @@ async def load_aio_clients(
host: str = '127.0.0.1',
port: int = None,
client_id: int = 6116,
client_id: Optional[int] = None,
# the TCP API connection in `ib_insync` can be flaky af so instead
# retry a few times to get the client going..
connect_retries: int = 3,
connect_timeout: float = 0.5,
) -> dict[str, Client]:
'''
@@ -949,49 +958,34 @@
{
'gw': 4002,
'tws': 7497,
# 'order': ['gw', 'tws']
}
)
order = ports.pop('order', None)
if order:
log.warning('`ports.order` section in `brokers.toml` is deprecated')
_err = None
accounts_def = config.load_accounts(['ib'])
try_ports = list(ports.values())
ports = try_ports if port is None else [port]
# we_connected = []
connect_timeout = 2
combos = list(itertools.product(hosts, ports))
# allocate new and/or reload disconnected but cached clients
# try:
# TODO: support multiple clients allowing for execution on
# multiple accounts (including a paper instance running on the
# same machine) and switching between accounts in the ems.
_err = None
accounts_found: dict[str, Client] = {}
# (re)load any and all clients that can be found
# from connection details in ``brokers.toml``.
for host, port in combos:
sockaddr = (host, port)
client = _client_cache.get(sockaddr)
accounts_found: dict[str, Client] = {}
if (
client and client.ib.isConnected()
sockaddr in _client_cache
or sockaddr in _scan_ignore
):
continue
try:
ib = NonShittyIB()
# XXX: not sure if we ever really need to increment the
# client id if teardown is successful.
client_id = 6116
for i in range(connect_retries):
try:
await ib.connectAsync(
host,
port,
@@ -1002,6 +996,28 @@
# careful.
timeout=connect_timeout,
)
break
except (
ConnectionRefusedError,
# TODO: if trying to scan for remote api clients
# pretty sure we need to catch this, though it
# definitely needs a shorter timeout since it hangs
# for like 5s..
asyncio.exceptions.TimeoutError,
OSError,
) as ce:
_err = ce
if i > 8:
# cache logic to avoid rescanning if we already have all
# clients loaded.
_scan_ignore.add(sockaddr)
raise
log.warning(
f'Failed to connect on {port} for {i} time(s), retrying...')
# create and cache client
client = Client(ib)
@@ -1039,43 +1055,14 @@
)
# update all actor-global caches
log.info(f"Caching client for {(host, port)}")
_client_cache[(host, port)] = client
# we_connected.append((host, port, client))
# TODO: don't do it this way, get a gud to_asyncio
# context / .start() system goin..
def pop_and_discon():
log.info(f'Disconnecting client {client}')
client.ib.disconnect()
_client_cache.pop((host, port), None)
# NOTE: the above callback **CAN'T FAIL** or shm won't get
# torn down correctly ...
tractor._actor._lifetime_stack.callback(pop_and_discon)
log.info(f"Caching client for {sockaddr}")
_client_cache[sockaddr] = client
# XXX: why aren't we just updating this directly above
# instead of using the intermediary `accounts_found`?
_accounts2clients.update(accounts_found)
except (
ConnectionRefusedError,
# TODO: if trying to scan for remote api clients
# pretty sure we need to catch this, though it
# definitely needs a shorter timeout since it hangs
# for like 5s..
asyncio.exceptions.TimeoutError,
OSError,
) as ce:
_err = ce
log.warning(f'Failed to connect on {port}')
# cache logic to avoid rescanning if we already have all
# clients loaded.
_scan_ignore.add(sockaddr)
# if we have no clients after the scan loop then error out.
if not _client_cache:
raise ConnectionError(
'No ib APIs could be found scanning @:\n'
@@ -1083,16 +1070,15 @@
'Check your `brokers.toml` and/or network'
) from _err
try:
yield _accounts2clients
# TODO: this in a way that works xD
# finally:
# pass
# # async with trio.CancelScope(shield=True):
# for host, port, client in we_connected:
# client.ib.disconnect()
# _client_cache.pop((host, port))
# raise
finally:
# TODO: for re-scans we'll want to not tear down clients which
# are up and stable, right?
for acct, client in _accounts2clients.items():
log.info(f'Disconnecting {acct}@{client}')
client.ib.disconnect()
_client_cache.pop((host, port))
async def load_clients_for_trio(
@@ -1103,7 +1089,17 @@ async def load_clients_for_trio(
'''
Pure async mngr proxy to ``load_aio_clients()``.
This is a bootstrap entrypoint to call from
a ``tractor.to_asyncio.open_channel_from()``.
'''
global _accounts2clients
if _accounts2clients:
to_trio.send_nowait(_accounts2clients)
await asyncio.sleep(float('inf'))
else:
async with load_aio_clients() as accts2clients:
to_trio.send_nowait(accts2clients)
@@ -1111,28 +1107,38 @@ async def load_clients_for_trio(
await asyncio.sleep(float('inf'))
_proxies: dict[str, MethodProxy] = {}
@acm
async def open_client_proxies() -> tuple[
dict[str, MethodProxy],
dict[str, Client],
]:
proxies: dict[str, MethodProxy] = {}
async with (
tractor.to_asyncio.open_channel_from(
load_clients_for_trio,
) as (clients, from_aio),
tractor.trionics.maybe_open_context(
# acm_func=open_client_proxies,
acm_func=tractor.to_asyncio.open_channel_from,
kwargs={'target': load_clients_for_trio},
# lock around current actor task access
# TODO: maybe this should be the default in tractor?
key=tractor.current_actor().uid,
) as (cache_hit, (clients, from_aio)),
AsyncExitStack() as stack
):
if cache_hit:
log.info(f'Re-using cached clients: {clients}')
for acct_name, client in clients.items():
proxy = await stack.enter_async_context(
open_client_proxy(client),
)
proxies[acct_name] = proxy
_proxies[acct_name] = proxy
yield proxies, clients
yield _proxies, clients
def get_preferred_data_client(
@@ -1511,10 +1517,15 @@ async def get_bars(
for _ in range(10):
try:
bars, bars_array = await proxy.bars(
out = await proxy.bars(
fqsn=fqsn,
end_dt=end_dt,
)
if out:
bars, bars_array = out
else:
await tractor.breakpoint()
if bars_array is None:
raise SymbolNotFound(fqsn)