Rejig scan loop for flaky TCP connects, better caching

`ib-gw` seems particularly fragile to connections from clients with the
same id (it can result in weird connect hangs and even crashes) and
`ib_insync` doesn't handle intermittent TCP disconnects all that
well.. (especially on dockerized IBC setups). This adds a bunch of
changes to our client caching and scan loop as well as proper
task-locking around cached proxies so that:

- `asyncio`-side clients aren't double-loaded/connected even when
  explicitly trying to reconnect repeatedly with a given client to work
  around the unreliability of the `asyncio.Transport` design in
  `ib_insync`.
- we can use `tractor.trionics.maybe_open_context()` to lock the `trio`
  side from loading more than one `Client` on the `asyncio` side and
  instead, on cache hits, only make a new `MethodProxy` around the
  reused `asyncio`-side client (since each `trio` task needs its own
  inter-task msg channel); see the sketch after this list.
- a `finally:` block teardown on all clients loaded in the scan loop
  avoids stale connections.
- the connect params are now exposed as named args to
  `load_aio_clients()` so they can be easily controlled from caller
  code; an illustrative call is sketched between the diff hunks below.
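
Here's a hedged sketch of that task-locking idea (not the backend code
itself; `open_shared_client` and the `'ib-client'` key are made-up
illustrations): the first `trio` task to enter actually runs the
factory, later tasks block on the internal lock and then receive the
cached value with `cache_hit=True`:

    from contextlib import asynccontextmanager as acm

    import tractor
    import trio


    @acm
    async def open_shared_client():
        # stand-in for the expensive `asyncio`-side `Client` load
        yield object()


    async def consumer(name: str):
        async with tractor.trionics.maybe_open_context(
            acm_func=open_shared_client,
            # all tasks passing the same key share one value
            key='ib-client',
        ) as (cache_hit, client):
            if cache_hit:
                # reuse: only wrap `client` in a new per-task proxy
                print(f'{name}: reusing {client}')
            else:
                print(f'{name}: allocated {client}')
            await trio.sleep(0.1)


    async def main():
        # assumes a `tractor` runtime is up since the cache is
        # actor-global
        async with tractor.open_root_actor():
            async with trio.open_nursery() as n:
                for i in range(3):
                    n.start_soon(consumer, f'task{i}')


    trio.run(main)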

Oh, and we properly hooked up the internal `ib_insync` logging to our
own internal schema - makes it a lot easier to debug wtf is going on XD
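
For reference, that logging hookup amounts to swapping the stdlib
loggers `ib_insync` hangs off its `IB` and `Client` instances for our
own; a minimal sketch of the same idea (the `piker.log` import path is
an assumption; the diff below does this inside `NonShittyIB.__init__`):

    import ib_insync as ibis

    from piker.log import get_logger  # assumed import path

    ib = ibis.IB()

    # route `ib_insync`'s internal chatter through our log schema
    ib._logger = get_logger('ib_insync.ib')
    ib.client._logger = get_logger('ib_insync.client')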
ib_dedicated_data_client
Tyler Goodlet 2022-05-21 10:59:34 -04:00
parent 26f47227d2
commit a5389beccd
1 changed file with 132 additions and 121 deletions

@@ -38,8 +38,6 @@ from typing (
 import asyncio
 from pprint import pformat
 import inspect
-import logging
-from random import randint
 import time
 from types import SimpleNamespace
@@ -164,13 +162,23 @@ class NonShittyIB(ibis.IB):
     - Don't use named tuples
     """
     def __init__(self):
+
+        # override `ib_insync` internal loggers so we can see wtf
+        # it's doing..
+        self._logger = get_logger(
+            'ib_insync.ib',
+        )
+
         self._createEvents()
+
         # XXX: just to override this wrapper
         self.wrapper = NonShittyWrapper(self)
         self.client = ib_Client(self.wrapper)
+        self.client._logger = get_logger(
+            'ib_insync.client',
+        )
+
         # self.errorEvent += self._onError
         self.client.apiEnd += self.disconnectedEvent
-        self._logger = logging.getLogger('ib_insync.ib')

 # map of symbols to contract ids
@@ -883,9 +891,6 @@ _try_ports = [
     _gw_port,
     _tws_port
 ]
-# TODO: remove the randint stuff and use proper error checking in client
-# factor below..
-_client_ids = itertools.count(randint(1, 100))
 _client_cache: dict[tuple[str, int], Client] = {}
 _scan_ignore: set[tuple[str, int]] = set()
@@ -911,8 +916,12 @@ async def load_aio_clients(
     host: str = '127.0.0.1',
     port: int = None,
-    client_id: Optional[int] = None,
+    client_id: int = 6116,
+
+    # the API TCP in `ib_insync` connection can be flaky af so instead
+    # retry a few times to get the client going..
+    connect_retries: int = 3,
+    connect_timeout: float = 0.5,

 ) -> dict[str, Client]:
     '''
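
As a quick usage sketch (a hypothetical caller, assuming the
`load_aio_clients()` async context manager above; the values are
illustrative), the exposed params let callers tune the retry behaviour
per connect attempt:

    # hypothetical caller on the `asyncio` side
    async def connect_gw():
        async with load_aio_clients(
            host='127.0.0.1',
            port=4002,            # ib-gw
            client_id=6116,
            connect_retries=5,    # more patience for flaky docker nets
            connect_timeout=0.5,  # fail fast instead of hanging
        ) as accts2clients:
            for acct, client in accts2clients.items():
                ...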
@@ -949,49 +958,34 @@ async def load_aio_clients(
         {
             'gw': 4002,
             'tws': 7497,
-            # 'order': ['gw', 'tws']
         }
     )
     order = ports.pop('order', None)
     if order:
         log.warning('`ports.order` section in `brokers.toml` is deprecated')

+    _err = None
     accounts_def = config.load_accounts(['ib'])
     try_ports = list(ports.values())
     ports = try_ports if port is None else [port]
-    # we_connected = []
-    connect_timeout = 2
     combos = list(itertools.product(hosts, ports))
+    accounts_found: dict[str, Client] = {}

-    # allocate new and/or reload disconnected but cached clients
-    # try:
-    # TODO: support multiple clients allowing for execution on
-    # multiple accounts (including a paper instance running on the
-    # same machine) and switching between accounts in the ems.
-    _err = None

     # (re)load any and all clients that can be found
     # from connection details in ``brokers.toml``.
     for host, port in combos:
         sockaddr = (host, port)
-        client = _client_cache.get(sockaddr)
-        accounts_found: dict[str, Client] = {}
         if (
-            client and client.ib.isConnected()
+            sockaddr in _client_cache
             or sockaddr in _scan_ignore
         ):
             continue

-        try:
-            ib = NonShittyIB()
-
-            # XXX: not sure if we ever really need to increment the
-            # client id if teardown is sucessful.
-            client_id = 6116
+        ib = NonShittyIB()

+        for i in range(connect_retries):
+            try:
                 await ib.connectAsync(
                     host,
                     port,
@@ -1002,6 +996,28 @@ async def load_aio_clients(
                     # careful.
                     timeout=connect_timeout,
                 )
+                break
+
+            except (
+                ConnectionRefusedError,
+
+                # TODO: if trying to scan for remote api clients
+                # pretty sure we need to catch this, though it
+                # definitely needs a shorter timeout since it hangs
+                # for like 5s..
+                asyncio.exceptions.TimeoutError,
+                OSError,
+            ) as ce:
+                _err = ce
+
+                if i > 8:
+                    # cache logic to avoid rescanning if we already have all
+                    # clients loaded.
+                    _scan_ignore.add(sockaddr)
+                    raise
+
+                log.warning(
+                    f'Failed to connect on {port} for {i} time, retrying...')

         # create and cache client
         client = Client(ib)
@@ -1039,43 +1055,14 @@ async def load_aio_clients(
         )

         # update all actor-global caches
-        log.info(f"Caching client for {(host, port)}")
-        _client_cache[(host, port)] = client
+        log.info(f"Caching client for {sockaddr}")
+        _client_cache[sockaddr] = client

-        # we_connected.append((host, port, client))
-
-        # TODO: don't do it this way, get a gud to_asyncio
-        # context / .start() system goin..
-        def pop_and_discon():
-            log.info(f'Disconnecting client {client}')
-            client.ib.disconnect()
-            _client_cache.pop((host, port), None)
-
-        # NOTE: the above callback **CAN'T FAIL** or shm won't get
-        # torn down correctly ...
-        tractor._actor._lifetime_stack.callback(pop_and_discon)
-
         # XXX: why aren't we just updating this directy above
         # instead of using the intermediary `accounts_found`?
         _accounts2clients.update(accounts_found)

-    except (
-        ConnectionRefusedError,
-
-        # TODO: if trying to scan for remote api clients
-        # pretty sure we need to catch this, though it
-        # definitely needs a shorter timeout since it hangs
-        # for like 5s..
-        asyncio.exceptions.TimeoutError,
-        OSError,
-    ) as ce:
-        _err = ce
-        log.warning(f'Failed to connect on {port}')
-
-        # cache logic to avoid rescanning if we already have all
-        # clients loaded.
-        _scan_ignore.add(sockaddr)
-
+    # if we have no clients after the scan loop then error out.
     if not _client_cache:
         raise ConnectionError(
             'No ib APIs could be found scanning @:\n'
@@ -1083,16 +1070,15 @@ async def load_aio_clients(
             'Check your `brokers.toml` and/or network'
         ) from _err

-    yield _accounts2clients
-
-    # TODO: this in a way that works xD
-    # finally:
-    #     pass
-    #     # async with trio.CancelScope(shield=True):
-    #     #     for host, port, client in we_connected:
-    #     #         client.ib.disconnect()
-    #     #         _client_cache.pop((host, port))
-    #     # raise
+    try:
+        yield _accounts2clients
+    finally:
+        # TODO: for re-scans we'll want to not teardown clients which
+        # are up and stable right?
+        for acct, client in _accounts2clients.items():
+            log.info(f'Disconnecting {acct}@{client}')
+            client.ib.disconnect()
+            _client_cache.pop((host, port))
@@ -1103,7 +1089,17 @@ async def load_clients_for_trio(
     '''
     Pure async mngr proxy to ``load_aio_clients()``.

+    This is a bootstrap entrypoint to call from
+    a ``tractor.to_asyncio.open_channel_from()``.
+
     '''
+    global _accounts2clients
+    if _accounts2clients:
+        to_trio.send_nowait(_accounts2clients)
+        await asyncio.sleep(float('inf'))
+
+    else:
         async with load_aio_clients() as accts2clients:
             to_trio.send_nowait(accts2clients)
@@ -1111,28 +1107,38 @@ async def load_clients_for_trio(
             await asyncio.sleep(float('inf'))


+_proxies: dict[str, MethodProxy] = {}
+
+
 @acm
 async def open_client_proxies() -> tuple[
     dict[str, MethodProxy],
     dict[str, Client],
 ]:
-    proxies: dict[str, MethodProxy] = {}

     async with (
-        tractor.to_asyncio.open_channel_from(
-            load_clients_for_trio,
-        ) as (clients, from_aio),
+        tractor.trionics.maybe_open_context(
+            # acm_func=open_client_proxies,
+            acm_func=tractor.to_asyncio.open_channel_from,
+            kwargs={'target': load_clients_for_trio},
+
+            # lock around current actor task access
+            # TODO: maybe this should be the default in tractor?
+            key=tractor.current_actor().uid,
+
+        ) as (cache_hit, (clients, from_aio)),

         AsyncExitStack() as stack
     ):
+        if cache_hit:
+            log.info(f'Re-using cached clients: {clients}')
+
         for acct_name, client in clients.items():
             proxy = await stack.enter_async_context(
                 open_client_proxy(client),
             )
-            proxies[acct_name] = proxy
+            _proxies[acct_name] = proxy

-        yield proxies, clients
+        yield _proxies, clients


 def get_preferred_data_client(
@@ -1511,10 +1517,15 @@ async def get_bars(
     for _ in range(10):
         try:
-            bars, bars_array = await proxy.bars(
+            out = await proxy.bars(
                 fqsn=fqsn,
                 end_dt=end_dt,
             )
+            if out:
+                bars, bars_array = out
+
+            else:
+                await tractor.breakpoint()

             if bars_array is None:
                 raise SymbolNotFound(fqsn)