Rejig scan loop for flaky TCP connects, better caching

`ib-gw` seems particularly fragile to connections from clients with the same id (this can result in weird connect hangs and even crashes), and `ib_insync` doesn't handle intermittent TCP disconnects that well (especially on dockerized IBC setups). This adds a bunch of changes to our client caching and scan loop, as well as proper task-locking-to-cache-proxies, so that:

- `asyncio`-side clients aren't double-loaded/connected even when explicitly trying to reconnect repeatedly with a given client to work around the unreliability of the `asyncio.Transport` design in `ib_insync`.
- we can use `tractor.trionics.maybe_open_context()` to lock the `trio` side from loading more than one `Client` on the `asyncio` side and instead, on cache hits, only make a new `MethodProxy` around the reused `asyncio`-side client (since each `trio` task needs its own inter-task msg channel).
- a `finally:` block teardown on all clients loaded in the scan loop avoids stale connections.
- the connect params are now exposed as named args to `load_aio_clients()` so they can be easily controlled from caller code.

Oh, and we properly hooked up the internal `ib_insync` logging to our own internal schema - makes it a lot easier to debug wtf is going on XD
2022-05-21 10:59:34 -04:00 · 2022-05-21 10:59:34 -04:00 · a5389beccd
parent 26f47227d2
commit a5389beccd
1 changed files with 132 additions and 121 deletions
--- a/piker/brokers/ib.py
+++ b/piker/brokers/ib.py
@ -38,8 +38,6 @@ from typing import (
 import asyncio
 from pprint import pformat
 import inspect
-import logging
-from random import randint
 import time
 from types import SimpleNamespace

@ -164,13 +162,23 @@ class NonShittyIB(ibis.IB):
    - Don't use named tuples
    """
    def __init__(self):
+
+        # override `ib_insync` internal loggers so we can see wtf
+        # it's doing..
+        self._logger = get_logger(
+            'ib_insync.ib',
+        )
        self._createEvents()
+
        # XXX: just to override this wrapper
        self.wrapper = NonShittyWrapper(self)
        self.client = ib_Client(self.wrapper)
+        self.client._logger = get_logger(
+            'ib_insync.client',
+        )
+
        # self.errorEvent += self._onError
        self.client.apiEnd += self.disconnectedEvent
-        self._logger = logging.getLogger('ib_insync.ib')


 # map of symbols to contract ids
@ -883,9 +891,6 @@ _try_ports = [
    _gw_port,
    _tws_port
 ]
-# TODO: remove the randint stuff and use proper error checking in client
-# factor below..
-_client_ids = itertools.count(randint(1, 100))
 _client_cache: dict[tuple[str, int], Client] = {}
 _scan_ignore: set[tuple[str, int]] = set()

@ -911,8 +916,12 @@ async def load_aio_clients(

    host: str = '127.0.0.1',
    port: int = None,
+    client_id: int = 6116,

-    client_id: Optional[int] = None,
+    # the API TCP connection in `ib_insync` can be flaky af, so
+    # retry a few times to get the client going..
+    connect_retries: int = 3,
+    connect_timeout: float = 0.5,

 ) -> dict[str, Client]:
    '''
@ -949,49 +958,34 @@ async def load_aio_clients(
        {
            'gw': 4002,
            'tws': 7497,
-            # 'order': ['gw', 'tws']
        }
    )
    order = ports.pop('order', None)
    if order:
        log.warning('`ports.order` section in `brokers.toml` is deprecated')

+    _err = None
    accounts_def = config.load_accounts(['ib'])
    try_ports = list(ports.values())
    ports = try_ports if port is None else [port]
-    # we_connected = []
-    connect_timeout = 2
    combos = list(itertools.product(hosts, ports))
-
-    # allocate new and/or reload disconnected but cached clients
-    # try:
-    # TODO: support multiple clients allowing for execution on
-    # multiple accounts (including a paper instance running on the
-    # same machine) and switching between accounts in the ems.
-
-    _err = None
+    accounts_found: dict[str, Client] = {}

    # (re)load any and all clients that can be found
    # from connection details in ``brokers.toml``.
    for host, port in combos:

        sockaddr = (host, port)
-        client = _client_cache.get(sockaddr)
-        accounts_found: dict[str, Client] = {}
-
        if (
-            client and client.ib.isConnected()
+            sockaddr in _client_cache
            or sockaddr in _scan_ignore
        ):
            continue

-        try:
        ib = NonShittyIB()

-            # XXX: not sure if we ever really need to increment the
-            # client id if teardown is sucessful.
-            client_id = 6116
-
+        for i in range(connect_retries):
+            try:
                await ib.connectAsync(
                    host,
                    port,
@ -1002,6 +996,28 @@ async def load_aio_clients(
                    # careful.
                    timeout=connect_timeout,
                )
+                break
+
+            except (
+                ConnectionRefusedError,
+
+                # TODO: if trying to scan for remote api clients
+                # pretty sure we need to catch this, though it
+                # definitely needs a shorter timeout since it hangs
+                # for like 5s..
+                asyncio.exceptions.TimeoutError,
+                OSError,
+            ) as ce:
+                _err = ce
+
+                if i > 8:
+                    # cache logic to avoid rescanning if we already have all
+                    # clients loaded.
+                    _scan_ignore.add(sockaddr)
+                    raise
+
+                log.warning(
+                    f'Failed to connect on {port} for {i} time, retrying...')

        # create and cache client
        client = Client(ib)
@ -1039,43 +1055,14 @@ async def load_aio_clients(
        )

        # update all actor-global caches
-            log.info(f"Caching client for {(host, port)}")
-            _client_cache[(host, port)] = client
-
-            # we_connected.append((host, port, client))
-
-            # TODO: don't do it this way, get a gud to_asyncio
-            # context / .start() system goin..
-            def pop_and_discon():
-                log.info(f'Disconnecting client {client}')
-                client.ib.disconnect()
-                _client_cache.pop((host, port), None)
-
-            # NOTE: the above callback **CAN'T FAIL** or shm won't get
-            # torn down correctly ...
-            tractor._actor._lifetime_stack.callback(pop_and_discon)
+        log.info(f"Caching client for {sockaddr}")
+        _client_cache[sockaddr] = client

        # XXX: why aren't we just updating this directy above
        # instead of using the intermediary `accounts_found`?
        _accounts2clients.update(accounts_found)

-        except (
-            ConnectionRefusedError,
-
-            # TODO: if trying to scan for remote api clients
-            # pretty sure we need to catch this, though it
-            # definitely needs a shorter timeout since it hangs
-            # for like 5s..
-            asyncio.exceptions.TimeoutError,
-            OSError,
-        ) as ce:
-            _err = ce
-            log.warning(f'Failed to connect on {port}')
-
-            # cache logic to avoid rescanning if we already have all
-            # clients loaded.
-            _scan_ignore.add(sockaddr)
-
+    # if we have no clients after the scan loop then error out.
    if not _client_cache:
        raise ConnectionError(
            'No ib APIs could be found scanning @:\n'
@ -1083,16 +1070,15 @@ async def load_aio_clients(
            'Check your `brokers.toml` and/or network'
        ) from _err

+    try:
        yield _accounts2clients
-
-    # TODO: this in a way that works xD
-    # finally:
-    #     pass
-    #     # async with trio.CancelScope(shield=True):
-    #     for host, port, client in we_connected:
-    #         client.ib.disconnect()
-    #         _client_cache.pop((host, port))
-    #     raise
+    finally:
+        # TODO: for re-scans we'll want to not teardown clients which
+        # are up and stable right?
+        for acct, client in _accounts2clients.items():
+            log.info(f'Disconnecting {acct}@{client}')
+            client.ib.disconnect()
+            _client_cache.pop((host, port))


 async def load_clients_for_trio(
@ -1103,7 +1089,17 @@ async def load_clients_for_trio(
    '''
    Pure async mngr proxy to ``load_aio_clients()``.

+    This is a bootstrap entrypoint to call from
+    a ``tractor.to_asyncio.open_channel_from()``.
+
    '''
+    global _accounts2clients
+
+    if _accounts2clients:
+        to_trio.send_nowait(_accounts2clients)
+        await asyncio.sleep(float('inf'))
+
+    else:
        async with load_aio_clients() as accts2clients:
            to_trio.send_nowait(accts2clients)

@ -1111,28 +1107,38 @@ async def load_clients_for_trio(
            await asyncio.sleep(float('inf'))


+_proxies: dict[str, MethodProxy] = {}
+
+
@acm
 async def open_client_proxies() -> tuple[
    dict[str, MethodProxy],
    dict[str, Client],
 ]:
-
-    proxies: dict[str, MethodProxy] = {}
-
    async with (
-        tractor.to_asyncio.open_channel_from(
-            load_clients_for_trio,
-        ) as (clients, from_aio),
+        tractor.trionics.maybe_open_context(
+            # acm_func=open_client_proxies,
+            acm_func=tractor.to_asyncio.open_channel_from,
+            kwargs={'target': load_clients_for_trio},
+
+            # lock around current actor task access
+            # TODO: maybe this should be the default in tractor?
+            key=tractor.current_actor().uid,
+
+        ) as (cache_hit, (clients, from_aio)),

        AsyncExitStack() as stack
    ):
+        if cache_hit:
+            log.info(f'Re-using cached clients: {clients}')
+
        for acct_name, client in clients.items():
            proxy = await stack.enter_async_context(
                open_client_proxy(client),
            )
-            proxies[acct_name] = proxy
+            _proxies[acct_name] = proxy

-        yield proxies, clients
+        yield _proxies, clients


 def get_preferred_data_client(
@ -1511,10 +1517,15 @@ async def get_bars(

    for _ in range(10):
        try:
-            bars, bars_array = await proxy.bars(
+            out = await proxy.bars(
                fqsn=fqsn,
                end_dt=end_dt,
            )
+            if out:
+                bars, bars_array = out
+
+            else:
+                await tractor.breakpoint()

            if bars_array is None:
                raise SymbolNotFound(fqsn)