Create `piker.tsp` "time series processing" subpkg

Move `.data.history` -> `.tsp.__init__.py` for now as main pkg-mod
and `.data.tsp` -> `.tsp._anal` (for analysis).

Obviously follow-up commits will change the surrounding codebase (imports)
to match..
branch: distribute_dis
Tyler Goodlet 2023-12-18 11:48:33 -05:00
parent d5d68f75ea
commit 4568c55f17
2 changed files with 125 additions and 55 deletions
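As context for the import remapping in the first file below, here is a minimal sketch of how a call-site's imports would change once these re-exports land; the caller code is hypothetical and simply mirrors the `iter_dfs_from_shms()` hunk further down, assuming `piker.tsp` re-exports `np2pl` as shown in the new `__all__` list:

    # before this commit: the ts-processing helpers lived under the data sub-package
    from piker.data import tsp
    df = tsp.np2pl(ohlcv)  # ohlcv: an (i)OHLCV numpy struct-array from shm

    # after: the same names are re-exported from the new `piker.tsp` sub-package
    from piker.tsp import np2pl
    df = np2pl(ohlcv)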

piker/data/history.py -> piker/tsp/__init__.py

@@ -32,6 +32,7 @@ from __future__ import annotations
 from datetime import datetime
 from functools import partial
 from pathlib import Path
+from pprint import pformat
 from types import ModuleType
 from typing import (
     Callable,
@@ -53,25 +54,64 @@ import polars as pl
 from ..accounting import (
     MktPair,
 )
-from ._util import (
+from ..data._util import (
     log,
 )
-from ._sharedmem import (
+from ..data._sharedmem import (
     maybe_open_shm_array,
     ShmArray,
 )
-from ._source import def_iohlcv_fields
-from ._sampling import (
+from ..data._source import def_iohlcv_fields
+from ..data._sampling import (
     open_sample_stream,
 )
-from .tsp import (
-    dedupe,
+from ._anal import (
     get_null_segs,
     iter_null_segs,
-    sort_diff,
     Frame,
-    # Seq,
+    Seq,
+
+    # codec-ish
+    np2pl,
+    pl2np,
+
+    # `numpy` only
+    slice_from_time,
+
+    # `polars` specific
+    dedupe,
+    with_dts,
+    detect_time_gaps,
+    sort_diff,
+
+    # TODO:
+    detect_price_gaps
 )

+__all__: list[str] = [
+    'dedupe',
+    'get_null_segs',
+    'iter_null_segs',
+    'sort_diff',
+    'slice_from_time',
+    'Frame',
+    'Seq',
+    'np2pl',
+    'pl2np',
+    'slice_from_time',
+    'with_dts',
+    'detect_time_gaps',
+    'sort_diff',
+
+    # TODO:
+    'detect_price_gaps'
+]
+
+# TODO: break up all this shite into submods!
 from ..brokers._util import (
     DataUnavailable,
 )
@@ -252,35 +292,65 @@ async def maybe_fill_null_segments(
         and
         len(null_segs[-1])
     ):
-        await tractor.pause()
-
-        array = shm.array
-        zeros = array[array['low'] == 0]
-
-        # always backfill gaps with the earliest (price) datum's
-        # value to avoid the y-ranger including zeros and completely
-        # stretching the y-axis..
-        if 0 < zeros.size:
-            zeros[[
-                'open',
-                'high',
-                'low',
-                'close',
-            ]] = shm._array[zeros['index'][0] - 1]['close']
+        (
+            iabs_slices,
+            iabs_zero_rows,
+            zero_t,
+        ) = null_segs
+        log.warning(
+            f'{len(iabs_slices)} NULL TIME SEGMENTS DETECTED!\n'
+            f'{pformat(iabs_slices)}'
+        )
+
+        # TODO: always backfill gaps with the earliest (price) datum's
+        # value to avoid the y-ranger including zeros and completely
+        # stretching the y-axis..
+        # array: np.ndarray = shm.array
+        # zeros = array[array['low'] == 0]
+        ohlc_fields: list[str] = [
+            'open',
+            'high',
+            'low',
+            'close',
+        ]
+        for istart, istop in iabs_slices:
+
+            # get view into buffer for null-segment
+            gap: np.ndarray = shm._array[istart:istop]
+
+            # copy the oldest OHLC samples forward
+            gap[ohlc_fields] = shm._array[istart]['close']
+
+            start_t: float = shm._array[istart]['time']
+            t_diff: float = (istop - istart)*timeframe
+            gap['time'] = np.arange(
+                start=start_t,
+                stop=start_t + t_diff,
+                step=timeframe,
+            )
+
+        await sampler_stream.send({
+            'broadcast_all': {
+                # XXX NOTE XXX: see the
+                # `.ui._display.increment_history_view()` if block
+                # that looks for this info to FORCE a hard viz
+                # redraw!
+                'backfilling': (mkt.fqme, timeframe),
+            },
+        })

         # TODO: interatively step through any remaining
         # time-gaps/null-segments and spawn piecewise backfiller
         # tasks in a nursery?
         # -[ ] not sure that's going to work so well on the ib
         # backend but worth a shot?
         # -[ ] mk new history connections to make it properly
         # parallel possible no matter the backend?
         # -[ ] fill algo: do queries in alternating "latest, then
         # earliest, then latest.. etc?"
-        # if (
-        #     next_end_dt not in frame[
-        # ):
-        #     pass
+        # await tractor.pause()


 async def start_backfill(
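The new fill loop above is easiest to see with plain `numpy` outside of piker's shm machinery. A standalone sketch with made-up data follows; the field layout and values are hypothetical, only the forward-fill and `np.arange()` time-ramp steps mirror the hunk:

    import numpy as np

    timeframe: float = 60.0  # seconds per bar

    ohlcv = np.zeros(
        6,
        dtype=[
            ('time', 'f8'),
            ('open', 'f8'), ('high', 'f8'), ('low', 'f8'), ('close', 'f8'),
        ],
    )
    ohlcv[0] = (0.0, 10.0, 11.0, 9.0, 10.5)
    ohlcv[1] = (60.0, 10.5, 12.0, 10.0, 11.0)
    # rows 2..4 are a zeroed "null segment"; row 5 is the next real bar
    ohlcv[5] = (300.0, 11.0, 11.5, 10.5, 11.2)

    istart, istop = 1, 5       # segment bounds: last good bar plus the gap
    gap = ohlcv[istart:istop]  # a view into the buffer, like shm._array[...]

    # forward-fill every OHLC field from the oldest sample's close
    for field in ('open', 'high', 'low', 'close'):
        gap[field] = ohlcv[istart]['close']

    # rebuild the time column as an evenly spaced ramp across the segment
    start_t = float(ohlcv[istart]['time'])
    gap['time'] = np.arange(
        start=start_t,
        stop=start_t + (istop - istart) * timeframe,
        step=timeframe,
    )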
@@ -1252,8 +1322,8 @@ def iter_dfs_from_shms(
         assert not opened
         ohlcv = shm.array

-        from ..data import tsp
-        df: pl.DataFrame = tsp.np2pl(ohlcv)
+        from ._anal import np2pl
+        df: pl.DataFrame = np2pl(ohlcv)
         yield (
             shmfile,

piker/data/tsp.py -> piker/tsp/_anal.py

@@ -319,9 +319,8 @@ def get_null_segs(
     if num_gaps < 1:
         if absi_zeros.size > 1:
             absi_zsegs = [[
-                # see `get_hist()` in backend, should ALWAYS be
-                # able to handle a `start_dt=None`!
-                # None,
+                # TODO: maybe mk these max()/min() limits func
+                # consts instead of called more then once?
                 max(
                     absi_zeros[0] - 1,
                     0,
@@ -359,7 +358,10 @@
         # corresponding to the first zero-segment's row, we add it
         # manually here.
         absi_zsegs.append([
-            absi_zeros[0] - 1,
+            max(
+                absi_zeros[0] - 1,
+                0,
+            ),
             None,
         ])
@@ -400,14 +402,18 @@
     else:
         if 0 < num_gaps < 2:
-            absi_zsegs[-1][1] = absi_zeros[-1] + 1
+            absi_zsegs[-1][1] = min(
+                absi_zeros[-1] + 1,
+                frame['index'][-1],
+            )

     iabs_first: int = frame['index'][0]
     for start, end in absi_zsegs:
         ts_start: float = times[start - iabs_first]
         ts_end: float = times[end - iabs_first]
         if (
-            ts_start == 0
+            (ts_start == 0 and not start == 0)
             or
             ts_end == 0
         ):
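The added `max()`/`min()` clamps above keep a null-segment's absolute-index bounds inside the frame, which matters when the very first or last rows are themselves null; relatedly, the `(ts_start == 0 and not start == 0)` tweak stops the zero-timestamp check from tripping on a leading null segment that starts at absolute index 0, whose time really is zero. A tiny worked example with made-up indices:

    # hypothetical: frame rows span absolute indices 0..42 and both the
    # first and last rows are null, i.e. absi_zeros[0] == 0, absi_zeros[-1] == 42
    max(0 - 1, 0)     # -> 0, instead of stepping back to index -1
    min(42 + 1, 42)   # -> 42, instead of indexing one past the last row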
@@ -451,11 +457,13 @@ def iter_null_segs(
     ],
     None,
 ]:
-    if null_segs is None:
-        null_segs: tuple = get_null_segs(
+    if not (
+        null_segs := get_null_segs(
             frame,
             period=timeframe,
         )
+    ):
+        return

     absi_pairs_zsegs: list[list[float, float]]
     izeros: Seq
@@ -502,6 +510,7 @@ def iter_null_segs(
         )


+# TODO: move to ._pl_anal
 def with_dts(
     df: pl.DataFrame,
     time_col: str = 'time',
@@ -525,19 +534,6 @@ def with_dts(
     # )


-def dedup_dt(
-    df: pl.DataFrame,
-) -> pl.DataFrame:
-    '''
-    Drop duplicate date-time rows (normally from an OHLC frame).
-
-    '''
-    return df.unique(
-        subset=['dt'],
-        maintain_order=True,
-    )
-
-
 t_unit: Literal = Literal[
     'days',
     'hours',
@@ -651,7 +647,11 @@ def dedupe(src_df: pl.DataFrame) -> tuple[
     )

     # remove duplicated datetime samples/sections
-    deduped: pl.DataFrame = dedup_dt(df)
+    deduped: pl.DataFrame = df.unique(
+        subset=['dt'],
+        maintain_order=True,
+    )
     deduped_gaps = detect_time_gaps(deduped)

     diff: int = (
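For reference, a toy demonstration of what the now-inlined `df.unique(...)` call does (made-up frame, not piker data; only the call itself mirrors the hunk):

    import polars as pl

    df = pl.DataFrame({
        'dt': ['2023-12-18 09:30', '2023-12-18 09:31', '2023-12-18 09:31'],
        'close': [10.5, 11.0, 11.0],
    })
    deduped: pl.DataFrame = df.unique(
        subset=['dt'],
        maintain_order=True,  # keep the frame's original row order
    )
    # -> 2 rows: the duplicated '09:31' sample collapses to one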