Factor gap annotating into new `markup_gaps()`

Since we definitely want to mark up gaps that are both data errors and
just plain old venue closures (time gaps), generalize the `gaps:
pl.DataFrame` loop into a func and call it from the `ldshm` cmd for now.

Some other tweaks to `store ldshm`:
- add `np.median()` fallback when detecting the (ohlc) ts sample rate.
- use new `tsp.dedupe()` signature.
- differentiate between sample-period size gaps and venue closure gaps
  by calling `tsp.detect_time_gaps()` with diff size thresholds.
- add todo for backfilling null-segs when detected.
distribute_dis
Tyler Goodlet 2024-01-04 11:01:21 -05:00
parent 05f874001a
commit 6959429af8
1 changed files with 197 additions and 94 deletions

View File

@ -24,8 +24,13 @@ from __future__ import annotations
# AsyncExitStack, # AsyncExitStack,
# ) # )
from pathlib import Path from pathlib import Path
from math import copysign
import time import time
from types import ModuleType from types import ModuleType
from typing import (
Any,
TYPE_CHECKING,
)
import polars as pl import polars as pl
import numpy as np import numpy as np
@ -42,15 +47,17 @@ from piker.data import (
ShmArray, ShmArray,
) )
from piker import tsp from piker import tsp
from . import ( from piker.data._formatters import BGM
log, from . import log
)
from . import ( from . import (
__tsdbs__, __tsdbs__,
open_storage_client, open_storage_client,
StorageClient, StorageClient,
) )
if TYPE_CHECKING:
from piker.ui._remote_ctl import AnnotCtl
store = typer.Typer() store = typer.Typer()
@ -203,10 +210,12 @@ def anal(
deduped: pl.DataFrame # deduplicated dts deduped: pl.DataFrame # deduplicated dts
( (
df, df,
gaps,
deduped, deduped,
diff, diff,
) = tsp.dedupe(shm_df) ) = tsp.dedupe(
shm_df,
period=period,
)
write_edits: bool = True write_edits: bool = True
if ( if (
@ -217,7 +226,6 @@ def anal(
) )
): ):
await tractor.pause() await tractor.pause()
await client.write_ohlcv( await client.write_ohlcv(
fqme, fqme,
ohlcv=deduped, ohlcv=deduped,
@ -229,10 +237,117 @@ def anal(
# is there something more minimal but nearly as # is there something more minimal but nearly as
# functional as ipython? # functional as ipython?
await tractor.pause() await tractor.pause()
assert not null_segs
trio.run(main) trio.run(main)
async def markup_gaps(
    fqme: str,
    timeframe: float,
    actl: AnnotCtl,
    wdts: pl.DataFrame,
    gaps: pl.DataFrame,

) -> dict[int, dict]:
    '''
    Remote annotate time-gaps in a dt-fielded ts (normally OHLC)
    with rectangles.

    Parameters
    ----------
    fqme: fully-qualified market endpoint name for the target chart.
    timeframe: sample period (seconds) of the annotated series.
    actl: remote annotation-control proxy used to draw/redraw rects.
    wdts: full (dt-fielded) frame the gaps were detected within.
    gaps: frame of detected gap rows, one row per time-gap.

    Returns a map of annotation-id -> the kwargs used to create each
    rect, allowing callers to later reference/modify the annots.

    '''
    aids: dict[int, dict] = {}
    for i in range(gaps.height):
        row: pl.DataFrame = gaps[i]

        # the gap's RIGHT-most bar's OPEN value
        # at that time (sample) step.
        iend: int = row['index'][0]
        # dt: datetime = row['dt'][0]
        # dt_prev: datetime = row['dt_prev'][0]
        # dt_end_t: float = dt.timestamp()

        # TODO: can we eventually remove this
        # once we figure out why the epoch cols
        # don't match?
        # TODO: FIX HOW/WHY these aren't matching
        # and are instead off by 4hours (EST
        # vs. UTC?!?!)
        # end_t: float = row['time']
        # assert (
        #     dt.timestamp()
        #     ==
        #     end_t
        # )

        # the gap's LEFT-most bar's CLOSE value
        # at that time (sample) step.
        prev_r: pl.DataFrame = wdts.filter(
            pl.col('index') == iend - 1
        )
        istart: int = prev_r['index'][0]
        # dt_start_t: float = dt_prev.timestamp()
        # start_t: float = prev_r['time']
        # assert (
        #     dt_start_t
        #     ==
        #     start_t
        # )

        # TODO: implement px-col width measure
        # and ensure at least as many px-cols
        # shown per rect as configured by user.
        # gap_w: float = abs((iend - istart))
        # if gap_w < 6:
        #     margin: float = 6
        #     iend += margin
        #     istart -= margin

        # horizontal index-units padding added around the gap's edge
        # bars so the rect extends visibly past them.
        # NOTE(review): presumably `BGM` is a bar-graphics width
        # metric from `piker.data._formatters` — confirm.
        rect_gap: float = BGM*3/8

        opn: float = row['open'][0]
        ro: tuple[float, float] = (
            # dt_end_t,
            iend + rect_gap + 1,
            opn,
        )
        cls: float = prev_r['close'][0]
        lc: tuple[float, float] = (
            # dt_start_t,
            istart - rect_gap,  # + 1 ,
            cls,
        )

        # color by gap direction: prev-close below next-open
        # (diff < 0, an up-gap) maps to green, the inverse to red.
        diff: float = cls - opn
        sgn: int = int(copysign(1, diff))
        color: str = {
            -1: 'buy_green',
            1: 'sell_red',
        }[sgn]

        rect_kwargs: dict[str, Any] = dict(
            fqme=fqme,
            timeframe=timeframe,
            start_pos=lc,
            end_pos=ro,
            color=color,
        )
        aid: int = await actl.add_rect(**rect_kwargs)
        assert aid
        aids[aid] = rect_kwargs

    # tell chart to redraw all its
    # graphics view layers Bo
    await actl.redraw(
        fqme=fqme,
        timeframe=timeframe,
    )
    return aids
@store.command() @store.command()
def ldshm( def ldshm(
fqme: str, fqme: str,
@ -249,7 +364,6 @@ def ldshm(
async def main(): async def main():
from piker.ui._remote_ctl import ( from piker.ui._remote_ctl import (
open_annot_ctl, open_annot_ctl,
AnnotCtl,
) )
actl: AnnotCtl actl: AnnotCtl
mod: ModuleType mod: ModuleType
@ -274,111 +388,97 @@ def ldshm(
shm_df, shm_df,
) in tsp.iter_dfs_from_shms(fqme): ) in tsp.iter_dfs_from_shms(fqme):
# compute ohlc properties for naming
times: np.ndarray = shm.array['time'] times: np.ndarray = shm.array['time']
period_s: float = float(times[-1] - times[-2]) d1: float = float(times[-1] - times[-2])
if period_s < 1.: d2: float = float(times[-2] - times[-3])
med: float = np.median(np.diff(times))
if (
d1 < 1.
and d2 < 1.
and med < 1.
):
raise ValueError( raise ValueError(
f'Something is wrong with time period for {shm}:\n{times}' f'Something is wrong with time period for {shm}:\n{times}'
) )
period_s: float = float(max(d1, d2, med))
# over-write back to shm? # over-write back to shm?
df: pl.DataFrame # with dts wdts: pl.DataFrame # with dts
deduped: pl.DataFrame # deduplicated dts deduped: pl.DataFrame # deduplicated dts
( (
df, wdts,
gaps,
deduped, deduped,
diff, diff,
) = tsp.dedupe(shm_df) ) = tsp.dedupe(
shm_df,
period=period_s,
)
null_segs: tuple = tsp.get_null_segs( null_segs: tuple = tsp.get_null_segs(
frame=shm.array, frame=shm.array,
period=period_s, period=period_s,
) )
needs_correction: bool = ( # detect gaps from in expected (uniform OHLC) sample period
not gaps.is_empty() step_gaps: pl.DataFrame = tsp.detect_time_gaps(
or null_segs wdts,
expect_period=period_s,
) )
# TODO: maybe only optionally enter this depending
# on some CLI flags and/or gap detection?
if needs_correction:
for i in range(gaps.height):
row: pl.DataFrame = gaps[i]
# TODO: can we eventually remove this # TODO: by default we always want to mark these up
# once we figure out why the epoch cols # with rects showing up/down gaps Bo
# don't match? venue_gaps: pl.DataFrame = tsp.detect_time_gaps(
iend: int = row['index'][0] wdts,
# dt: datetime = row['dt'][0] expect_period=period_s,
# dt_prev: datetime = row['dt_prev'][0]
# the gap's right-most bar's OPEN value # TODO: actually pull the exact duration
# at that time (sample) step. # expected for each venue operational period?
# dt_end_t: float = dt.timestamp() gap_dt_unit='days',
gap_thresh=1,
)
# TODO: FIX HOW/WHY these aren't matching # TODO: call null-seg fixer somehow?
# and are instead off by 4hours (EST if null_segs:
# vs. UTC?!?!) await tractor.pause()
# end_t: float = row['time'] # async with (
# assert ( # trio.open_nursery() as tn,
# dt.timestamp() # mod.open_history_client(
# == # mkt,
# end_t # ) as (get_hist, config),
# ) # ):
# nulls_detected: trio.Event = await tn.start(partial(
# tsp.maybe_fill_null_segments,
# the gap's left-most bar's CLOSE value # shm=shm,
# at that time (sample) step. # timeframe=timeframe,
prev_r: pl.DataFrame = df.filter( # get_hist=get_hist,
pl.col('index') == iend - 1 # sampler_stream=sampler_stream,
# mkt=mkt,
# ))
# TODO: find the disjoint set of step gaps from
# venue (closure) set!
# -[ ] do a set diff by checking for the unique
# gap set only in the step_gaps?
if (
not venue_gaps.is_empty()
# and not step_gaps.is_empty()
):
do_markup_gaps: bool = True
if do_markup_gaps:
aids: dict = await markup_gaps(
fqme,
period_s,
actl,
wdts,
step_gaps,
) )
istart: int = prev_r['index'][0] assert aids
# dt_start_t: float = dt_prev.timestamp()
# start_t: float = prev_r['time'] # write repaired ts to parquet-file?
# assert ( if write_parquet:
# dt_start_t start: float = time.time()
# ==
# start_t
# )
# TODO: implement px-col width measure
# and ensure at least as many px-cols
# shown per rect as configured by user.
gap_w: float = abs((iend - istart))
if gap_w < 6:
margin: float = 6
iend += margin
istart -= margin
ro: tuple[float, float] = (
# dt_end_t,
iend,
row['open'][0],
)
lc: tuple[float, float] = (
# dt_start_t,
istart,
prev_r['close'][0],
)
# async with actl.open_rect(
# ) as aid:
aid: int = await actl.add_rect(
fqme=fqme,
timeframe=period_s,
start_pos=lc,
end_pos=ro,
)
assert aid
# write to parquet file?
if (
write_parquet
):
# write to fs
start = time.time()
path: Path = await client.write_ohlcv( path: Path = await client.write_ohlcv(
fqme, fqme,
ohlcv=deduped, ohlcv=deduped,
@ -390,7 +490,7 @@ def ldshm(
) )
# read back from fs # read back from fs
start = time.time() start: float = time.time()
read_df: pl.DataFrame = pl.read_parquet(path) read_df: pl.DataFrame = pl.read_parquet(path)
read_delay: float = round( read_delay: float = round(
time.time() - start, time.time() - start,
@ -412,6 +512,8 @@ def ldshm(
shm._array.setflags( shm._array.setflags(
write=int(1), write=int(1),
) )
# last chance manual overwrites in REPL
await tractor.pause()
shm.push( shm.push(
new, new,
prepend=True, prepend=True,
@ -419,8 +521,6 @@ def ldshm(
update_first=False, # don't update ._first update_first=False, # don't update ._first
) )
await tractor.pause()
assert diff
else: else:
# allow interaction even when no ts problems. # allow interaction even when no ts problems.
@ -428,8 +528,11 @@ def ldshm(
assert not diff assert not diff
if df is None: if shm_df is None:
log.error(f'No matching shm buffers for {fqme} ?') log.error(
f'No matching shm buffers for {fqme} ?'
)
trio.run(main) trio.run(main)