Lul, actually detect gaps for 1s OHLC

Turns out we were always filtering to time gaps longer then a day smh.. Instead tweak `detect_time_gaps()` to only return venue-gaps when a `gap_dt_unit: str` is passed and pass `'days'` (like it was by default before) from `dedupe()` though we should really pass in an actual venue gap duration in the future.
2023-12-27 16:55:00 -05:00 · 2023-12-27 16:55:00 -05:00 · 0d18cb65c3
parent ad565936ec
commit 0d18cb65c3
2 changed files with 55 additions and 43 deletions
--- a/piker/tsp/init.py
+++ b/piker/tsp/init.py
@ -440,8 +440,11 @@ async def start_backfill(
            # broker says there never was or is no more history to pull
            except DataUnavailable:
                log.warning(
-                    f'NO-MORE-DATA: backend {mod.name} halted history:\n'
-                    f'{timeframe}@{mkt.fqme}'
+                    f'NO-MORE-DATA in range?\n'
+                    f'`{mod.name}` halted history:\n'
+                    f'tf@fqme: {timeframe}@{mkt.fqme}\n'
+                    'bf_until <- last_start_dt:\n'
+                    f'{backfill_until_dt} <- {last_start_dt}\n'
                )

                # ugh, what's a better way?
--- a/piker/tsp/_anal.py
+++ b/piker/tsp/_anal.py
@ -510,10 +510,10 @@ def iter_null_segs(
        )


-# TODO: move to ._pl_anal
 def with_dts(
    df: pl.DataFrame,
    time_col: str = 'time',
+
 ) -> pl.DataFrame:
    '''
    Insert datetime (casted) columns to a (presumably) OHLC sampled
@ -529,9 +529,7 @@ def with_dts(
            column=pl.col(f'{time_col}_prev'),
        ).alias('dt_prev'),
        pl.col('dt').diff().alias('dt_diff'),
-    ]) #.with_columns(
-        # pl.col('dt').diff().dt.days().alias('days_dt_diff'),
-    # )
+    ])


 t_unit: Literal = Literal[
@ -546,25 +544,23 @@ t_unit: Literal = Literal[


 def detect_time_gaps(
-    df: pl.DataFrame,
+    w_dts: pl.DataFrame,

    time_col: str = 'time',
    # epoch sampling step diff
    expect_period: float = 60,

-    # datetime diff unit and gap value
-    # crypto mkts
-    # gap_dt_unit: t_unit = 'minutes',
-    # gap_thresh: int = 1,
-
    # NOTE: legacy stock mkts have venue operating hours
    # and thus gaps normally no more then 1-2 days at
    # a time.
+    gap_thresh: float = 1.,
+
+    # TODO: allow passing in a frame of operating hours?
+    # -[ ] durations/ranges for faster legit gap checks?
    # XXX -> must be valid ``polars.Expr.dt.<name>``
-    # TODO: allow passing in a frame of operating hours
-    # durations/ranges for faster legit gap checks.
-    gap_dt_unit: t_unit = 'days',
-    gap_thresh: int = 1,
+    # like 'days' which a sane default for venue closures
+    # though will detect weekend gaps which are normal :o
+    gap_dt_unit: t_unit | None = None,

 ) -> pl.DataFrame:
    '''
@ -574,20 +570,25 @@ def detect_time_gaps(
    actual missing data segments.

    '''
-    return (
-        with_dts(df)
-        # First by a seconds unit step size
-        .filter(
+    # first select by any sample-period (in seconds unit) step size
+    # greater then expected.
+    step_gaps: pl.DataFrame = w_dts.filter(
        pl.col('s_diff').abs() > expect_period
    )
-        .filter(
+
+    if gap_dt_unit is None:
+        return step_gaps
+
+    # NOTE: this flag is to indicate that on this (sampling) time
+    # scale we expect to only be filtering against larger venue
+    # closures-scale time gaps.
+    return step_gaps.filter(
        # Second by an arbitrary dt-unit step size
        getattr(
            pl.col('dt_diff').dt,
            gap_dt_unit,
        )().abs() > gap_thresh
    )
-    )


 def detect_price_gaps(
@ -624,6 +625,8 @@ def dedupe(
    src_df: pl.DataFrame,
    sort: bool = True,

+    period: float = 60,
+
 ) -> tuple[
    pl.DataFrame,  # with dts
    pl.DataFrame,  # gaps
@ -637,33 +640,39 @@ def dedupe(
    dt-deduplicated frame.

    '''
-    df: pl.DataFrame = with_dts(src_df)
-
-    # TODO: enable passing existing `with_dts` df for speedup?
-    gaps: pl.DataFrame = detect_time_gaps(df)
+    wdts: pl.DataFrame = with_dts(src_df)
+    src_gaps: pl.DataFrame = detect_time_gaps(
+        wdts,
+        expect_period=period,
+        gap_dt_unit=None if period < 60 else 'days',
+    )

    # if no gaps detected just return carbon copies
    # and no len diff.
-    if gaps.is_empty():
+    if src_gaps.is_empty():
        return (
-            df,
-            gaps,
-            df,
+            wdts,
+            src_gaps,
+            wdts,
            0,
        )

    # remove duplicated datetime samples/sections
-    deduped: pl.DataFrame = df.unique(
+    deduped: pl.DataFrame = wdts.unique(
        subset=['dt'],
        maintain_order=True,
    )
    if sort:
        deduped = deduped.sort(by='time')

-    deduped_gaps: pl.DataFrame = detect_time_gaps(deduped)
+    deduped_gaps: pl.DataFrame = detect_time_gaps(
+        deduped,
+        expect_period=period,
+        gap_dt_unit=None if period < 60 else 'days',
+    )

    diff: int = (
-        df.height
+        wdts.height
        -
        deduped.height
    )
@ -673,8 +682,8 @@ def dedupe(
        f'deduped Gaps found:\n{deduped_gaps}'
    )
    return (
-        df,
-        gaps,
+        wdts,
+        deduped_gaps,
        deduped,
        diff,
    )
@ -708,7 +717,7 @@ def sort_diff(
 # to go from numpy struct-arrays to polars dataframes and back:
 # https://stackoverflow.com/a/72054819
 def np2pl(array: np.ndarray) -> pl.DataFrame:
-    start = time.time()
+    start: float = time.time()

    # XXX: thanks to this SO answer for this conversion tip:
    # https://stackoverflow.com/a/72054819