piker/piker/tsp/_anal.py

# piker: trading gear for hackers
# Copyright (C) 2018-present  Tyler Goodlet (in stewardship of pikers)

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

'''
Financial time series processing utilities usually
pertaining to OHLCV style sampled data.

Routines are generally implemented in either ``numpy`` or
``polars`` B)

'''
from __future__ import annotations
from functools import partial
from math import (
    ceil,
    floor,
)
import time
from typing import (
    Literal,
    # AsyncGenerator,
    Generator,
)

import numpy as np
import polars as pl
from pendulum import (
    DateTime,
    from_timestamp,
)

from ..toolz.profile import (
    Profiler,
    pg_profile_enabled,
    ms_slower_then,
)
from ..log import (
    get_logger,
    get_console_log,
)
# for "time series processing"
# dotted sub-system name; used for the logger created below and
# pre-bound into `get_console_log()`.
subsys: str = 'piker.tsp'

log = get_logger(subsys)
# rebind the imported `get_console_log()` with `name=subsys`
# pre-applied so users of this module's copy needn't pass it.
get_console_log = partial(
    get_console_log,
    name=subsys,
)

# NOTE: union type-defs to handle generic `numpy` and `polars` types
# side-by-side Bo
# |_ TODO: schema spec typing?
#   -[ ] nptyping!
#   -[ ] wtv we can with polars?
# `Frame`: either a `polars` data frame or a `numpy` array,
# `Seq`: either a `polars` series or a `numpy` array.
Frame = pl.DataFrame | np.ndarray
Seq = pl.Series | np.ndarray


def slice_from_time(
    arr: np.ndarray,
    start_t: float,
    stop_t: float,
    step: float,  # sampler period step-diff

) -> slice:
    '''
    Calculate array indices mapped from a time range and return them in
    a slice.

    Given an input array with an epoch `'time'` series entry, calculate
    the indices which span the time range and return in a slice. Presume
    each `'time'` step increment is uniform and when the time stamp
    series contains gaps (the uniform presumption is untrue) use
    ``np.searchsorted()`` binary search to look up the appropriate
    index.

    Parameters:

      - `arr`: array readable as `arr['time']`; its first and last
        time stamps are read unconditionally so it must be non-empty.
        NOTE(review): presumably sorted ascending by `'time'` since
        ``np.searchsorted()`` is applied to that field -> confirm
        with callers.
      - `start_t`: range start time, floored to an int before use.
      - `stop_t`: range stop time, ceiled to an int before use.
      - `step`: the expected time increment between adjacent rows;
        used as the divisor when mapping a time offset to an index.

    Returns a `slice` of plain `int` indices relative to `arr` (NOT
    any absolute buffer index), see the note before the `return`.

    '''
    profiler = Profiler(
        msg='slice_from_time()',
        disabled=not pg_profile_enabled(),
        ms_threshold=ms_slower_then,
    )

    times = arr['time']
    # outward-rounded integer bounds of the stamps held in `arr`
    t_first = floor(times[0])
    t_last = ceil(times[-1])

    # the greatest index we can return which slices to the
    # end of the input array.
    read_i_max = arr.shape[0]

    # compute (presumed) uniform-time-step index offsets
    # NOTE: the start index is padded by one step downward (`- 1`)
    # and (below) the stop index by one step upward (`+ 1`).
    i_start_t = floor(start_t)
    read_i_start = floor(((i_start_t - t_first) // step)) - 1

    i_stop_t = ceil(stop_t)

    # XXX: edge case -> always set stop index to last in array whenever
    # the input stop time is detected to be greater than (or equal to)
    # the equiv time stamp at that last entry.
    if i_stop_t >= t_last:
        read_i_stop = read_i_max
    else:
        read_i_stop = ceil((i_stop_t - t_first) // step) + 1

    # always clip outputs to array support
    # for read start:
    # - never allow a start < the 0 index
    # - never allow a start index > the last valid row index
    # for read stop:
    # - never allow an end index > the read array len
    read_i_start = min(
        max(0, read_i_start),
        read_i_max - 1,
    )
    read_i_stop = max(
        0,
        min(read_i_stop, read_i_max),
    )

    # check for larger-than-latest calculated index for given start
    # time, in which case we do a binary search for the correct index.
    # NOTE: this is usually the result of a time series with time gaps
    # where it is expected that each index step maps to a uniform step
    # in the time stamp series.
    t_iv_start = times[read_i_start]
    if (
        t_iv_start > i_start_t
    ):
        # do a binary search for the best index mapping to ``start_t``
        # given we measured an overshoot using the uniform-time-step
        # calculation from above.

        # TODO: once we start caching these per source-array,
        # we can just overwrite ``read_i_start`` directly.
        new_read_i_start = np.searchsorted(
            times,
            i_start_t,
            side='left',
        )

        # TODO: minimize binary search work as much as possible:
        # - cache these remap values which compensate for gaps in the
        #   uniform time step basis where we calc a later start
        #   index for the given input ``start_t``.
        # - can we shorten the input search sequence by heuristic?
        #   up_to_arith_start = index[:read_i_start]

        # only ever move the start index earlier (or keep it), never
        # later than the arithmetic estimate.
        if (
            new_read_i_start <= read_i_start
        ):
            # t_diff = t_iv_start - start_t
            # print(
            #     f"WE'RE CUTTING OUT TIME - STEP:{step}\n"
            #     f'start_t:{start_t} -> 0index start_t:{t_iv_start}\n'
            #     f'diff: {t_diff}\n'
            #     f'REMAPPED START i: {read_i_start} -> {new_read_i_start}\n'
            # )
            read_i_start = new_read_i_start

    # the stop index is exclusive so the last included row is at
    # `read_i_stop - 1`; if its stamp overshoots the requested stop
    # time then remap the stop index with a binary search as well.
    t_iv_stop = times[read_i_stop - 1]
    if (
        t_iv_stop > i_stop_t
    ):
        # t_diff = stop_t - t_iv_stop
        # print(
        #     f"WE'RE CUTTING OUT TIME - STEP:{step}\n"
        #     f'calced iv stop:{t_iv_stop} -> stop_t:{stop_t}\n'
        #     f'diff: {t_diff}\n'
        #     # f'SHOULD REMAP STOP: {read_i_start} -> {new_read_i_start}\n'
        # )
        # NOTE: the search runs only over the tail starting at
        # `read_i_start`, so the result is relative to that offset
        # and is re-based when assigned below.
        new_read_i_stop = np.searchsorted(
            times[read_i_start:],
            # times,
            i_stop_t,
            side='right',
        )

        # NOTE(review): this compares the start-relative search
        # result against the array-relative `read_i_stop` -> confirm
        # that is intended.
        if (
            new_read_i_stop <= read_i_stop
        ):
            read_i_stop = read_i_start + new_read_i_stop + 1

    # sanity checks for range size
    # samples = (i_stop_t - i_start_t) // step
    # index_diff = read_i_stop - read_i_start + 1
    # if index_diff > (samples + 3):
    #     breakpoint()

    # read-relative indexes: gives a slice where `shm.array[read_slc]`
    # will be the data spanning the input time range `start_t` ->
    # `stop_t`
    # (cast to builtin `int` since the searchsorted results are
    # `numpy` integer scalars)
    read_slc = slice(
        int(read_i_start),
        int(read_i_stop),
    )

    profiler(
        'slicing complete'
        # f'{start_t} -> {abs_slc.start} | {read_slc.start}\n'
        # f'{stop_t} -> {abs_slc.stop} | {read_slc.stop}\n'
    )

    # NOTE: if caller needs absolute buffer indices they can
    # slice the buffer abs index like so:
    # index = arr['index']
    # abs_indx = index[read_slc]
    # abs_slc = slice(
    #     int(abs_indx[0]),
    #     int(abs_indx[-1]),
    # )

    return read_slc


def get_null_segs(
    frame: Frame,
    period: float,  # sampling step in seconds
    imargin: int = 1,
    col: str = 'time',

) -> tuple[
    # Seq,  # TODO: can we make it an array-type instead?
    list[
        list[int, int],
    ],
    Seq,
    Frame
] | None:
    '''
    Detect if there are any zero(-epoch stamped) valued
    rows in for the provided `col: str` column; by default
    presume the 'time' field/column.

    Filter to all such zero (time) segments and return
    the corresponding frame zeroed segment's,

      - gap absolute (in buffer terms) indices-endpoints as
        `absi_zsegs`
      - abs indices of all rows with zeroed `col` values as `absi_zeros`
      - the corresponding frame's row-entries (view) which are
        zeroed for the `col` as `zero_t`

    Parameters:

      - `frame`: the input rows; must provide an `'index'` field
        holding absolute (buffer) indices as well as `col`.
      - `period`: sampling step, only reported in the warning log
        emitted when null segments are found.
      - `imargin`: currently unused.
      - `col`: name of the column scanned for zero values.

    Returns `None` when no row has a zero `col` value, otherwise
    the 3-tuple `(absi_zsegs, absi_zeros, zero_t)` described above
    where each `absi_zsegs` entry is a `[start, end]` pair of
    absolute indices.

    '''
    times: Seq = frame[col]
    zero_pred: Seq = (times == 0)

    # both the `numpy` and `polars` boolean sequences provide
    # `.any()` so no per-type dispatch is needed for this check.
    if not zero_pred.any():
        return None

    # TODO: use ndarray for this?!
    absi_zsegs: list[list[int, int]] = []

    if isinstance(frame, np.ndarray):
        # view of ONLY the zero segments as one continuous chunk
        zero_t: np.ndarray = frame[zero_pred]
        # abs indices of said zeroed rows
        absi_zeros = zero_t['index']
        # diff of abs index steps between each zeroed row
        absi_zdiff: np.ndarray = np.diff(absi_zeros)

        # scan for all frame-indices where the
        # zeroed-row-abs-index-step-diff is greater than the
        # expected increment of 1.
        # data  1st zero seg  data  zeros
        # ----  ------------  ----  -----  ------  ----
        # ||||..000000000000..||||..00000..||||||..0000
        # ----  ------------  ----  -----  ------  ----
        #       ^zero_t[0]                            ^zero_t[-1]
        #                           ^fi_zgaps[0]   ^fi_zgaps[1]
        #       ^absi_zsegs[0][0]   ^---^ => absi_zsegs[1]: tuple
        #  absi_zsegs[0][1]^
        #
        # NOTE: the first entry in `fi_zgaps` is where
        # the first (absolute) index step diff is > 1.
        # and it is a frame-relative index into `zero_t`.
        fi_zgaps = np.argwhere(
            absi_zdiff > 1
            # NOTE: +1 here is ensure we index to the "start" of each
            # segment (if we didn't the below loop needs to be
            # re-written to expect `fi_end_rows`!
        ) + 1
        # the rows from the contiguous zeroed segments which have
        # abs-index steps >1 compared to the previous zero row
        # (indicating an end of zeroed segment).
        fi_zseg_start_rows = zero_t[fi_zgaps]

    # TODO: equiv for pl.DataFrame case!
    # NOTE(review): this branch never assigns `fi_zseg_start_rows`
    # which the segment loop below reads, so a `polars` frame that
    # contains null rows can not complete this function yet.
    else:
        izeros: pl.Series = zero_pred.arg_true()
        zero_t: pl.DataFrame = frame[izeros]

        absi_zeros = zero_t['index']
        absi_zdiff: pl.Series = absi_zeros.diff()
        fi_zgaps = (absi_zdiff > 1).arg_true()

    # XXX: our goal (in this func) is to select out slice index
    # pairs (zseg0_start, zseg_end) in abs index units for each
    # null-segment portion detected throughout entire input frame.

    # number of null segments: one more than the number of >1
    # abs-index steps found between consecutive zeroed rows.
    num_gaps: int = fi_zgaps.size + 1

    # NOTE: since `absi_zdiff` will never have a row
    # corresponding to the first zero-segment's row, we add it
    # manually here; its end index is backfilled further below.
    absi_zsegs.append([
        max(
            absi_zeros[0] - 1,
            0,
        ),
        None,
    ])

    # XXX NOTE XXX: if >= 2 zeroed segments are found, there should
    # ALWAYS be at least one zero-segment-abs-index-step-diff row
    # in `fi_zgaps`, so loop through all such entries and open
    # a new segment at each. Then look back to the prior segment
    # and, when its end is still unset, backfill it from the zeroed
    # row just before this segment's start (relative index -1 in
    # `zero_t`).
    # TODO: can we do it with vec ops?
    for i, (
        fi,  # frame index of zero-seg start
        zseg_start_row,  # full row for ^
    ) in enumerate(zip(
        fi_zgaps,
        fi_zseg_start_rows,
    )):
        assert (zseg_start_row == zero_t[fi]).all()
        iabs: int = zseg_start_row['index'][0]
        absi_zsegs.append([
            iabs - 1,
            None,  # backfilled on next iter
        ])

        # final iter case, backfill FINAL end iabs!
        if (i + 1) == fi_zgaps.size:
            absi_zsegs[-1][1] = absi_zeros[-1] + 1

        # NOTE: `absi_zsegs[i]` is the segment PRIOR to the one
        # just appended (which sits at `i + 1`); only now can its
        # end be determined by looking back one zeroed row.
        last_end: int = absi_zsegs[i][1]
        if last_end is None:
            prev_zseg_row = zero_t[fi - 1]
            absi_post_zseg = prev_zseg_row['index'][0] + 1
            # XXX: MUST BACKFILL previous end iabs!
            absi_zsegs[i][1] = absi_post_zseg

    # single segment case: the loop above never ran so close the
    # only segment here.
    # NOTE: need the + 1 to guarantee we index "up to" the next
    # non-null row-datum, clipped to the frame's last abs index.
    if num_gaps == 1:
        absi_zsegs[-1][1] = min(
            absi_zeros[-1] + 1,
            frame['index'][-1],
        )

    # sanity check every segment: neither endpoint should itself be
    # a zeroed row (a start at abs index 0 is exempt).
    iabs_first: int = frame['index'][0]
    for start, end in absi_zsegs:

        ts_start: float = times[start - iabs_first]
        ts_end: float = times[end - iabs_first]
        if (
            (ts_start == 0 and not start == 0)
            or
            ts_end == 0
        ):
            import pdbp
            pdbp.set_trace()

        assert end
        assert start < end

    log.warning(
        f'Frame has {len(absi_zsegs)} NULL GAPS!?\n'
        f'period: {period}\n'
        f'total null samples: {len(zero_t)}\n'
    )

    return (
        absi_zsegs,  # [start, end] abs slice indices of seg
        absi_zeros,  # all abs indices within all null-segs
        zero_t,  # sliced-view of all null-segment rows-datums
    )


def iter_null_segs(
    timeframe: float,
    frame: Frame | None = None,
    null_segs: tuple | None = None,

) -> Generator[
    tuple[
        int, int,
        int, int,
        float, float,
        float, float,

        # Seq,  # TODO: can we make it an array-type instead?
        # list[
        #     list[int, int],
        # ],
        # Seq,
        # Frame
    ],
    None,
]:
    '''
    Iterate the null (zero time-stamped) segments of `frame`,
    yielding per segment the 8-tuple,

      (absi_start, absi_end,  # absolute indices
       fi_start, fi_end,  # frame-relative indices
       start_t, end_t,  # 'time' values at the endpoints
       start_dt, end_dt)  # same as datetimes

    Parameters:

      - `timeframe`: sampling period passed through to
        `get_null_segs()` as `period`.
      - `frame`: the rows to scan; despite the `None` default it is
        always indexed here so it must be provided.
      - `null_segs`: an already computed `get_null_segs()` result
        for `frame`; when `None` it is computed here.

    Nothing is yielded when there are no null segments. When
    a segment's start index is `None` all its start-side entries
    are yielded as `None`.

    '''
    # only (re)compute the segments when the caller didn't deliver
    # a result already.
    if null_segs is None:
        null_segs = get_null_segs(
            frame,
            period=timeframe,
        )

    if not null_segs:
        return

    absi_pairs_zsegs: list[list[float, float]]
    (
        absi_pairs_zsegs,
        _izeros,
        _zero_t,
    ) = null_segs

    # abs index of the first row; subtracting it maps any abs index
    # to a frame-relative one.
    absi_first: int = frame[0]['index']
    for (
        absi_start,
        absi_end,
    ) in absi_pairs_zsegs:

        fi_end: int = absi_end - absi_first
        end_row: Seq = frame[fi_end]
        end_t: float = end_row['time']
        end_dt: DateTime = from_timestamp(end_t)

        fi_start = None
        start_row = None
        start_t = None
        start_dt = None
        if absi_start is not None:
            fi_start: int = absi_start - absi_first
            start_row: Seq = frame[fi_start]
            start_t: float = start_row['time']
            start_dt: DateTime = from_timestamp(start_t)

            # a negative abs start index is never expected; drop
            # into the debugger if it ever shows up.
            if absi_start < 0:
                import pdbp
                pdbp.set_trace()

        yield (
            absi_start, absi_end,  # abs indices
            fi_start, fi_end,  # relative "frame" indices
            start_t, end_t,
            start_dt, end_dt,
        )


def with_dts(
    df: pl.DataFrame,
    time_col: str = 'time',

) -> pl.DataFrame:
    '''
    Insert datetime (casted) columns to a (presumably) OHLC sampled
    time series with an epoch-time column keyed by `time_col: str`.

    The returned frame has these columns added:

      - `f'{time_col}_prev'`: `time_col` shifted down by one row
      - `'s_diff'`: row-to-row difference of `time_col`
      - `'dt'`: `time_col` converted via `pl.from_epoch()`
      - `'dt_prev'`: the `_prev` column converted the same way
      - `'dt_diff'`: row-to-row difference of `'dt'`

    '''
    # NOTE: name the shifted column explicitly with `.alias()`
    # instead of the deprecated `Expr.suffix()`; the resulting
    # column name is the same.
    prev_col: str = f'{time_col}_prev'

    return df.with_columns([
        pl.col(time_col).shift(1).alias(prev_col),
        pl.col(time_col).diff().alias('s_diff'),
        pl.from_epoch(pl.col(time_col)).alias('dt'),

    # 2nd pass since these derive from columns added just above
    ]).with_columns([
        pl.from_epoch(
            column=pl.col(prev_col),
        ).alias('dt_prev'),
        pl.col('dt').diff().alias('dt_diff'),
    ])


# names of the time-unit accessors accepted as a `gap_dt_unit`,
# each is looked up by name on a `polars` expr's `.dt` namespace.
t_unit: Literal = Literal[
    'days',
    'hours',
    'minutes',
    'seconds',
    'milliseconds',
    'microseconds',
    'nanoseconds',
]


def detect_time_gaps(
    w_dts: pl.DataFrame,

    time_col: str = 'time',
    # epoch sampling step diff
    expect_period: float = 60,

    # NOTE: legacy stock mkts have venue operating hours
    # and thus gaps normally no more than 1-2 days at
    # a time.
    gap_thresh: float = 1.,

    # TODO: allow passing in a frame of operating hours?
    # -[ ] durations/ranges for faster legit gap checks?
    # XXX -> must be valid ``polars.Expr.dt.<name>``
    # like 'days' which a sane default for venue closures
    # though will detect weekend gaps which are normal :o
    gap_dt_unit: t_unit | None = None,

) -> pl.DataFrame:
    '''
    Select the rows of `w_dts` whose step from the prior sample is
    larger than expected.

    Rows are kept when the absolute `'s_diff'` value exceeds
    `expect_period`. When a `gap_dt_unit` is given the result is
    narrowed further to rows whose absolute `'dt_diff'`, measured
    with the `.dt` accessor of that name, exceeds `gap_thresh`.

    Useful for eg. legacy markets which have venue close gaps
    and/or actual missing data segments.

    '''
    # 1st pass: any sample-period (in seconds unit) step size
    # greater than expected.
    oversized_step: pl.Expr = pl.col('s_diff').abs() > expect_period
    step_gaps: pl.DataFrame = w_dts.filter(oversized_step)

    if gap_dt_unit is not None:
        # 2nd pass: an arbitrary dt-unit step size.
        # NOTE: passing a unit indicates that on this (sampling)
        # time scale we expect to only be filtering against larger
        # venue closures-scale time gaps.
        in_units = getattr(
            pl.col('dt_diff').dt,
            gap_dt_unit,
        )
        return step_gaps.filter(
            in_units().abs() > gap_thresh
        )

    return step_gaps


def detect_price_gaps(
    df: pl.DataFrame,
    gt_multiplier: float = 2.,
    price_fields: list[str] = ['high', 'low'],

) -> pl.DataFrame:
    '''
    Detect gaps in clearing price over an OHLC series.

    2 types of gaps generally exist; up gaps and down gaps:

    - UP gap: when any next sample's lo price is strictly greater
      than the current sample's hi price.

    - DOWN gap: when any next sample's hi price is strictly
      less than the current sample's lo price.

    NOTE: NOT yet implemented! The body is only a `...` placeholder
    so none of the parameters are read and `None` is returned
    instead of the annotated `pl.DataFrame`.

    '''
    # XXX: unfinished draft of the filter expression, kept for
    # reference only.
    # return df.filter(
    #     pl.col('high') - ) > expect_period,
    # ).select([
    #     pl.dt.datetime(pl.col(time_col).shift(1)).suffix('_previous'),
    #     pl.all(),
    # ]).select([
    #     pl.all(),
    #     (pl.col(time_col) - pl.col(f'{time_col}_previous')).alias('diff'),
    # ])
    ...

# TODO: probably just use the null_segs impl above?
def detect_vlm_gaps(
    df: pl.DataFrame,
    col: str = 'volume',

) -> pl.DataFrame:
    '''
    Filter `df` down to the rows whose `col` value equals zero;
    by default presume the 'volume' column.

    '''
    # NOTE: filter the frame we were actually passed.
    vnull: pl.DataFrame = df.filter(
        pl.col(col) == 0
    )
    return vnull


def dedupe(
    src_df: pl.DataFrame,

    time_gaps: pl.DataFrame | None = None,
    sort: bool = True,
    period: float = 60,

) -> tuple[
    pl.DataFrame,  # with dts
    pl.DataFrame,  # with deduplicated dts (aka gap/repeat removal)
    int,  # len diff between input and deduped
]:
    '''
    Extend `src_df` with datetime columns then drop every row
    which repeats an already seen 'time' value.

    Delivers a 3-tuple of,

      - the datetime-extended (but otherwise untouched) frame,
      - its de-duplicated (and when `sort` is set, 'time' sorted)
        counterpart,
      - how many rows the de-duplication removed.

    The `time_gaps` and `period` inputs are currently not read.

    '''
    wdts: pl.DataFrame = with_dts(src_df)

    # keep only the first row seen for each 'time' stamp while
    # preserving the original row order.
    uniq: pl.DataFrame = wdts.unique(
        # subset=['dt'],
        subset=['time'],
        maintain_order=True,
    )

    # TODO: detect out-of-order segments which were corrected!
    # -[ ] report in log msg
    # -[ ] possibly return segment sections which were moved?
    deduped: pl.DataFrame = uniq.sort(by='time') if sort else uniq

    return (
        wdts,
        deduped,
        wdts.height - deduped.height,
    )


def sort_diff(
    src_df: pl.DataFrame,
    col: str = 'time',

) -> tuple[
    pl.DataFrame,  # the input frame, as passed
    pl.DataFrame,  # sorted by `col`
    list[int],  # indices of segments that are out-of-order
]:
    '''
    Sort `src_df` by `col` and report where the sort changed the
    row-to-row step of that column.

    The step diffs of `col` are computed both before and after
    sorting; every row index at which the two differ is collected
    and, when there are any, their count is logged as a warning.

    Returns the input frame, its sorted copy and the list of
    differing row indices (empty when the sort changed no step).

    '''
    ser: pl.Series = src_df[col]
    sortd: pl.DataFrame = src_df.sort(by=col)
    diff: pl.Series = ser.diff()

    sortd_diff: pl.Series = sortd[col].diff()
    i_step_diff: pl.Series = (diff != sortd_diff).arg_true()
    frame_reorders: int = i_step_diff.len()
    if frame_reorders:
        log.warning(
            f'Resorted frame on col: {col}\n'
            f'{frame_reorders}'

        )
        # import pdbp; pdbp.set_trace()

    return (
        src_df,
        sortd,
        i_step_diff.to_list(),
    )

# NOTE: thanks to this SO answer for the below conversion routines
# to go from numpy struct-arrays to polars dataframes and back:
# https://stackoverflow.com/a/72054819
def np2pl(array: np.ndarray) -> pl.DataFrame:
    '''
    Build a `polars` frame from a `numpy` struct-array with one
    column per dtype field and log how long the conversion took.

    '''
    t0: float = time.time()

    # XXX: thanks to this SO answer for this conversion tip:
    # https://stackoverflow.com/a/72054819
    columns: dict[str, np.ndarray] = {}
    for name in array.dtype.fields:
        columns[name] = array[name]

    df = pl.DataFrame(columns)

    elapsed: float = time.time() - t0
    delay: float = round(elapsed, ndigits=6)
    log.info(
        f'numpy -> polars conversion took {delay} secs\n'
        f'polars df: {df}'
    )
    return df


def pl2np(
    df: pl.DataFrame,
    dtype: np.dtype,

) -> np.ndarray:
    '''
    Copy the columns of `df` into a new `numpy` struct-array of
    the given `dtype` with one element per frame row.

    NOTE: fields and columns are paired up by POSITION (not by
    name) and pairing stops at the shorter of the two.

    '''
    # allocate (uninitialized) output sized to the frame's row count
    out: np.ndarray = np.empty(
        df.height,
        dtype,
    )
    field_col_pairs = zip(
        dtype.fields,
        df.columns,
    )
    for field_name, col_name in field_col_pairs:
        series = df.get_column(col_name)
        out[field_name] = series.to_numpy()

    return out