`.data.history`: run `.tsp.dedupe()` in backloader

In an effort to catch out-of-order and/or partially duplicated frame
segments, add some `.tsp` calls throughout the backloader tasks,
including a call to the new `.sort_diff()` to catch the out-of-order
history cases.
branch: distribute_dis
Tyler Goodlet 2023-12-12 19:57:46 -05:00
parent e8bf4c6e04
commit b95932ea09
1 changed file with 70 additions and 39 deletions
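The `tsp.dedupe()` and `tsp.sort_diff()` helpers named in the commit message live in the new `.tsp` module and are not shown in this diff. As a rough, hedged sketch of the idea only (the real implementation and signatures may differ; just the `(df, gaps, deduped, diff)` unpacking is taken from the call site below), a polars based version could look like:

```python
# Illustrative sketch, NOT the actual `piker.tsp` code. Assumes an OHLCV
# frame with an integer-epoch 'time' column as used elsewhere in this file.
import polars as pl


def dedupe(df: pl.DataFrame) -> tuple[
    pl.DataFrame,  # original frame
    pl.DataFrame,  # rows preceded by a larger-than-expected time gap
    pl.DataFrame,  # frame with duplicate-time rows dropped
    int,           # number of duplicate rows that were dropped
]:
    # rows whose step from the prior sample is "too big"; a 1s bar period
    # is hard-coded here purely for illustration.
    gaps: pl.DataFrame = df.filter(pl.col('time').diff() > 1)

    # drop rows which repeat an already-seen 'time' value.
    deduped: pl.DataFrame = df.unique(subset=['time'], maintain_order=True)
    diff: int = df.height - deduped.height
    return df, gaps, deduped, diff


def sort_diff(df: pl.DataFrame) -> pl.DataFrame:
    # report rows whose 'time' steps backwards, i.e. out-of-order history.
    return df.filter(pl.col('time').diff() < 0)
```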

@@ -1,18 +1,19 @@
 # piker: trading gear for hackers
 # Copyright (C) Tyler Goodlet (in stewardship for pikers)
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
+# This program is free software: you can redistribute it and/or
+# modify it under the terms of the GNU Affero General Public
+# License as published by the Free Software Foundation, either
+# version 3 of the License, or (at your option) any later version.
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-# You should have received a copy of the GNU Affero General Public License
-# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Affero General Public License for more details.
+# You should have received a copy of the GNU Affero General Public
+# License along with this program. If not, see
+# <https://www.gnu.org/licenses/>.
 '''
 Historical data business logic for load, backfill and tsdb storage.
@@ -39,6 +40,7 @@ from pendulum import (
     from_timestamp,
 )
 import numpy as np
+import polars as pl

 from ..accounting import (
     MktPair,
@@ -54,6 +56,7 @@ from ._source import def_iohlcv_fields
 from ._sampling import (
     open_sample_stream,
 )
+from . import tsp
 from ..brokers._util import (
     DataUnavailable,
 )
@@ -197,7 +200,7 @@ async def start_backfill(
     # do a decently sized backfill and load it into storage.
     periods = {
-        1: {'days': 6},
+        1: {'days': 2},
         60: {'years': 6},
     }
     period_duration: int = periods[timeframe]
@@ -246,13 +249,16 @@ async def start_backfill(
         # broker says there never was or is no more history to pull
         except DataUnavailable:
             log.warning(
-                f'NO-MORE-DATA: backend {mod.name} halted history!?'
+                f'NO-MORE-DATA: backend {mod.name} halted history:\n'
+                f'{timeframe}@{mkt.fqme}'
             )
             # ugh, what's a better way?
             # TODO: fwiw, we probably want a way to signal a throttle
             # condition (eg. with ib) so that we can halt the
             # request loop until the condition is resolved?
+            if timeframe > 1:
+                await tractor.pause()
             return

         # TODO: drop this? see todo above..
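The `await tractor.pause()` added above drops the task into `tractor`'s debugger REPL. A minimal standalone sketch of that same call, assuming a recent `tractor` release and that debug mode is enabled on the actor tree (which this diff does not show):

```python
import trio
import tractor


async def main():
    # debug_mode must be enabled somewhere up the actor tree for
    # `tractor.pause()` to acquire the debugger REPL.
    async with tractor.open_nursery(debug_mode=True):
        await tractor.pause()  # interactive REPL here; resume to continue


if __name__ == '__main__':
    trio.run(main)
```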
@@ -300,9 +306,11 @@ async def start_backfill(
                 array,
                 prepend_until_dt=backfill_until_dt,
             )
-            ln = len(to_push)
+            ln: int = len(to_push)
             if ln:
-                log.info(f'{ln} bars for {next_start_dt} -> {last_start_dt}')
+                log.info(
+                    f'{ln} bars for {next_start_dt} -> {last_start_dt}'
+                )

             else:
                 log.warning(
@@ -388,14 +396,29 @@ async def start_backfill(
                     without_src=True,
                 )
             else:
-                col_sym_key: str = mkt.get_fqme(delim_char='')
+                col_sym_key: str = mkt.get_fqme(
+                    delim_char='',
+                )

+                # TODO: implement parquet append!?
                 await storage.write_ohlcv(
                     col_sym_key,
                     shm.array,
                     timeframe,
                 )
+                df: pl.DataFrame = await storage.as_df(
+                    fqme=mkt.fqme,
+                    period=timeframe,
+                    load_from_offline=False,
+                )
+                (
+                    df,
+                    gaps,
+                    deduped,
+                    diff,
+                ) = tsp.dedupe(df)
+                if diff:
+                    tsp.sort_diff(df)

         else:
             # finally filled gap
             log.info(
@@ -634,12 +657,19 @@ async def tsdb_backfill(
     async with mod.open_history_client(
         mkt,
     ) as (get_hist, config):
-        log.info(f'{mod} history client returned backfill config: {config}')
+        log.info(
+            f'`{mod}` history client returned backfill config:\n'
+            f'{config}\n'
+        )

         # get latest query's worth of history all the way
         # back to what is recorded in the tsdb
         try:
-            array, mr_start_dt, mr_end_dt = await get_hist(
+            (
+                array,
+                mr_start_dt,
+                mr_end_dt,
+            ) = await get_hist(
                 timeframe,
                 end_dt=None,
             )
@@ -649,6 +679,7 @@ async def tsdb_backfill(
         # there's no backfilling possible.
         except DataUnavailable:
             task_status.started()
+            await tractor.pause()
             return

         # TODO: fill in non-zero epoch time values ALWAYS!
@@ -699,9 +730,8 @@ async def tsdb_backfill(
             )
         except TimeseriesNotFound:
             log.warning(
-                f'No timeseries yet for {fqme}'
+                f'No timeseries yet for {timeframe}@{fqme}'
             )
-
         else:
             (
                 tsdb_history,
@@ -784,25 +814,24 @@ async def tsdb_backfill(
                 f'timeframe of {timeframe} seconds..\n'
                 'So yuh.. dun do dat brudder.'
             )

         # if there is a gap to backfill from the first
         # history frame until the last datum loaded from the tsdb
         # continue that now in the background
         bf_done = await tn.start(
             partial(
                 start_backfill,
-                get_hist,
-                mod,
-                mkt,
-                shm,
-                timeframe,
+                get_hist=get_hist,
+                mod=mod,
+                mkt=mkt,
+                shm=shm,
+                timeframe=timeframe,
                 backfill_from_shm_index=backfill_gap_from_shm_index,
                 backfill_from_dt=mr_start_dt,
                 sampler_stream=sampler_stream,
                 backfill_until_dt=last_tsdb_dt,
                 storage=storage,
-                write_tsdb=True,
             )
         )
@@ -824,8 +853,11 @@ async def tsdb_backfill(
         finally:
             return

-        # IF we need to continue backloading incrementally from the
-        # tsdb client..
+        # XXX NOTE: this is legacy from when we were using
+        # marketstore and we needed to continue backloading
+        # incrementally from the tsdb client.. (bc it couldn't
+        # handle a single large query with gRPC for some
+        # reason.. classic goolag pos)
         tn.start_soon(
             back_load_from_tsdb,
@@ -994,19 +1026,18 @@ async def manage_history(
         log.info(f'Connected to sampler stream: {sample_stream}')

         for timeframe in [60, 1]:
-            await tn.start(
+            await tn.start(partial(
                 tsdb_backfill,
-                mod,
-                storemod,
-                tn,
+                mod=mod,
+                storemod=storemod,
+                tn=tn,
                 # bus,
-                client,
-                mkt,
-                tf2mem[timeframe],
-                timeframe,
-                sample_stream,
-            )
+                storage=client,
+                mkt=mkt,
+                shm=tf2mem[timeframe],
+                timeframe=timeframe,
+                sampler_stream=sample_stream,
+            ))

         # indicate to caller that feed can be delivered to
         # remote requesting client since we've loaded history
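The switch to `partial()` with keyword arguments in the last two hunks follows from how `trio`'s `Nursery.start()` works: it only forwards positional arguments and injects a `task_status` keyword, so pre-binding kwargs via `functools.partial` keeps the call sites explicit. A small self-contained sketch with plain `trio` and a hypothetical `backfill()` task standing in for the real ones:

```python
from functools import partial

import trio


async def backfill(
    *,
    fqme: str,  # hypothetical kwarg, standing in for the real backfill params
    task_status=trio.TASK_STATUS_IGNORED,
):
    # signal the caller that startup completed, then keep running.
    task_status.started()
    await trio.sleep(1)


async def main():
    async with trio.open_nursery() as tn:
        # `.start()` can't pass kwargs directly, hence the `partial()`.
        await tn.start(partial(backfill, fqme='btcusdt.binance'))


trio.run(main)
```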