Rework backfiller and null-segment task conc

For each timeframe open a sub-nursery to do the backfilling + tsdb load
+ null-segment scanning in an effort to both speed up load time (though
we need to reverse the current order to really make it faster rn since
moving to the much faster parquet file backend) and do concurrent
time-gap/null-segment checking of tsdb history while mrf (most recent
frame) history is backfilling.

The details are more or less just `trio` related task-func composition
tricks and a reordering of said funcs for optimal startup latency.
Also commented the `back_load_from_tsdb()` task for now since it's
unused.
distribute_dis
Tyler Goodlet 2023-12-15 13:11:00 -05:00
parent a4084d6a0b
commit 97e2403fb1
1 changed files with 310 additions and 197 deletions

View File

@ -36,6 +36,7 @@ import trio
from trio_typing import TaskStatus from trio_typing import TaskStatus
import tractor import tractor
from pendulum import ( from pendulum import (
DateTime,
Duration, Duration,
from_timestamp, from_timestamp,
) )
@ -172,16 +173,27 @@ async def maybe_fill_null_segments(
sampler_stream: tractor.MsgStream, sampler_stream: tractor.MsgStream,
mkt: MktPair, mkt: MktPair,
task_status: TaskStatus[trio.Event] = trio.TASK_STATUS_IGNORED,
) -> list[Frame]: ) -> list[Frame]:
null_segs_detected = trio.Event()
task_status.started(null_segs_detected)
frame: Frame = shm.array frame: Frame = shm.array
async for (
null_segs: tuple | None = get_null_segs(
frame,
period=timeframe,
)
for (
absi_start, absi_end, absi_start, absi_end,
fi_start, fi_end, fi_start, fi_end,
start_t, end_t, start_t, end_t,
start_dt, end_dt, start_dt, end_dt,
) in iter_null_segs( ) in iter_null_segs(
frame, null_segs=null_segs,
frame=frame,
timeframe=timeframe, timeframe=timeframe,
): ):
@ -191,6 +203,7 @@ async def maybe_fill_null_segments(
start_dt start_dt
and end_dt < start_dt and end_dt < start_dt
): ):
await tractor.pause()
break break
( (
@ -233,14 +246,7 @@ async def maybe_fill_null_segments(
}, },
}) })
await tractor.pause() null_segs_detected.set()
# TODO: interatively step through any remaining time gaps?
# if (
# next_end_dt not in frame[
# ):
# pass
# RECHECK for more null-gaps # RECHECK for more null-gaps
frame: Frame = shm.array frame: Frame = shm.array
null_segs: tuple | None = get_null_segs( null_segs: tuple | None = get_null_segs(
@ -254,8 +260,23 @@ async def maybe_fill_null_segments(
): ):
await tractor.pause() await tractor.pause()
# TODO: interatively step through any remaining
# time-gaps/null-segments and spawn piecewise backfiller
# tasks in a nursery?
# -[ ] not sure that's going to work so well on the ib
# backend but worth a shot?
# -[ ] mk new history connections to make it properly
# parallel possible no matter the backend?
# -[ ] fill algo: do queries in alternating "latest, then
# earliest, then latest.. etc?"
# if (
# next_end_dt not in frame[
# ):
# pass
async def start_backfill( async def start_backfill(
tn: trio.Nursery,
get_hist, get_hist,
mod: ModuleType, mod: ModuleType,
mkt: MktPair, mkt: MktPair,
@ -510,23 +531,6 @@ async def start_backfill(
f'Finished filling gap to tsdb start @ {backfill_until_dt}!' f'Finished filling gap to tsdb start @ {backfill_until_dt}!'
) )
# NOTE: conduct tsdb timestamp gap detection and backfill any
# seemingly missing (null-time) segments..
# TODO: ideally these can never exist!
# -[ ] somehow it seems sometimes we're writing zero-ed
# segments to tsdbs during teardown?
# -[ ] can we ensure that the backcfiller tasks do this
# work PREVENTAVELY instead?
# -[ ] fill in non-zero epoch time values ALWAYS!
await maybe_fill_null_segments(
shm=shm,
timeframe=timeframe,
get_hist=get_hist,
sampler_stream=sampler_stream,
mkt=mkt,
)
# XXX: extremely important, there can be no checkpoints # XXX: extremely important, there can be no checkpoints
# in the block above to avoid entering new ``frames`` # in the block above to avoid entering new ``frames``
# values while we're pipelining the current ones to # values while we're pipelining the current ones to
@ -537,6 +541,12 @@ async def start_backfill(
bf_done.set() bf_done.set()
# NOTE: originally this was used to cope with a tsdb (marketstore)
# which could not delivery very large frames of history over gRPC
# (thanks goolag) due to corruption issues. NOW, using apache
# parquet (by default in the local filesys) we don't have this
# requirement since the files can be loaded very quickly in
# entirety to memory via
async def back_load_from_tsdb( async def back_load_from_tsdb(
storemod: ModuleType, storemod: ModuleType,
storage: StorageClient, storage: StorageClient,
@ -674,37 +684,18 @@ async def back_load_from_tsdb(
# await sampler_stream.send('broadcast_all') # await sampler_stream.send('broadcast_all')
async def tsdb_backfill( async def push_latest_frame(
mod: ModuleType, dt_eps: list[DateTime, DateTime],
storemod: ModuleType,
tn: trio.Nursery,
storage: StorageClient,
mkt: MktPair,
shm: ShmArray, shm: ShmArray,
timeframe: float,
sampler_stream: tractor.MsgStream,
task_status: TaskStatus[
tuple[ShmArray, ShmArray]
] = trio.TASK_STATUS_IGNORED,
) -> None:
get_hist: Callable[ get_hist: Callable[
[int, datetime, datetime], [int, datetime, datetime],
tuple[np.ndarray, str] tuple[np.ndarray, str]
] ],
config: dict[str, int] timeframe: float,
async with mod.open_history_client( config: dict,
mkt,
) as (get_hist, config):
log.info(
f'`{mod}` history client returned backfill config:\n'
f'{config}\n'
)
task_status: TaskStatus[None] = trio.TASK_STATUS_IGNORED,
):
# get latest query's worth of history all the way # get latest query's worth of history all the way
# back to what is recorded in the tsdb # back to what is recorded in the tsdb
try: try:
@ -716,6 +707,11 @@ async def tsdb_backfill(
timeframe, timeframe,
end_dt=None, end_dt=None,
) )
# so caller can access these ep values
dt_eps.extend([
mr_start_dt,
mr_end_dt,
])
# XXX: timeframe not supported for backend (since # XXX: timeframe not supported for backend (since
# above exception type), terminate immediately since # above exception type), terminate immediately since
@ -737,34 +733,154 @@ async def tsdb_backfill(
array, array,
prepend=True, # append on first frame prepend=True, # append on first frame
) )
backfill_gap_from_shm_index: int = shm._first.value + 1
# tell parent task to continue
task_status.started()
async def load_tsdb_hist(
storage: StorageClient,
mkt: MktPair,
timeframe: float,
) -> tuple[
np.ndarray,
DateTime,
DateTime,
] | None:
# loads a (large) frame of data from the tsdb depending # loads a (large) frame of data from the tsdb depending
# on the db's query size limit; our "nativedb" (using # on the db's query size limit; our "nativedb" (using
# parquet) generally can load the entire history into mem # parquet) generally can load the entire history into mem
# but if not then below the remaining history can be lazy # but if not then below the remaining history can be lazy
# loaded? # loaded?
fqme: str = mkt.fqme fqme: str = mkt.fqme
last_tsdb_dt: datetime | None = None tsdb_entry: tuple[
np.ndarray,
DateTime,
DateTime,
]
try: try:
tsdb_entry: tuple | None = await storage.load( tsdb_entry: tuple | None = await storage.load(
fqme, fqme,
timeframe=timeframe, timeframe=timeframe,
) )
return tsdb_entry
except TimeseriesNotFound: except TimeseriesNotFound:
log.warning( log.warning(
f'No timeseries yet for {timeframe}@{fqme}' f'No timeseries yet for {timeframe}@{fqme}'
) )
else: return None
async def tsdb_backfill(
mod: ModuleType,
storemod: ModuleType,
storage: StorageClient,
mkt: MktPair,
shm: ShmArray,
timeframe: float,
sampler_stream: tractor.MsgStream,
task_status: TaskStatus[
tuple[ShmArray, ShmArray]
] = trio.TASK_STATUS_IGNORED,
) -> None:
if timeframe not in (1, 60):
raise ValueError(
'`piker` only needs to support 1m and 1s sampling '
'but ur api is trying to deliver a longer '
f'timeframe of {timeframe} seconds..\n'
'So yuh.. dun do dat brudder.'
)
get_hist: Callable[
[int, datetime, datetime],
tuple[np.ndarray, str]
]
config: dict[str, int]
async with (
mod.open_history_client(
mkt,
) as (get_hist, config),
# NOTE: this sub-nursery splits to tasks for the given
# sampling rate to concurrently load offline tsdb
# timeseries as well as new data from the venue backend!
):
log.info(
f'`{mod}` history client returned backfill config:\n'
f'{config}\n'
)
dt_eps: list[DateTime, DateTime] = []
async with trio.open_nursery() as tn:
tn.start_soon(
push_latest_frame,
dt_eps,
shm,
get_hist,
timeframe,
config,
)
# tell parent task to continue
# TODO: really we'd want this the other way with the
# tsdb load happening asap and the since the latest
# frame query will normally be the main source of
# latency?
task_status.started()
tsdb_entry: tuple = await load_tsdb_hist(
storage,
mkt,
timeframe,
)
# NOTE: iabs to start backfilling from, reverse chronological,
# ONLY AFTER the first history frame has been pushed to
# mem!
backfill_gap_from_shm_index: int = shm._first.value + 1
(
mr_start_dt,
mr_end_dt,
) = dt_eps
async with trio.open_nursery() as tn:
# Prepend any tsdb history to the shm buffer which should
# now be full of the most recent history pulled from the
# backend's last frame.
if tsdb_entry:
( (
tsdb_history, tsdb_history,
first_tsdb_dt, first_tsdb_dt,
last_tsdb_dt, last_tsdb_dt,
) = tsdb_entry ) = tsdb_entry
# if there is a gap to backfill from the first
# history frame until the last datum loaded from the tsdb
# continue that now in the background
bf_done = await tn.start(
partial(
start_backfill,
tn=tn,
get_hist=get_hist,
mod=mod,
mkt=mkt,
shm=shm,
timeframe=timeframe,
backfill_from_shm_index=backfill_gap_from_shm_index,
backfill_from_dt=mr_start_dt,
sampler_stream=sampler_stream,
backfill_until_dt=last_tsdb_dt,
storage=storage,
write_tsdb=True,
)
)
# calc the index from which the tsdb data should be # calc the index from which the tsdb data should be
# prepended, presuming there is a gap between the # prepended, presuming there is a gap between the
# latest frame (loaded/read above) and the latest # latest frame (loaded/read above) and the latest
@ -830,36 +946,32 @@ async def tsdb_backfill(
log.info(f'Loaded {to_push.shape} datums from storage') log.info(f'Loaded {to_push.shape} datums from storage')
# TODO: maybe start history anal and load missing "history # NOTE: ASYNC-conduct tsdb timestamp gap detection and backfill any
# gaps" via backend.. # seemingly missing (null-time) segments..
# TODO: ideally these can never exist!
# -[ ] somehow it seems sometimes we're writing zero-ed
# segments to tsdbs during teardown?
# -[ ] can we ensure that the backcfiller tasks do this
# work PREVENTAVELY instead?
# -[ ] fill in non-zero epoch time values ALWAYS!
# await maybe_fill_null_segments(
nulls_detected: trio.Event = await tn.start(partial(
maybe_fill_null_segments,
if timeframe not in (1, 60):
raise ValueError(
'`piker` only needs to support 1m and 1s sampling '
'but ur api is trying to deliver a longer '
f'timeframe of {timeframe} seconds..\n'
'So yuh.. dun do dat brudder.'
)
# if there is a gap to backfill from the first
# history frame until the last datum loaded from the tsdb
# continue that now in the background
bf_done = await tn.start(
partial(
start_backfill,
get_hist=get_hist,
mod=mod,
mkt=mkt,
shm=shm, shm=shm,
timeframe=timeframe, timeframe=timeframe,
backfill_from_shm_index=backfill_gap_from_shm_index, get_hist=get_hist,
backfill_from_dt=mr_start_dt,
sampler_stream=sampler_stream, sampler_stream=sampler_stream,
backfill_until_dt=last_tsdb_dt, mkt=mkt,
storage=storage, ))
write_tsdb=True,
)
) # TODO: who would want to?
await nulls_detected.wait()
await bf_done.wait()
# TODO: maybe start history anal and load missing "history
# gaps" via backend..
# if len(hist_shm.array) < 2: # if len(hist_shm.array) < 2:
# TODO: there's an edge case here to solve where if the last # TODO: there's an edge case here to solve where if the last
@ -874,32 +986,29 @@ async def tsdb_backfill(
# backload any further data from tsdb (concurrently per # backload any further data from tsdb (concurrently per
# timeframe) if not all data was able to be loaded (in memory) # timeframe) if not all data was able to be loaded (in memory)
# from the ``StorageClient.load()`` call above. # from the ``StorageClient.load()`` call above.
try:
await trio.sleep_forever() await trio.sleep_forever()
finally:
return
# XXX NOTE: this is legacy from when we were using # XXX NOTE: this is legacy from when we were using
# marketstore and we needed to continue backloading # marketstore and we needed to continue backloading
# incrementally from the tsdb client.. (bc it couldn't # incrementally from the tsdb client.. (bc it couldn't
# handle a single large query with gRPC for some # handle a single large query with gRPC for some
# reason.. classic goolag pos) # reason.. classic goolag pos)
tn.start_soon( # tn.start_soon(
back_load_from_tsdb, # back_load_from_tsdb,
storemod, # storemod,
storage, # storage,
fqme, # fqme,
tsdb_history, # tsdb_history,
last_tsdb_dt, # last_tsdb_dt,
mr_start_dt, # mr_start_dt,
mr_end_dt, # mr_end_dt,
bf_done, # bf_done,
timeframe, # timeframe,
shm, # shm,
) # )
async def manage_history( async def manage_history(
@ -1007,6 +1116,11 @@ async def manage_history(
async with ( async with (
storage.open_storage_client() as (storemod, client), storage.open_storage_client() as (storemod, client),
# NOTE: this nursery spawns a task per "timeframe" (aka
# sampling period) data set since normally differently
# sampled timeseries can be loaded / process independently
# ;)
trio.open_nursery() as tn, trio.open_nursery() as tn,
): ):
log.info( log.info(
@ -1056,7 +1170,6 @@ async def manage_history(
tsdb_backfill, tsdb_backfill,
mod=mod, mod=mod,
storemod=storemod, storemod=storemod,
tn=tn,
# bus, # bus,
storage=client, storage=client,
mkt=mkt, mkt=mkt,