Fix .parquet filenaming..

Apparently `.storage.nativedb.mk_ohlcv_shm_keyed_filepath()` was always
kinda broken if you passed in a `period: float` with an actual non-`int`
to the format string? Fixed it to strictly cast to `int()` before
str-ifying so that you don't get weird `60.0s.parquet` in there..

Further this rejigs the `store ldshm` gap correction-annotation loop to,
- use `StorageClient.write_ohlcv()` instead of hackily re-implementing
  it.. now that problem from above is fixed!
- use a `needs_correction: bool` var to determine if gap markup and
  de-duplicated data should be pushed to the shm buffer,
- go back to using `AnnotCtl.add_rect()` for all detected gaps such that
  they all persist (and thus are shown together) until the client
  disconnects.
distribute_dis
Tyler Goodlet 2023-12-26 17:14:26 -05:00
parent 1d7e97a295
commit a86573b5a2
2 changed files with 151 additions and 110 deletions

View File

@ -20,8 +20,12 @@ Storage middle-ware CLIs.
""" """
from __future__ import annotations from __future__ import annotations
# from datetime import datetime # from datetime import datetime
# from contextlib import (
# AsyncExitStack,
# )
from pathlib import Path from pathlib import Path
import time import time
from types import ModuleType
import polars as pl import polars as pl
import numpy as np import numpy as np
@ -34,7 +38,6 @@ import typer
from piker.service import open_piker_runtime from piker.service import open_piker_runtime
from piker.cli import cli from piker.cli import cli
from piker.config import get_conf_dir
from piker.data import ( from piker.data import (
ShmArray, ShmArray,
) )
@ -45,6 +48,7 @@ from . import (
from . import ( from . import (
__tsdbs__, __tsdbs__,
open_storage_client, open_storage_client,
StorageClient,
) )
@ -232,7 +236,8 @@ def anal(
@store.command() @store.command()
def ldshm( def ldshm(
fqme: str, fqme: str,
write_parquet: bool = False, write_parquet: bool = True,
reload_parquet_to_shm: bool = True,
) -> None: ) -> None:
''' '''
@ -242,15 +247,32 @@ def ldshm(
''' '''
async def main(): async def main():
from piker.ui._remote_ctl import (
open_annot_ctl,
AnnotCtl,
)
actl: AnnotCtl
mod: ModuleType
client: StorageClient
async with ( async with (
open_piker_runtime( open_piker_runtime(
'polars_boi', 'polars_boi',
enable_modules=['piker.data._sharedmem'], enable_modules=['piker.data._sharedmem'],
debug_mode=True, debug_mode=True,
), ),
open_storage_client() as (
mod,
client,
),
open_annot_ctl() as actl,
): ):
df: pl.DataFrame | None = None shm_df: pl.DataFrame | None = None
for shmfile, shm, shm_df in tsp.iter_dfs_from_shms(fqme): for (
shmfile,
shm,
# parquet_path,
shm_df,
) in tsp.iter_dfs_from_shms(fqme):
# compute ohlc properties for naming # compute ohlc properties for naming
times: np.ndarray = shm.array['time'] times: np.ndarray = shm.array['time']
@ -275,122 +297,136 @@ def ldshm(
period=period_s, period=period_s,
) )
# TODO: maybe only optionally enter this depending needs_correction: bool = (
# on some CLI flags and/or gap detection?
if (
not gaps.is_empty() not gaps.is_empty()
or null_segs or null_segs
): )
from piker.ui._remote_ctl import ( # TODO: maybe only optionally enter this depending
open_annot_ctl, # on some CLI flags and/or gap detection?
AnnotCtl, if needs_correction:
) for i in range(gaps.height):
annot_ctl: AnnotCtl row: pl.DataFrame = gaps[i]
async with open_annot_ctl() as annot_ctl:
for i in range(gaps.height):
row: pl.DataFrame = gaps[i] # TODO: can we eventually remove this
# once we figure out why the epoch cols
# don't match?
iend: int = row['index'][0]
# dt: datetime = row['dt'][0]
# dt_prev: datetime = row['dt_prev'][0]
# TODO: can we eventually remove this # the gap's right-most bar's OPEN value
# once we figure out why the epoch cols # at that time (sample) step.
# don't match? # dt_end_t: float = dt.timestamp()
iend: int = row['index'][0]
# dt: datetime = row['dt'][0]
# dt_prev: datetime = row['dt_prev'][0]
# the gap's right-most bar's OPEN value # TODO: FIX HOW/WHY these aren't matching
# at that time (sample) step. # and are instead off by 4hours (EST
# dt_end_t: float = dt.timestamp() # vs. UTC?!?!)
# end_t: float = row['time']
# assert (
# dt.timestamp()
# ==
# end_t
# )
# TODO: FIX HOW/WHY these aren't matching # the gap's left-most bar's CLOSE value
# and are instead off by 4hours (EST # at that time (sample) step.
# vs. UTC?!?!) prev_r: pl.DataFrame = df.filter(
# end_t: float = row['time'] pl.col('index') == iend - 1
# assert ( )
# dt.timestamp() istart: int = prev_r['index'][0]
# == # dt_start_t: float = dt_prev.timestamp()
# end_t
# )
# the gap's left-most bar's CLOSE value # start_t: float = prev_r['time']
# at that time (sample) step. # assert (
# dt_start_t
# ==
# start_t
# )
prev_r: pl.DataFrame = df.filter( # TODO: implement px-col width measure
pl.col('index') == gaps[0]['index'] - 1 # and ensure at least as many px-cols
# shown per rect as configured by user.
gap_w: float = abs((iend - istart))
if gap_w < 6:
margin: float = 6
iend += margin
istart -= margin
ro: tuple[float, float] = (
# dt_end_t,
iend,
row['open'][0],
)
lc: tuple[float, float] = (
# dt_start_t,
istart,
prev_r['close'][0],
)
# async with actl.open_rect(
# ) as aid:
aid: int = await actl.add_rect(
fqme=fqme,
timeframe=period_s,
start_pos=lc,
end_pos=ro,
)
assert aid
# write to parquet file?
if (
write_parquet
):
# write to fs
start = time.time()
path: Path = await client.write_ohlcv(
fqme,
ohlcv=deduped,
timeframe=period_s,
)
write_delay: float = round(
time.time() - start,
ndigits=6,
)
# read back from fs
start = time.time()
read_df: pl.DataFrame = pl.read_parquet(path)
read_delay: float = round(
time.time() - start,
ndigits=6,
)
log.info(
f'parquet write took {write_delay} secs\n'
f'file path: {path}'
f'parquet read took {read_delay} secs\n'
f'polars df: {read_df}'
)
if reload_parquet_to_shm:
new = tsp.pl2np(
deduped,
dtype=shm.array.dtype,
) )
istart: int = prev_r['index'][0] # since normally readonly
# dt_start_t: float = dt_prev.timestamp() shm._array.setflags(
write=int(1),
# start_t: float = prev_r['time']
# assert (
# dt_start_t
# ==
# start_t
# )
# TODO: implement px-col width measure
# and ensure at least as many px-cols
# shown per rect as configured by user.
gap_w: float = abs((iend - istart))
# await tractor.pause()
if gap_w < 6:
margin: float = 6
iend += margin
istart -= margin
ro: tuple[float, float] = (
# dt_end_t,
iend,
row['open'][0],
) )
lc: tuple[float, float] = ( shm.push(
# dt_start_t, new,
istart, prepend=True,
prev_r['close'][0], start=new['index'][-1],
update_first=False, # don't update ._first
) )
aid: int = await annot_ctl.add_rect( await tractor.pause()
fqme=fqme, assert diff
timeframe=period_s,
start_pos=lc,
end_pos=ro,
)
assert aid
await tractor.pause()
# write to parquet file? else:
if write_parquet: # allow interaction even when no ts problems.
timeframe: str = f'{period_s}s' await tractor.pause()
assert not diff
datadir: Path = get_conf_dir() / 'nativedb'
if not datadir.is_dir():
datadir.mkdir()
path: Path = datadir / f'{fqme}.{timeframe}.parquet'
# write to fs
start = time.time()
df.write_parquet(path)
delay: float = round(
time.time() - start,
ndigits=6,
)
log.info(
f'parquet write took {delay} secs\n'
f'file path: {path}'
)
# read back from fs
start = time.time()
read_df: pl.DataFrame = pl.read_parquet(path)
delay: float = round(
time.time() - start,
ndigits=6,
)
print(
f'parquet read took {delay} secs\n'
f'polars df: {read_df}'
)
if df is None: if df is None:
log.error(f'No matching shm buffers for {fqme} ?') log.error(f'No matching shm buffers for {fqme} ?')

View File

@ -95,16 +95,19 @@ def detect_period(shm: ShmArray) -> float:
def mk_ohlcv_shm_keyed_filepath( def mk_ohlcv_shm_keyed_filepath(
fqme: str, fqme: str,
period: float, # ow known as the "timeframe" period: float | int, # ow known as the "timeframe"
datadir: Path, datadir: Path,
) -> str: ) -> Path:
if period < 1.: if period < 1.:
raise ValueError('Sample period should be >= 1.!?') raise ValueError('Sample period should be >= 1.!?')
period_s: str = f'{period}s' path: Path = (
path: Path = datadir / f'{fqme}.ohlcv{period_s}.parquet' datadir
/
f'{fqme}.ohlcv{int(period)}s.parquet'
)
return path return path
@ -227,6 +230,7 @@ class NativeStorageClient:
self, self,
fqme: str, fqme: str,
period: float, period: float,
) -> Path: ) -> Path:
return mk_ohlcv_shm_keyed_filepath( return mk_ohlcv_shm_keyed_filepath(
fqme=fqme, fqme=fqme,
@ -239,6 +243,7 @@ class NativeStorageClient:
fqme: str, fqme: str,
df: pl.DataFrame, df: pl.DataFrame,
timeframe: float, timeframe: float,
) -> None: ) -> None:
# cache df for later usage since we (currently) need to # cache df for later usage since we (currently) need to
# convert to np.ndarrays to push to our `ShmArray` rt # convert to np.ndarrays to push to our `ShmArray` rt