Fix `.parquet` file naming..

Apparently `.storage.nativedb.mk_ohlcv_shm_keyed_filepath()` was always
kinda broken: if you passed in a `period: float` with an actual
non-`int` value, it leaked right into the format string. Fixed it to
strictly cast to `int()` before str-ifying so that you don't get a
weird `60.0s.parquet` suffix in there..
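A quick repro sketch of the bug/fix (hypothetical `fqme` and `period`
values here, not the real call-site):

    period: float = 60.0  # 1m sample period, but as a float
    fqme: str = 'mnq.cme.ib'

    # before: the float leaks straight into the file name..
    assert f'{fqme}.ohlcv{period}s.parquet' == 'mnq.cme.ib.ohlcv60.0s.parquet'

    # after: strict `int()` cast before str-ifying.
    assert f'{fqme}.ohlcv{int(period)}s.parquet' == 'mnq.cme.ib.ohlcv60s.parquet'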

Further, this rejigs the `store ldshm` gap correction-annotation loop to,
- use `StorageClient.write_ohlcv()` instead of hackily re-implementing
  it.. now that the problem from above is fixed!
- use a `needs_correction: bool` var to determine if gap markup and
  de-duplicated data should be pushed to the shm buffer (see the sketch
  below),
- go back to using `AnnotCtl.add_rect()` for all detected gaps such that
  they all persist (and thus are shown together) until the client
  disconnects.
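
A minimal sketch of the new gating flow (the gap frame and `null_segs`
are hypothetical stand-ins; the real annot/write calls need the live
runtime):

    import polars as pl

    # pretend dedupe/gap-detection output with one detected gap
    gaps = pl.DataFrame({'index': [1024], 'open': [4321.25]})
    null_segs = None

    needs_correction: bool = (
        not gaps.is_empty()
        or bool(null_segs)
    )
    if needs_correction:
        for i in range(gaps.height):
            row: pl.DataFrame = gaps[i]
            # real loop: `await actl.add_rect(...)` per gap, then a
            # single `await client.write_ohlcv(...)` + shm re-load.
            print(f"would annotate gap ending at index {row['index'][0]}")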
branch: distribute_dis
Tyler Goodlet 2023-12-26 17:14:26 -05:00
parent 1d7e97a295
commit a86573b5a2
2 changed files with 151 additions and 110 deletions

piker/storage/cli.py

@@ -20,8 +20,12 @@ Storage middle-ware CLIs.
 """
 from __future__ import annotations
+# from datetime import datetime
+# from contextlib import (
+#     AsyncExitStack,
+# )
 from pathlib import Path
 import time
 from types import ModuleType
 import polars as pl
 import numpy as np
@@ -34,7 +38,6 @@ import typer
 from piker.service import open_piker_runtime
 from piker.cli import cli
-from piker.config import get_conf_dir
 from piker.data import (
     ShmArray,
 )
@@ -45,6 +48,7 @@ from . import (
 from . import (
     __tsdbs__,
     open_storage_client,
+    StorageClient,
 )
@@ -232,7 +236,8 @@ def anal(
 @store.command()
 def ldshm(
     fqme: str,
-    write_parquet: bool = False,
+    write_parquet: bool = True,
+    reload_parquet_to_shm: bool = True,
 ) -> None:
     '''
@@ -242,15 +247,32 @@
     '''
     async def main():
+        from piker.ui._remote_ctl import (
+            open_annot_ctl,
+            AnnotCtl,
+        )
+        actl: AnnotCtl
+        mod: ModuleType
+        client: StorageClient
         async with (
             open_piker_runtime(
                 'polars_boi',
                 enable_modules=['piker.data._sharedmem'],
+                debug_mode=True,
             ),
             open_storage_client() as (
                 mod,
                 client,
             ),
+            open_annot_ctl() as actl,
         ):
-            df: pl.DataFrame | None = None
-            for shmfile, shm, shm_df in tsp.iter_dfs_from_shms(fqme):
+            shm_df: pl.DataFrame | None = None
+            for (
+                shmfile,
+                shm,
+                # parquet_path,
+                shm_df,
+            ) in tsp.iter_dfs_from_shms(fqme):

                 # compute ohlc properties for naming
                 times: np.ndarray = shm.array['time']
@@ -275,122 +297,136 @@
                     period=period_s,
                 )

-                # TODO: maybe only optionally enter this depending
-                # on some CLI flags and/or gap detection?
-                if (
+                needs_correction: bool = (
                     not gaps.is_empty()
                     or null_segs
-                ):
-                    from piker.ui._remote_ctl import (
-                        open_annot_ctl,
-                        AnnotCtl,
-                    )
-                    annot_ctl: AnnotCtl
-                    async with open_annot_ctl() as annot_ctl:
-                        for i in range(gaps.height):
-
-                            row: pl.DataFrame = gaps[i]
+                )
+                # TODO: maybe only optionally enter this depending
+                # on some CLI flags and/or gap detection?
+                if needs_correction:
+                    for i in range(gaps.height):
+                        row: pl.DataFrame = gaps[i]

-                            # TODO: can we eventually remove this
-                            # once we figure out why the epoch cols
-                            # don't match?
-                            iend: int = row['index'][0]
-                            # dt: datetime = row['dt'][0]
-                            # dt_prev: datetime = row['dt_prev'][0]
+                        # TODO: can we eventually remove this
+                        # once we figure out why the epoch cols
+                        # don't match?
+                        iend: int = row['index'][0]
+                        # dt: datetime = row['dt'][0]
+                        # dt_prev: datetime = row['dt_prev'][0]

-                            # the gap's right-most bar's OPEN value
-                            # at that time (sample) step.
-                            # dt_end_t: float = dt.timestamp()
+                        # the gap's right-most bar's OPEN value
+                        # at that time (sample) step.
+                        # dt_end_t: float = dt.timestamp()

-                            # TODO: FIX HOW/WHY these aren't matching
-                            # and are instead off by 4hours (EST
-                            # vs. UTC?!?!)
-                            # end_t: float = row['time']
-                            # assert (
-                            #     dt.timestamp()
-                            #     ==
-                            #     end_t
-                            # )
+                        # TODO: FIX HOW/WHY these aren't matching
+                        # and are instead off by 4hours (EST
+                        # vs. UTC?!?!)
+                        # end_t: float = row['time']
+                        # assert (
+                        #     dt.timestamp()
+                        #     ==
+                        #     end_t
+                        # )

-                            # the gap's left-most bar's CLOSE value
-                            # at that time (sample) step.
-                            prev_r: pl.DataFrame = df.filter(
-                                pl.col('index') == gaps[0]['index'] - 1
-                            )
-                            istart: int = prev_r['index'][0]
-                            # dt_start_t: float = dt_prev.timestamp()
-                            # start_t: float = prev_r['time']
-                            # assert (
-                            #     dt_start_t
-                            #     ==
-                            #     start_t
-                            # )
+                        # the gap's left-most bar's CLOSE value
+                        # at that time (sample) step.
+                        prev_r: pl.DataFrame = df.filter(
+                            pl.col('index') == iend - 1
+                        )
+                        istart: int = prev_r['index'][0]
+                        # dt_start_t: float = dt_prev.timestamp()
+                        # start_t: float = prev_r['time']
+                        # assert (
+                        #     dt_start_t
+                        #     ==
+                        #     start_t
+                        # )

-                            # TODO: implement px-col width measure
-                            # and ensure at least as many px-cols
-                            # shown per rect as configured by user.
-                            gap_w: float = abs((iend - istart))
-                            # await tractor.pause()
-                            if gap_w < 6:
-                                margin: float = 6
-                                iend += margin
-                                istart -= margin
+                        # TODO: implement px-col width measure
+                        # and ensure at least as many px-cols
+                        # shown per rect as configured by user.
+                        gap_w: float = abs((iend - istart))
+                        if gap_w < 6:
+                            margin: float = 6
+                            iend += margin
+                            istart -= margin

-                            ro: tuple[float, float] = (
-                                # dt_end_t,
-                                iend,
-                                row['open'][0],
-                            )
-                            lc: tuple[float, float] = (
-                                # dt_start_t,
-                                istart,
-                                prev_r['close'][0],
-                            )
+                        ro: tuple[float, float] = (
+                            # dt_end_t,
+                            iend,
+                            row['open'][0],
+                        )
+                        lc: tuple[float, float] = (
+                            # dt_start_t,
+                            istart,
+                            prev_r['close'][0],
+                        )

-                            aid: int = await annot_ctl.add_rect(
-                                fqme=fqme,
-                                timeframe=period_s,
-                                start_pos=lc,
-                                end_pos=ro,
-                            )
-                            assert aid
-                            await tractor.pause()
+                        # async with actl.open_rect(
+                        # ) as aid:
+                        aid: int = await actl.add_rect(
+                            fqme=fqme,
+                            timeframe=period_s,
+                            start_pos=lc,
+                            end_pos=ro,
+                        )
+                        assert aid

-                # write to parquet file?
-                if write_parquet:
-                    timeframe: str = f'{period_s}s'
-
-                    datadir: Path = get_conf_dir() / 'nativedb'
-                    if not datadir.is_dir():
-                        datadir.mkdir()
-
-                    path: Path = datadir / f'{fqme}.{timeframe}.parquet'
-
-                    # write to fs
-                    start = time.time()
-                    df.write_parquet(path)
-                    delay: float = round(
-                        time.time() - start,
-                        ndigits=6,
-                    )
-                    log.info(
-                        f'parquet write took {delay} secs\n'
-                        f'file path: {path}'
-                    )
-
-                    # read back from fs
-                    start = time.time()
-                    read_df: pl.DataFrame = pl.read_parquet(path)
-                    delay: float = round(
-                        time.time() - start,
-                        ndigits=6,
-                    )
-                    print(
-                        f'parquet read took {delay} secs\n'
-                        f'polars df: {read_df}'
-                    )
+                    # write to parquet file?
+                    if (
+                        write_parquet
+                    ):
+                        # write to fs
+                        start = time.time()
+                        path: Path = await client.write_ohlcv(
+                            fqme,
+                            ohlcv=deduped,
+                            timeframe=period_s,
+                        )
+                        write_delay: float = round(
+                            time.time() - start,
+                            ndigits=6,
+                        )
+
+                        # read back from fs
+                        start = time.time()
+                        read_df: pl.DataFrame = pl.read_parquet(path)
+                        read_delay: float = round(
+                            time.time() - start,
+                            ndigits=6,
+                        )
+                        log.info(
+                            f'parquet write took {write_delay} secs\n'
+                            f'file path: {path}'
+                            f'parquet read took {read_delay} secs\n'
+                            f'polars df: {read_df}'
+                        )
+
+                        if reload_parquet_to_shm:
+                            new = tsp.pl2np(
+                                deduped,
+                                dtype=shm.array.dtype,
+                            )
+                            # since normally readonly
+                            shm._array.setflags(
+                                write=int(1),
+                            )
+                            shm.push(
+                                new,
+                                prepend=True,
+                                start=new['index'][-1],
+                                update_first=False,  # don't update ._first
+                            )
+
+                        await tractor.pause()
+                        assert diff
+
+                else:
+                    # allow interaction even when no ts problems.
+                    await tractor.pause()
+                    assert not diff

         if df is None:
             log.error(f'No matching shm buffers for {fqme} ?')

piker/storage/nativedb.py

@@ -95,16 +95,19 @@ def detect_period(shm: ShmArray) -> float:
 def mk_ohlcv_shm_keyed_filepath(
     fqme: str,
-    period: float,  # ow known as the "timeframe"
+    period: float | int,  # ow known as the "timeframe"
     datadir: Path,
-) -> str:
+) -> Path:

     if period < 1.:
         raise ValueError('Sample period should be >= 1.!?')

-    period_s: str = f'{period}s'
-    path: Path = datadir / f'{fqme}.ohlcv{period_s}.parquet'
+    path: Path = (
+        datadir
+        /
+        f'{fqme}.ohlcv{int(period)}s.parquet'
+    )
     return path
@@ -227,6 +230,7 @@ class NativeStorageClient:
         self,
         fqme: str,
         period: float,
+
     ) -> Path:
         return mk_ohlcv_shm_keyed_filepath(
             fqme=fqme,
@@ -239,6 +243,7 @@
         fqme: str,
         df: pl.DataFrame,
         timeframe: float,
+
     ) -> None:
         # cache df for later usage since we (currently) need to
         # convert to np.ndarrays to push to our `ShmArray` rt
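
For reference, a condensed re-statement of the fixed path builder with
a round-trip check (hypothetical `fqme`/`datadir` values; mirrors the
diff above, not a verbatim copy):

    from pathlib import Path

    def mk_ohlcv_shm_keyed_filepath(
        fqme: str,
        period: float | int,  # ow known as the "timeframe"
        datadir: Path,
    ) -> Path:
        if period < 1.:
            raise ValueError('Sample period should be >= 1.!?')
        # strict `int()` cast keeps float periods out of the name.
        return datadir / f'{fqme}.ohlcv{int(period)}s.parquet'

    # float and int periods now key the same file:
    datadir = Path('/tmp/nativedb')
    assert (
        mk_ohlcv_shm_keyed_filepath('mnq.cme.ib', 60.0, datadir)
        ==
        mk_ohlcv_shm_keyed_filepath('mnq.cme.ib', 60, datadir)
    )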