A PoC tsdb prototype: `parqdb` using `polars`

Turns out just (over)writing `.parquet` files with >= 1M datums takes
less than a second, and we can likely speed up appends using
`fastparquet` (usage coming soon).
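
A minimal sketch of what that append path might look like, assuming
`fastparquet`'s `write(..., append=True)` row-group appending (the
helper name and the pandas frame input are made up here; the `polars`
flow in the diff below rewrites the whole file instead):

```python
import pandas as pd
from fastparquet import write

def append_datums(path: str, new_rows: pd.DataFrame) -> None:
    '''
    Hypothetical append helper: add `new_rows` as a new row-group
    to an existing parquet file instead of rewriting it all.

    '''
    # `append=True` requires the file to already exist with a
    # matching schema.
    write(path, new_rows, append=True)
```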

Includes:
- a new `clone` CLI subcmd to test this all out via an ad-hoc copy of
  an existing `/dev/shm/<ShmArray>` buffer (literally hardcoded to a
  specific daemon-actor's shm allocation for now) which is then pushed
  to a `.parquet` file.
  - code to convert from our `ShmArray.array: np.ndarray` ->
    `polars.DataFrame` (thanks SO; see the standalone sketch after
    this list).
  - timing checks around the file IO and the np -> polars conversion.
- a `read` subcmd which I was using to test the sync `pymarketstore`
  client against our async one, to see if the issues from
  https://github.com/pikers/piker/issues/443 were resolved, but nope!
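
For reference, the SO conversion trick (as used in the `clone` cmd
below) is just building the frame column-wise from the structured
array's fields; a self-contained sketch with a made-up mini OHLCV
dtype standing in for `def_iohlcv_fields`:

```python
import numpy as np
import polars as pl

# stand-in for our `ShmArray.array`: a numpy structured array
ohlcv = np.array(
    [(1_685_000_000, 1.0, 2.0), (1_685_000_060, 2.0, 3.0)],
    dtype=[('time', 'f8'), ('open', 'f8'), ('close', 'f8')],
)

# https://stackoverflow.com/a/72054819:
# map each field name to its column array
df = pl.DataFrame({
    name: ohlcv[name]
    for name in ohlcv.dtype.fields
})
print(df)
```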

@@ -19,13 +19,18 @@ Storage middle-ware CLIs.
"""
from __future__ import annotations
from pathlib import Path
from typing import TYPE_CHECKING

import numpy as np
import pendulum
from rich.console import Console
import trio
# from rich.markdown import Markdown
import typer

from piker.service import open_piker_runtime
from piker.cli import cli
from . import (
    log,
)
@@ -44,7 +49,7 @@ def ls(
        help='Storage backends to query, default is all.'
    ),
):
    # from piker.service import open_piker_runtime
    from . import (
        __tsdbs__,
        open_storage_client,
@@ -132,7 +137,6 @@ def delete(
    ``symbols``.

    '''
    from . import open_storage_client

    async def main(symbols: list[str]):
@@ -157,5 +161,186 @@ def delete(
    trio.run(main, symbols)


@store.command()
def read(
    fqme: str,
    limit: int = int(800e3),
    client_type: str = 'async',
) -> np.ndarray:

    end: int | None = None

    if client_type == 'sync':
        import pymarketstore as pymkts
        cli = pymkts.Client()

        while end != 0:
            param = pymkts.Params(
                fqme,
                '1Min',
                'OHLCV',
                limit=limit,
                # limit_from_start=True,
                end=end,
            )
            if end is not None:
                breakpoint()

            reply = cli.query(param)
            ds: pymkts.results.DataSet = reply.first()
            array: np.ndarray = ds.array

            print(f'loaded {len(array)}-len array:\n{array}')

            times = array['Epoch']
            end: float = float(times[0])
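            # paginate backwards: this frame's oldest timestamp
            # becomes the `end` bound of the next query; the
            # `end != 0` loop check is the (assumed) end-of-history
            # sentinel.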
            dt = pendulum.from_timestamp(end)
            # end: str = dt.isoformat('T')
            breakpoint()
            print(
                f'trying to load next {limit} datums frame starting @ {dt}'
            )

    else:
        from anyio_marketstore import (  # noqa
            open_marketstore_client,
            MarketstoreClient,
            Params,
        )

        async def main():
            end: int | None = None

            async with open_marketstore_client(
                'localhost',
                5995,
            ) as client:
                while end != 0:
                    params = Params(
                        symbols=fqme,
                        # timeframe=tfstr,
                        timeframe='1Min',
                        attrgroup='OHLCV',
                        end=end,
                        # limit_from_start=True,

                        # TODO: figure the max limit here given the
                        # ``purerpc`` msg size limit: 33554432
                        limit=limit,
                    )
                    if end is not None:
                        breakpoint()

                    result = await client.query(params)
                    data_set = result.by_symbols()[fqme]
                    array = data_set.array

                    times = array['Epoch']
                    end: float = float(times[0])
                    dt = pendulum.from_timestamp(end)
                    breakpoint()
                    print(
                        f'trying to load next {limit} datums frame starting @ {dt}'
                    )

        trio.run(main)
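
# NB: with the typer app mounted on the root cli at the bottom of this
# file, the above is presumably invoked as something like
# `piker store read <fqme> --client-type sync`
# (hypothetical invocation, not verified here).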


@store.command()
def clone(
    fqme: str,
) -> None:
    import time
    from piker.config import get_conf_dir
    from piker.data import (
        maybe_open_shm_array,
        def_iohlcv_fields,
    )
    import polars as pl

    # open existing shm buffer for kucoin backend
    key: str = 'piker.brokerd[d07c9bb7-b720-41].tlosusdt.kucoin.hist'
    shmpath: Path = Path('/dev/shm') / key
    assert shmpath.is_file()

    async def main():
        async with (
            open_piker_runtime(
                'polars_boi',
                enable_modules=['piker.data._sharedmem'],
            ),
        ):
            # attach to any shm buffer, load array into polars df,
            # write to local parquet file.
            shm, opened = maybe_open_shm_array(
                key=key,
                dtype=def_iohlcv_fields,
            )
            assert not opened
            ohlcv = shm.array

            start = time.time()

            # XXX: thanks to this SO answer for this conversion tip:
            # https://stackoverflow.com/a/72054819
            df = pl.DataFrame({
                field_name: ohlcv[field_name]
                for field_name in ohlcv.dtype.fields
            })
            delay: float = round(
                time.time() - start,
                ndigits=6,
            )
            print(
                f'numpy -> polars conversion took {delay} secs\n'
                f'polars df: {df}'
            )

            # compute ohlc properties for naming: infer the sample
            # period (in secs) from the last two rows' timestamps.
            times: np.ndarray = ohlcv['time']
            secs: float = times[-1] - times[-2]
            if secs < 1.:
                breakpoint()
                raise ValueError(
                    f'Something is wrong with time period for {shm}:\n{ohlcv}'
                )

            timeframe: str = f'{secs}s'

            # write to parquet file
            datadir: Path = get_conf_dir() / 'parqdb'
            if not datadir.is_dir():
                datadir.mkdir()

            path: Path = datadir / f'{fqme}.{timeframe}.parquet'

            # write to fs
            start = time.time()
            df.write_parquet(path)
            delay: float = round(
                time.time() - start,
                ndigits=6,
            )
            print(
                f'parquet write took {delay} secs\n'
                f'file path: {path}'
            )

            # read back from fs
            start = time.time()
            read_df: pl.DataFrame = pl.read_parquet(path)
            delay: float = round(
                time.time() - start,
                ndigits=6,
            )
            print(
                f'parquet read took {delay} secs\n'
                f'polars df: {read_df}'
            )

    trio.run(main)


typer_click_object = typer.main.get_command(store)
cli.add_command(typer_click_object, 'store')