Move `numpy` <-> `polars` converters into `.data.tsp`

Yet again these are (going to be) generally useful in the data-proc layer, as well as going forward with (possibly) moving the history and shm rt-processing layer to apache (arrow or other) shared-ds equivalents.

branch distribute_dis
parent b94582cb35
commit 49c458710e
@@ -28,6 +28,7 @@ from math import (
     ceil,
     floor,
 )
+import time
 from typing import Literal

 import numpy as np
@@ -408,3 +409,51 @@ def dedupe(src_df: pl.DataFrame) -> tuple[
         deduped,
         was_deduped,
     )
+
+
+# NOTE: thanks to this SO answer for the below conversion routines
+# to go from numpy struct-arrays to polars dataframes and back:
+# https://stackoverflow.com/a/72054819
+def np2pl(array: np.ndarray) -> pl.DataFrame:
+    start = time.time()
+
+    # XXX: thanks to this SO answer for this conversion tip:
+    # https://stackoverflow.com/a/72054819
+    df = pl.DataFrame({
+        field_name: array[field_name]
+        for field_name in array.dtype.fields
+    })
+    delay: float = round(
+        time.time() - start,
+        ndigits=6,
+    )
+    log.info(
+        f'numpy -> polars conversion took {delay} secs\n'
+        f'polars df: {df}'
+    )
+    return df
+    # return pl.DataFrame({
+    #     field_name: array[field_name]
+    #     for field_name in array.dtype.fields
+    # })
+
+
+def pl2np(
+    df: pl.DataFrame,
+    dtype: np.dtype,
+
+) -> np.ndarray:
+
+    # Create numpy struct array of the correct size and dtype
+    # and loop through df columns to fill in array fields.
+    array = np.empty(
+        df.height,
+        dtype,
+    )
+    for field, col in zip(
+        dtype.fields,
+        df.columns,
+    ):
+        array[field] = df.get_column(col).to_numpy()
+
+    return array
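For context on what the moved routines actually do, here is a minimal self-contained round-trip sketch of the two conversions; the 2-row array and its 3-field dtype are made up for illustration and are NOT the real `def_iohlcv_fields` spec:

# Illustrative round-trip of the conversion logic added above; the
# tiny OHLCV dtype and sample rows are hypothetical.
import numpy as np
import polars as pl

ohlcv_dt = np.dtype([
    ('time', 'f8'),
    ('open', 'f8'),
    ('close', 'f8'),
])
array = np.array(
    [
        (1.0, 10.0, 10.5),
        (2.0, 10.5, 10.4),
    ],
    dtype=ohlcv_dt,
)

# numpy struct-array -> polars: one df column per dtype field
df = pl.DataFrame({
    field_name: array[field_name]
    for field_name in array.dtype.fields
})

# polars -> numpy: pre-alloc an empty struct-array of the target
# dtype, then fill it field-by-field from the df columns.
out = np.empty(df.height, ohlcv_dt)
for field, col in zip(ohlcv_dt.fields, df.columns):
    out[field] = df.get_column(col).to_numpy()

assert (out == array).all()

The field-by-field copy in both directions is what lets a fixed numpy struct dtype (as used by the shm layer) survive the trip through a polars frame unchanged.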
@@ -260,22 +260,8 @@ def iter_dfs_from_shms(fqme: str) -> Generator[
         assert not opened
         ohlcv = shm.array

-        start = time.time()
-        # XXX: thanks to this SO answer for this conversion tip:
-        # https://stackoverflow.com/a/72054819
-        df = pl.DataFrame({
-            field_name: ohlcv[field_name]
-            for field_name in ohlcv.dtype.fields
-        })
-        delay: float = round(
-            time.time() - start,
-            ndigits=6,
-        )
-        log.info(
-            f'numpy -> polars conversion took {delay} secs\n'
-            f'polars df: {df}'
-        )
-
+        from .nativedb import np2pl
+        df: pl.DataFrame = np2pl(ohlcv)

         yield (
             shmfile,
@@ -316,7 +302,6 @@ def ldshm(
             f'Something is wrong with time period for {shm}:\n{times}'
         )

-
    # over-write back to shm?
    df: pl.DataFrame  # with dts
    deduped: pl.DataFrame  # deduplicated dts
@@ -65,8 +65,11 @@ from pendulum import (
 )

 from piker import config
-from piker.data import def_iohlcv_fields
-from piker.data import ShmArray
+from piker.data import (
+    def_iohlcv_fields,
+    ShmArray,
+    tsp,
+)
 from piker.log import get_logger
 from . import TimeseriesNotFound

@@ -74,37 +77,6 @@ from . import TimeseriesNotFound
 log = get_logger('storage.nativedb')


-# NOTE: thanks to this SO answer for the below conversion routines
-# to go from numpy struct-arrays to polars dataframes and back:
-# https://stackoverflow.com/a/72054819
-def np2pl(array: np.ndarray) -> pl.DataFrame:
-    return pl.DataFrame({
-        field_name: array[field_name]
-        for field_name in array.dtype.fields
-    })
-
-
-def pl2np(
-    df: pl.DataFrame,
-    dtype: np.dtype,
-
-) -> np.ndarray:
-
-    # Create numpy struct array of the correct size and dtype
-    # and loop through df columns to fill in array fields.
-    array = np.empty(
-        df.height,
-        dtype,
-    )
-    for field, col in zip(
-        dtype.fields,
-        df.columns,
-    ):
-        array[field] = df.get_column(col).to_numpy()
-
-    return array
-
-
 def detect_period(shm: ShmArray) -> float:
     '''
     Attempt to detect the series time step sampling period
@@ -290,7 +262,7 @@ class NativeStorageClient:

         # TODO: filter by end and limit inputs
         # times: pl.Series = df['time']
-        array: np.ndarray = pl2np(
+        array: np.ndarray = tsp.pl2np(
             df,
             dtype=np.dtype(def_iohlcv_fields),
         )
@@ -326,7 +298,7 @@
             datadir=self._datadir,
         )
         if isinstance(ohlcv, np.ndarray):
-            df: pl.DataFrame = np2pl(ohlcv)
+            df: pl.DataFrame = tsp.np2pl(ohlcv)
         else:
             df = ohlcv

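Taken together, the call-site hunks above mean downstream code now reaches the converters through the `tsp` namespace re-exported from `piker.data`. A minimal sketch of the resulting usage pattern; the `roundtrip()` helper is hypothetical, but the imports and `tsp.*` calls mirror the hunks above:

# Sketch of the post-move call-site pattern (helper is hypothetical).
import numpy as np
import polars as pl

from piker.data import (
    def_iohlcv_fields,
    tsp,
)


def roundtrip(ohlcv: np.ndarray) -> np.ndarray:
    # struct-array -> df for any polars-side processing ...
    df: pl.DataFrame = tsp.np2pl(ohlcv)
    # ... then back to the fixed iohlcv dtype for shm/storage.
    return tsp.pl2np(
        df,
        dtype=np.dtype(def_iohlcv_fields),
    )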