piker/piker/storage/cli.py

# piker: trading gear for hackers
# Copyright (C) 2018-present  Tyler Goodlet (in stewardship of piker0)

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

"""
Storage middle-ware CLIs.

"""
from __future__ import annotations
from pathlib import Path
import time
from typing import Generator
# from typing import TYPE_CHECKING

import polars as pl
import numpy as np
import tractor
# import pendulum
from rich.console import Console
import trio
# from rich.markdown import Markdown
import typer

from piker.service import open_piker_runtime
from piker.cli import cli
from piker.config import get_conf_dir
from piker.data import (
    maybe_open_shm_array,
    def_iohlcv_fields,
    ShmArray,
)
from piker.data.history import (
    _default_hist_size,
    _default_rt_size,
)
from . import (
    log,
)
from . import (
    __tsdbs__,
    open_storage_client,
)


store = typer.Typer()


@store.command()
def ls(
    backends: list[str] = typer.Argument(
        default=None,
        help='Storage backends to query, default is all.'
    ),
):
    from rich.table import Table

    if not backends:
        backends: list[str] = __tsdbs__

    console = Console()

    async def query_all():
        nonlocal backends

        async with (
            open_piker_runtime(
                'tsdb_storage',
            ),
        ):
            for i, backend in enumerate(backends):
                table = Table()
                try:
                    async with open_storage_client(backend=backend) as (
                        mod,
                        client,
                    ):
                        table.add_column(f'{mod.name}@{client.address}')
                        keys: list[str] = await client.list_keys()
                        for key in keys:
                            table.add_row(key)

                    console.print(table)
                except Exception:
                    log.error(f'Unable to connect to storage engine: `{backend}`')

    trio.run(query_all)


@store.command()
def delete(
    symbols: list[str],

    backend: str = typer.Option(
        default=None,
        help='Storage backend to update'
    ),
    # TODO: expose this as flagged multi-option?
    timeframes: list[int] = [1, 60],
):
    '''
    Delete a storage backend's time series for (table) keys provided as
    ``symbols``.

    '''
    from . import open_storage_client

    async def main(symbols: list[str]):
        async with (
            open_piker_runtime(
                'tsdb_storage',
            ),
            open_storage_client(backend) as (_, client),
            trio.open_nursery() as n,
        ):
            # spawn queries as tasks for max conc!
            for fqme in symbols:
                for tf in timeframes:
                    n.start_soon(
                        client.delete_ts,
                        fqme,
                        tf,
                    )

    trio.run(main, symbols)


@store.command()
def anal(
    fqme: str,
    period: int = 60,

) -> np.ndarray:

    async def main():
        async with (
            open_piker_runtime(
                'tsdb_polars_anal',
                debug_mode=True,
            ),
            open_storage_client() as (mod, client),
        ):
            syms: list[str] = await client.list_keys()
            print(f'{len(syms)} FOUND for {mod.name}')

            (
                history,
                first_dt,
                last_dt,
            ) = await client.load(
                fqme,
                period,
            )
            assert first_dt < last_dt

            src_df = await client.as_df(fqme, period)
            from piker.data import _timeseries as tsmod
            df: pl.DataFrame = tsmod.with_dts(src_df)
            gaps: pl.DataFrame = tsmod.detect_time_gaps(df)

            if not gaps.is_empty():
                print(f'Gaps found:\n{gaps}')

            # TODO: something better with tab completion..
            # is there something more minimal but nearly as
            # functional as ipython?
            await tractor.pause()

    trio.run(main)


def iter_dfs_from_shms(fqme: str) -> Generator[
    tuple[Path, ShmArray, pl.DataFrame],
    None,
    None,
]:
    # shm buffer size table based on known sample rates
    sizes: dict[str, int] = {
        'hist': _default_hist_size,
        'rt': _default_rt_size,
    }

    # load all detected shm buffer files which have the
    # passed FQME pattern in the file name.
    shmfiles: list[Path] = []
    shmdir = Path('/dev/shm/')

    for shmfile in shmdir.glob(f'*{fqme}*'):
        filename: str = shmfile.name

        # skip index files
        if (
            '_first' in filename
            or '_last' in filename
        ):
            continue

        assert shmfile.is_file()
        log.debug(f'Found matching shm buffer file: {filename}')
        shmfiles.append(shmfile)

    for shmfile in shmfiles:

        # lookup array buffer size based on file suffix
        # being either .rt or .hist
        key: str = shmfile.name.rsplit('.')[-1]

        # skip FSP buffers for now..
        if key not in sizes:
            continue

        size: int = sizes[key]

        # attach to any shm buffer, load array into polars df,
        # write to local parquet file.
        shm, opened = maybe_open_shm_array(
            key=shmfile.name,
            size=size,
            dtype=def_iohlcv_fields,
            readonly=True,
        )
        assert not opened
        ohlcv = shm.array

        start = time.time()

        # XXX: thanks to this SO answer for this conversion tip:
        # https://stackoverflow.com/a/72054819
        df = pl.DataFrame({
            field_name: ohlcv[field_name]
            for field_name in ohlcv.dtype.fields
        })
        delay: float = round(
            time.time() - start,
            ndigits=6,
        )
        log.info(
            f'numpy -> polars conversion took {delay} secs\n'
            f'polars df: {df}'
        )

        yield (
            shmfile,
            shm,
            df,
        )


@store.command()
def ldshm(
    fqme: str,

    write_parquet: bool = False,

) -> None:
    '''
    Linux ONLY: load any fqme file name matching shm buffer from
    /dev/shm/ into an OHLCV numpy array and polars DataFrame,
    optionally write to .parquet file.

    '''
    async def main():
        async with (
            open_piker_runtime(
                'polars_boi',
                enable_modules=['piker.data._sharedmem'],
                debug_mode=True,
            ),
        ):
            df: pl.DataFrame | None = None
            for shmfile, shm, src_df in iter_dfs_from_shms(fqme):

                # compute ohlc properties for naming
                times: np.ndarray = shm.array['time']
                secs: float = times[-1] - times[-2]
                if secs < 1.:
                    raise ValueError(
                        f'Something is wrong with time period for {shm}:\n{times}'
                    )

                from piker.data import _timeseries as tsmod
                df: pl.DataFrame = tsmod.with_dts(src_df)
                gaps: pl.DataFrame = tsmod.detect_time_gaps(df)

                # TODO: maybe only optionally enter this depending
                # on some CLI flags and/or gap detection?
                if (
                    not gaps.is_empty()
                    or secs > 2
                ):
                    await tractor.pause()

                # write to parquet file?
                if write_parquet:
                    timeframe: str = f'{secs}s'

                    datadir: Path = get_conf_dir() / 'nativedb'
                    if not datadir.is_dir():
                        datadir.mkdir()

                    path: Path = datadir / f'{fqme}.{timeframe}.parquet'

                    # write to fs
                    start = time.time()
                    df.write_parquet(path)
                    delay: float = round(
                        time.time() - start,
                        ndigits=6,
                    )
                    log.info(
                        f'parquet write took {delay} secs\n'
                        f'file path: {path}'
                    )

                    # read back from fs
                    start = time.time()
                    read_df: pl.DataFrame = pl.read_parquet(path)
                    delay: float = round(
                        time.time() - start,
                        ndigits=6,
                    )
                    print(
                        f'parquet read took {delay} secs\n'
                        f'polars df: {read_df}'
                    )

            if df is None:
                log.error(f'No matching shm buffers for {fqme} ?')

    trio.run(main)


typer_click_object = typer.main.get_command(store)
cli.add_command(typer_click_object, 'store')