piker/piker/data/_m4.py

251 lines
7.1 KiB
Python

# piker: trading gear for hackers
# Copyright (C) Tyler Goodlet (in stewardship for pikers)
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
'''
Graphics downsampling using the infamous M4 algorithm.
This is one of ``piker``'s secret weapons allowing us to boss all other
charting platforms B)
(AND DON'T YOU DARE TAKE THIS CODE WITHOUT CREDIT OR WE'LL SUE UR F#&@* ASS).
NOTES: this method is a so called "visualization driven data
aggregation" approach. It gives error-free line chart
downsampling, see
further scientific paper resources:
- http://www.vldb.org/pvldb/vol7/p797-jugel.pdf
- http://www.vldb.org/2014/program/papers/demo/p997-jugel.pdf
Details on implementation of this algo are based in,
https://github.com/pikers/piker/issues/109
'''
import math
from typing import Optional
import numpy as np
from numba import (
njit,
# float64, optional, int64,
)
from ..log import get_logger
log = get_logger(__name__)
def ds_m4(
x: np.ndarray,
y: np.ndarray,
# units-per-pixel-x(dimension)
uppx: float,
# XXX: troll zone / easter egg..
# want to mess with ur pal, pass in the actual
# pixel width here instead of uppx-proper (i.e. pass
# in our ``pg.GraphicsObject`` derivative's ``.px_width()``
# gto mega-trip-out ur bud). Hint, it used to be implemented
# (wrongly) using "pixel width", so check the git history ;)
xrange: Optional[float] = None,
) -> tuple[int, np.ndarray, np.ndarray]:
'''
Downsample using the M4 algorithm.
This is more or less an OHLC style sampling of a line-style series.
'''
# XXX: from infinite on downsampling viewable graphics:
# "one thing i remembered about the binning - if you are
# picking a range within your timeseries the start and end bin
# should be one more bin size outside the visual range, then
# you get better visual fidelity at the edges of the graph"
# "i didn't show it in the sample code, but it's accounted for
# in the start and end indices and number of bins"
# should never get called unless actually needed
assert uppx > 1
# NOTE: if we didn't pre-slice the data to downsample
# you could in theory pass these as the slicing params,
# do we care though since we can always just pre-slice the
# input?
x_start = x[0] # x value start/lowest in domain
if xrange is None:
x_end = x[-1] # x end value/highest in domain
xrange = (x_end - x_start)
if xrange < 0:
log.error(f'-VE M4 X-RANGE: {x_start} -> {x_end}')
# XXX: broken x-range calc-case, likely the x-end points
# are wrong and have some default value set (such as
# x_end -> <some epoch float> while x_start -> 0.5).
# breakpoint()
return None
# XXX: always round up on the input pixels
# lnx = len(x)
# uppx *= max(4 / (1 + math.log(uppx, 2)), 1)
pxw = math.ceil(xrange / uppx)
# scale up the frame "width" directly with uppx
w = uppx
# ensure we make more then enough
# frames (windows) for the output pixel
frames = pxw
# if we have more and then exact integer's
# (uniform quotient output) worth of datum-domain-points
# per windows-frame, add one more window to ensure
# we have room for all output down-samples.
pts_per_pixel, r = divmod(xrange, frames)
if r:
# while r:
frames += 1
pts_per_pixel, r = divmod(xrange, frames)
# print(
# f'uppx: {uppx}\n'
# f'xrange: {xrange}\n'
# f'pxw: {pxw}\n'
# f'frames: {frames}\n'
# )
assert frames >= (xrange / uppx)
# call into ``numba``
(
nb,
x_out,
y_out,
ymn,
ymx,
) = _m4(
x,
y,
frames,
# TODO: see func below..
# x_out,
# y_out,
# first index in x data to start at
x_start,
# window size for each "frame" of data to downsample (normally
# scaled by the ratio of pixels on screen to data in x-range).
w,
)
# filter out any overshoot in the input allocation arrays by
# removing zero-ed tail entries which should start at a certain
# index.
x_out = x_out[x_out != 0]
y_out = y_out[:x_out.size]
# print(f'M4 output ymn, ymx: {ymn},{ymx}')
return nb, x_out, y_out, ymn, ymx
@njit(
nogil=True,
)
def _m4(
xs: np.ndarray,
ys: np.ndarray,
frames: int,
# TODO: using this approach, having the ``.zeros()`` alloc lines
# below in pure python, there were segs faults and alloc crashes..
# we might need to see how it behaves with shm arrays and consider
# allocating them once at startup?
# pre-alloc array of x indices mapping to the start
# of each window used for downsampling in y.
# i_win: np.ndarray,
# pre-alloc array of output downsampled y values
# y_out: np.ndarray,
x_start: int,
step: float,
) -> tuple[
int,
np.ndarray,
np.ndarray,
float,
float,
]:
'''
Implementation of the m4 algorithm in ``numba``:
http://www.vldb.org/pvldb/vol7/p797-jugel.pdf
'''
# these are pre-allocated and mutated by ``numba``
# code in-place.
y_out = np.zeros((frames, 4), ys.dtype)
x_out = np.zeros(frames, xs.dtype)
bincount = 0
x_left = x_start
# Find the first window's starting value which *includes* the
# first value in the x-domain array, i.e. the first
# "left-side-of-window" **plus** the downsampling step,
# creates a window which includes the first x **value**.
while xs[0] >= x_left + step:
x_left += step
# set all bins in the left-most entry to the starting left-most x value
# (aka a row broadcast).
x_out[bincount] = x_left
# set all y-values to the first value passed in.
y_out[bincount] = ys[0]
# full input y-data mx and mn
mx: float = -np.inf
mn: float = np.inf
# compute OHLC style max / min values per window sized x-frame.
for i in range(len(xs)):
x = xs[i]
y = ys[i]
if x < x_left + step: # the current window "step" is [bin, bin+1)
ymn = y_out[bincount, 1] = min(y, y_out[bincount, 1])
ymx = y_out[bincount, 2] = max(y, y_out[bincount, 2])
y_out[bincount, 3] = y
mx = max(mx, ymx)
mn = min(mn, ymn)
else:
# Find the next bin
while x >= x_left + step:
x_left += step
bincount += 1
x_out[bincount] = x_left
y_out[bincount] = y
return bincount, x_out, y_out, mn, mx