Add (ostensibly) working first attempt at M4 algo

All the refs are in the comments and original sample code from infinite
has been reworked to expect the input x/y arrays to already be sliced
(though we can later support passing in the start-end indexes if
desired).

The new routines are `ds_m4()` the python top level API and `_m4()` the
fast `numba` implementation.
big_data_lines
Tyler Goodlet 2022-03-15 09:06:35 -04:00
parent e7481b1634
commit 39b7c9340d
1 changed files with 131 additions and 72 deletions

View File

@ -19,13 +19,13 @@ Graphics related downsampling routines for compressing to pixel
limits on the display device. limits on the display device.
''' '''
# from typing import Optional from typing import Optional
import numpy as np import numpy as np
# from numpy.lib.recfunctions import structured_to_unstructured # from numpy.lib.recfunctions import structured_to_unstructured
from numba import ( from numba import (
jit, jit,
float64, optional, int64, # float64, optional, int64,
) )
from ..log import get_logger from ..log import get_logger
@ -36,7 +36,7 @@ log = get_logger(__name__)
def hl2mxmn( def hl2mxmn(
ohlc: np.ndarray, ohlc: np.ndarray,
downsample_by: int = 0, # downsample_by: int = 0,
) -> np.ndarray: ) -> np.ndarray:
''' '''
@ -61,17 +61,19 @@ def hl2mxmn(
trace_hl(hls, mxmn, x, index[0]) trace_hl(hls, mxmn, x, index[0])
x = x + index[0] x = x + index[0]
if downsample_by < 2:
return mxmn, x return mxmn, x
dsx, dsy = downsample( # if downsample_by < 2:
y=mxmn, # return mxmn, x
x=x,
bins=downsample_by, # dsx, dsy = downsample(
) # y=mxmn,
log.info(f'downsampling by {downsample_by}') # x=x,
print(f'downsampling by {downsample_by}') # bins=downsample_by,
return dsy, dsx # )
# log.info(f'downsampling by {downsample_by}')
# print(f'downsampling by {downsample_by}')
# return dsy, dsx
@jit( @jit(
@ -129,8 +131,11 @@ def downsample(
x: np.ndarray, x: np.ndarray,
y: np.ndarray, y: np.ndarray,
bins: int = 2, bins: int = 2,
method: str = 'peak', method: str = 'peak',
**kwargs,
) -> tuple[np.ndarray, np.ndarray]: ) -> tuple[np.ndarray, np.ndarray]:
''' '''
Downsample x/y data for lesser curve graphics gen. Downsample x/y data for lesser curve graphics gen.
@ -142,7 +147,6 @@ def downsample(
# py3.10 syntax # py3.10 syntax
match method: match method:
case 'peak': case 'peak':
# breakpoint()
if bins < 2: if bins < 2:
log.warning('No downsampling taking place?') log.warning('No downsampling taking place?')
@ -164,11 +168,37 @@ def downsample(
return x, y return x, y
# TODO: this algo from infinite, see case 'm4':
# https://github.com/pikers/piker/issues/109 return ds_m4(x, y, kwargs['px_width'])
case 'infinite_4px':
# Ex. from infinite on downsampling viewable graphics.
def ds_m4(
x: np.ndarray,
y: np.ndarray,
# this is the width of the data in view
# in display-device-local pixel units.
px_width: int,
factor: Optional[int] = None,
) -> tuple[np.ndarray, np.ndarray]:
'''
Downsample using the M4 algorithm.
'''
# NOTE: this method is a so called "visualization driven data
# aggregation" approach. It gives error-free line chart
# downsampling, see
# further scientific paper resources:
# - http://www.vldb.org/pvldb/vol7/p797-jugel.pdf
# - http://www.vldb.org/2014/program/papers/demo/p997-jugel.pdf
# Details on implementation of this algo are based in,
# https://github.com/pikers/piker/issues/109
# XXX: from infinite on downsampling viewable graphics:
# "one thing i remembered about the binning - if you are # "one thing i remembered about the binning - if you are
# picking a range within your timeseries the start and end bin # picking a range within your timeseries the start and end bin
# should be one more bin size outside the visual range, then # should be one more bin size outside the visual range, then
@ -176,75 +206,104 @@ def downsample(
# "i didn't show it in the sample code, but it's accounted for # "i didn't show it in the sample code, but it's accounted for
# in the start and end indices and number of bins" # in the start and end indices and number of bins"
def build_subchart( assert px_width > 1 # width of screen in pxs?
self,
subchart,
width, # width of screen in pxs?
chart_type,
lower, # x start?
upper, # x end?
xvals,
yvals
):
pts_per_pixel = len(xvals) / width
if pts_per_pixel > 1:
# this is mutated in-place # NOTE: if we didn't pre-slice the data to downsample
data = np.zeros((width, 4), yvals.dtype) # you could in theory pass these as the slicing params,
bins = np.zeros(width, xvals.dtype) # do we care though since we can always just pre-slice the
# input?
x_start = 0 # x index start
x_end = len(x) # x index end
nb = subset_by_x( # uppx: units-per-pixel
xvals, pts_per_pixel = len(x) / px_width
yvals, print(f'UPPX: {pts_per_pixel}')
bins,
data, # ratio of indexed x-value to width of raster in pixels.
lower, if factor is None:
# this is scaling the x-range by w = (x_end-x_start) / float(px_width)
# the width of the screen? print(f' pts/pxs = {w}')
(upper-lower)/float(width), else:
w = factor
# these are pre-allocated and mutated by ``numba``
# code in-place.
ds = np.zeros((px_width, 4), y.dtype)
i_win = np.zeros(px_width, x.dtype)
# call into ``numba``
nb = _m4(
x,
y,
i_win,
ds,
# first index in x data to start at
x_start,
# window size for each "frame" of data to downsample (normally
# scaled by the ratio of pixels on screen to data in x-range).
w,
) )
print(f'downsampled to {nb} bins') print(f'downsampled to {nb} bins')
return x, y return i_win, ds.flatten()
@jit(nopython=True) @jit(
def subset_by_x( nopython=True,
)
def _m4(
xs: np.ndarray, xs: np.ndarray,
ys: np.ndarray, ys: np.ndarray,
bins: np.ndarray,
data: np.ndarray, # pre-alloc array of x indices mapping to the start
# of each window used for downsampling in y.
i_win: np.ndarray,
# pre-alloc array of output downsampled y values
ds: np.ndarray,
x_start: int, x_start: int,
step: float, step: float,
) -> int: ) -> int:
# nbins = len(bins) # nbins = len(i_win)
count = len(xs) # count = len(xs)
bincount = 0 bincount = 0
x_left = x_start x_left = x_start
# Find the first bin # Find the first window's starting index which *includes* the
# first value in the x-domain array.
# (this allows passing in an array which is indexed (and thus smaller then)
# the ``x_start`` value normally passed in - say if you normally
# want to start 0-indexed.
first = xs[0] first = xs[0]
while first >= x_left + step: while first >= x_left + step:
x_left += step x_left += step
bins[bincount] = x_left # set all bins in the left-most entry to the starting left-most x value
data[bincount] = ys[0] # (aka a row broadcast).
i_win[bincount] = x_left
# set all y-values to the first value passed in.
ds[bincount] = ys[0]
for i in range(count): for i in range(len(xs)):
x = xs[i] x = xs[i]
y = ys[i] y = ys[i]
if x < x_left + step: # Interval is [bin, bin+1) if x < x_left + step: # the current window "step" is [bin, bin+1)
data[bincount, 1] = min(y, data[bincount, 1]) ds[bincount, 1] = min(y, ds[bincount, 1])
data[bincount, 2] = max(y, data[bincount, 2]) ds[bincount, 2] = max(y, ds[bincount, 2])
data[bincount, 3] = y ds[bincount, 3] = y
else: else:
# Find the next bin # Find the next bin
while x >= x_left + step: while x >= x_left + step:
x_left += step x_left += step
bincount += 1 bincount += 1
bins[bincount] = x_left i_win[bincount] = x_left
data[bincount] = y ds[bincount] = y
return bincount return bincount