piker/.claude/skills/timeseries-optimization/numpy-patterns.md

4.5 KiB
Raw Blame History

NumPy Structured Array Patterns

Detailed patterns for working with NumPy structured arrays in piker's financial data processing.

Piker's OHLCV Array Dtype

# typical piker array dtype
dtype = [
    ('index', 'i8'),   # absolute sequence index
    ('time', 'f8'),    # unix epoch timestamp
    ('open', 'f8'),
    ('high', 'f8'),
    ('low', 'f8'),
    ('close', 'f8'),
    ('volume', 'f8'),
]

arr = np.array(
    [(0, 1234.0, 100, 101, 99, 100.5, 1000)],
    dtype=dtype,
)

# field access
times = arr['time']     # returns view, not copy
closes = arr['close']

Structured Array Performance Gotchas

1. Field access in loops is slow

# BAD: repeated struct field access per iteration
for i, row in enumerate(arr):
    x = row['index']    # struct access!
    y = row['close']
    process(x, y)

# GOOD: extract fields once, iterate plain arrays
indices = arr['index']  # extract once
closes = arr['close']
# zip walks both plain arrays in lockstep, so there
# is no per-row struct lookup and no manual index
for x, y in zip(indices, closes):
    process(x, y)

2. Dict comprehensions with struct arrays

# SLOW: field access per row in Python loop
time_to_row = {
    float(row['time']): {
        'index': float(row['index']),
        'close': float(row['close']),
    }
    for row in matched_rows  # struct access!
}

# FAST: extract each field once, then convert the
# whole column to native Python floats in a single
# pass (`.tolist()`), so the comprehension below
# never touches numpy scalars and yields the same
# plain `float` values as the SLOW variant.
times = matched_rows['time'].astype(float).tolist()
indices = matched_rows['index'].astype(float).tolist()
closes = matched_rows['close'].astype(float).tolist()

time_to_row = {
    t: {'index': idx, 'close': cls}
    for t, idx, cls in zip(
        times, indices, closes,
    )
}

Vectorized Boolean Operations

Basic Filtering

# single condition
recent = array[array['time'] > cutoff_time]

# multiple conditions with &, |
filtered = array[
    (array['time'] > start_time)
    &
    (array['time'] < end_time)
    &
    (array['volume'] > min_volume)
]

# IMPORTANT: parentheses required around each!
# (operator precedence: & binds tighter than >)

Fancy Indexing

# boolean mask
mask = array['close'] > array['open']  # up bars
up_bars = array[mask]

# integer indices
indices = np.array([0, 5, 10, 15])
selected = array[indices]

# combine boolean + fancy indexing
mask = array['volume'] > threshold
high_vol_indices = np.where(mask)[0]
subset = array[high_vol_indices[::2]]  # every other

Common Financial Patterns

Gap Detection

# assume sorted by time
time_diffs = np.diff(array['time'])
expected_step = 60.0  # 1-minute bars

# find gaps larger than expected
gap_mask = time_diffs > (expected_step * 1.5)
gap_indices = np.where(gap_mask)[0]

# get gap start/end times
gap_starts = array['time'][gap_indices]
gap_ends = array['time'][gap_indices + 1]

Rolling Window Operations

# simple moving average (close): convolve with a
# uniform kernel of `window` equal weights
window = 20
sma = np.convolve(
    array['close'],
    np.ones(window) / window,
    mode='valid',  # only fully-overlapping windows
)

# same SMA via stride tricks: each row of `windows`
# is one length-`window` run of closes, so other
# per-window reductions (max, std, ...) drop in.
# NOTE(review): the numpy docs caution that
# sliding_window_view can be slow; benchmark before
# assuming it beats the convolution above.
from numpy.lib.stride_tricks import (
    sliding_window_view,
)
windows = sliding_window_view(
    array['close'], window,
)
sma = windows.mean(axis=1)  # rebinds `sma` from above

OHLC Resampling (NumPy)

# resample 1m bars to 5m bars
def resample_ohlc(arr, old_step, new_step):
    '''
    Aggregate OHLCV bars sampled every `old_step` into
    coarser bars spanning `new_step`.

    `arr` is a 1d structured array with 'open', 'high',
    'low', 'close' and 'volume' fields. Every
    `new_step / old_step` consecutive rows are merged into
    one output bar: first open, max high, min low, last
    close and summed volume. Trailing rows that do not
    fill a complete output bar are dropped.

    Returns a record array holding only those five
    aggregated fields; any other input fields (eg.
    'index', 'time') are NOT carried over.

    Raises `ValueError` when either step is not positive
    or `new_step` is not a whole multiple of `old_step`.

    '''
    if old_step <= 0 or new_step <= 0:
        raise ValueError(
            f'steps must be positive, got '
            f'old_step={old_step!r} new_step={new_step!r}'
        )

    ratio = new_step / old_step
    # round instead of truncating: float steps such as
    # 0.3 / 0.1 evaluate to 2.999.. and `int()` alone
    # would silently pick the wrong bars-per-chunk.
    factor = int(round(ratio))
    if (
        factor < 1
        or abs(ratio - factor) > 1e-9 * max(1.0, ratio)
    ):
        raise ValueError(
            f'new_step={new_step!r} is not a whole multiple '
            f'of old_step={old_step!r}'
        )

    # truncate to a multiple of factor so the reshape
    # below yields only complete chunks
    n_complete = (len(arr) // factor) * factor
    chunks = arr[:n_complete].reshape(-1, factor)

    # aggregate each row of `factor` source bars
    opens = chunks[:, 0]['open']
    highs = chunks['high'].max(axis=1)
    lows = chunks['low'].min(axis=1)
    closes = chunks[:, -1]['close']
    volumes = chunks['volume'].sum(axis=1)

    return np.rec.fromarrays(
        [opens, highs, lows, closes, volumes],
        names=[
            'open', 'high', 'low',
            'close', 'volume',
        ],
    )

Memory Considerations

Views vs Copies

# VIEW: shares memory (fast, no copy)
times = array['time']         # field access
subset = array[10:20]         # slicing
# reshape hands back a view only when the memory
# layout allows it (otherwise numpy copies), and
# (-1, 2) needs an even number of rows
reshaped = array.reshape(-1, 2)

# COPY: new memory allocation
filtered = array[array['time'] > cutoff]
sorted_arr = np.sort(array)
# cast one field at a time: a multi-field struct
# array can't be `.astype()`-ed to a scalar dtype
casted = array['close'].astype(np.float32)

# force copy when needed
explicit_copy = array.copy()

In-Place Operations

# modify in-place (no new allocation)
array['close'] *= 1.01  # scale prices
# field first, then mask: the field is a view, so the
# write lands in `array` (`array[mask]` is a copy)
array['volume'][mask] = 0  # zero volume on masked rows

# careful: spelling the op out builds a temporary
# array first; the augmented form does not
array['close'] = array['close'] * 1.01  # temp!
array['close'] *= 1.01  # true in-place