# NumPy Structured Array Patterns

Detailed patterns for working with NumPy structured
arrays in piker's financial data processing.
## Piker's OHLCV Array Dtype

```python
# typical piker array dtype
dtype = [
    ('index', 'i8'),   # absolute sequence index
    ('time', 'f8'),    # unix epoch timestamp
    ('open', 'f8'),
    ('high', 'f8'),
    ('low', 'f8'),
    ('close', 'f8'),
    ('volume', 'f8'),
]

arr = np.array(
    [(0, 1234.0, 100, 101, 99, 100.5, 1000)],
    dtype=dtype,
)

# field access
times = arr['time']    # returns view, not copy
closes = arr['close']
```
## Structured Array Performance Gotchas

### 1. Field access in loops is slow

```python
# BAD: repeated struct field access per iteration
for i, row in enumerate(arr):
    x = row['index']  # struct access!
    y = row['close']
    process(x, y)

# GOOD: extract fields once, iterate plain arrays
indices = arr['index']  # extract once
closes = arr['close']
for i in range(len(arr)):
    x = indices[i]  # plain array indexing
    y = closes[i]
    process(x, y)
```
### 2. Dict comprehensions with struct arrays

```python
# SLOW: field access per row in Python loop
time_to_row = {
    float(row['time']): {
        'index': float(row['index']),
        'close': float(row['close']),
    }
    for row in matched_rows  # struct access!
}

# FAST: extract to plain arrays first
times = matched_rows['time'].astype(float)
indices = matched_rows['index'].astype(float)
closes = matched_rows['close'].astype(float)

time_to_row = {
    t: {'index': idx, 'close': cls}
    for t, idx, cls in zip(
        times, indices, closes,
    )
}
```
## Vectorized Boolean Operations

### Basic Filtering

```python
# single condition
recent = array[array['time'] > cutoff_time]

# multiple conditions with &, |
filtered = array[
    (array['time'] > start_time)
    &
    (array['time'] < end_time)
    &
    (array['volume'] > min_volume)
]

# IMPORTANT: parentheses required around each comparison!
# (operator precedence: & binds tighter than comparisons like >)
```

### Fancy Indexing

```python
# boolean mask
mask = array['close'] > array['open']  # up bars
up_bars = array[mask]

# integer indices
indices = np.array([0, 5, 10, 15])
selected = array[indices]

# combine boolean + fancy indexing
mask = array['volume'] > threshold
high_vol_indices = np.where(mask)[0]
subset = array[high_vol_indices[::2]]  # every other
```
## Common Financial Patterns

### Gap Detection

```python
# assume sorted by time
time_diffs = np.diff(array['time'])
expected_step = 60.0  # 1-minute bars

# find gaps larger than expected
gap_mask = time_diffs > (expected_step * 1.5)
gap_indices = np.where(gap_mask)[0]

# get gap start/end times
gap_starts = array['time'][gap_indices]
gap_ends = array['time'][gap_indices + 1]
```

### Rolling Window Operations

```python
# simple moving average (close)
window = 20
sma = np.convolve(
    array['close'],
    np.ones(window) / window,
    mode='valid',
)

# stride tricks for efficiency
from numpy.lib.stride_tricks import (
    sliding_window_view,
)
windows = sliding_window_view(
    array['close'], window,
)
sma = windows.mean(axis=1)
```
### OHLC Resampling (NumPy)

```python
# resample 1m bars to 5m bars
def resample_ohlc(arr, old_step, new_step):
    n_bars = len(arr)
    factor = int(new_step / old_step)

    # truncate to multiple of factor
    n_complete = (n_bars // factor) * factor
    arr = arr[:n_complete]

    # reshape into chunks
    reshaped = arr.reshape(-1, factor)

    # aggregate OHLC
    opens = reshaped[:, 0]['open']
    highs = reshaped['high'].max(axis=1)
    lows = reshaped['low'].min(axis=1)
    closes = reshaped[:, -1]['close']
    volumes = reshaped['volume'].sum(axis=1)

    # NOTE: the 'index' and 'time' fields are dropped here;
    # re-attach them (e.g. reshaped[:, 0]['time']) if callers
    # need timestamped output.
    return np.rec.fromarrays(
        [opens, highs, lows, closes, volumes],
        names=[
            'open', 'high', 'low',
            'close', 'volume',
        ],
    )
```
## Memory Considerations

### Views vs Copies

```python
# VIEW: shares memory (fast, no copy)
times = array['time']   # field access
subset = array[10:20]   # slicing
reshaped = array.reshape(-1, 2)

# COPY: new memory allocation
filtered = array[array['time'] > cutoff]
sorted_arr = np.sort(array)
casted = array.astype(np.float32)

# force copy when needed
explicit_copy = array.copy()
```

### In-Place Operations

```python
# modify in-place (no new allocation)
array['close'] *= 1.01     # scale prices
array['volume'][mask] = 0  # zero out rows

# careful: compound ops may create temporaries
array['close'] = array['close'] * 1.01  # temp!
array['close'] *= 1.01                  # true in-place
```