Compare commits
No commits in common. "hist_backfill_fixes" and "main" have entirely different histories.
@@ -1187,7 +1187,7 @@ async def load_aio_clients(
    # the API TCP in `ib_insync` connection can be flaky af so instead
    # retry a few times to get the client going..
    connect_retries: int = 3,
-   connect_timeout: float = 30, # in case a remote-host
+   connect_timeout: float = 10,
    disconnect_on_exit: bool = True,

) -> dict[str, Client]:

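The flaky-connection comment above is the rationale for the retry knobs. A minimal standalone sketch (not the code in this diff; the `connect` callable and the `trio` timeout wrapper are illustrative assumptions) of how such `connect_retries`/`connect_timeout` parameters are typically consumed:

    import trio

    async def connect_with_retries(
        connect,  # async callable performing one connection attempt
        connect_retries: int = 3,
        connect_timeout: float = 10,
    ):
        # bound each attempt with a timeout and retry a few times
        # rather than trusting a single (possibly flaky) TCP handshake.
        last_err: Exception | None = None
        for _ in range(connect_retries):
            try:
                with trio.fail_after(connect_timeout):
                    return await connect()
            except (trio.TooSlowError, OSError) as err:
                last_err = err
        raise ConnectionError(
            f'no connection after {connect_retries} attempts'
        ) from last_err
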
@@ -43,6 +43,7 @@ from typing import (
import numpy as np


from .. import config
from ..service import (
    check_for_service,

@@ -151,10 +152,7 @@ class StorageConnectionError(ConnectionError):
    '''

-def get_storagemod(
-    name: str,
-
-) -> ModuleType:
+def get_storagemod(name: str) -> ModuleType:
    mod: ModuleType = import_module(
        '.' + name,
        'piker.storage',

@@ -169,10 +167,7 @@ def get_storagemod(
async def open_storage_client(
    backend: str | None = None,

-) -> tuple[
-    ModuleType,
-    StorageClient,
-]:
+) -> tuple[ModuleType, StorageClient]:
    '''
    Load the ``StorageClient`` for named backend.

@@ -272,10 +267,7 @@ async def open_tsdb_client(
    from ..data.feed import maybe_open_feed

    async with (
-        open_storage_client() as (
-            _,
-            storage,
-        ),
+        open_storage_client() as (_, storage),

        maybe_open_feed(
            [fqme],

@@ -283,7 +275,7 @@ async def open_tsdb_client(
        ) as feed,
    ):
-        profiler(f'opened feed for {fqme!r}')
+        profiler(f'opened feed for {fqme}')

        # to_append = feed.hist_shm.array
        # to_prepend = None

@@ -19,10 +19,16 @@ Storage middle-ware CLIs.
"""
from __future__ import annotations
# from datetime import datetime
# from contextlib import (
#     AsyncExitStack,
# )
from pathlib import Path
from math import copysign
import time
from types import ModuleType
from typing import (
    Any,
    TYPE_CHECKING,
)

@@ -41,6 +47,7 @@ from piker.data import (
    ShmArray,
)
from piker import tsp
+from piker.data._formatters import BGM
from . import log
from . import (
    __tsdbs__,

@@ -235,12 +242,122 @@ def anal(
    trio.run(main)


+async def markup_gaps(
+    fqme: str,
+    timeframe: float,
+    actl: AnnotCtl,
+    wdts: pl.DataFrame,
+    gaps: pl.DataFrame,
+
+) -> dict[int, dict]:
+    '''
+    Remote annotate time-gaps in a dt-fielded ts (normally OHLC)
+    with rectangles.
+
+    '''
+    aids: dict[int] = {}
+    for i in range(gaps.height):
+
+        row: pl.DataFrame = gaps[i]
+
+        # the gap's RIGHT-most bar's OPEN value
+        # at that time (sample) step.
+        iend: int = row['index'][0]
+        # dt: datetime = row['dt'][0]
+        # dt_prev: datetime = row['dt_prev'][0]
+        # dt_end_t: float = dt.timestamp()
+
+        # TODO: can we eventually remove this
+        # once we figure out why the epoch cols
+        # don't match?
+        # TODO: FIX HOW/WHY these aren't matching
+        # and are instead off by 4hours (EST
+        # vs. UTC?!?!)
+        # end_t: float = row['time']
+        # assert (
+        #     dt.timestamp()
+        #     ==
+        #     end_t
+        # )
+
+        # the gap's LEFT-most bar's CLOSE value
+        # at that time (sample) step.
+        prev_r: pl.DataFrame = wdts.filter(
+            pl.col('index') == iend - 1
+        )
+        # XXX: probably a gap in the (newly sorted or de-duplicated)
+        # dt-df, so we might need to re-index first..
+        if prev_r.is_empty():
+            await tractor.pause()
+
+        istart: int = prev_r['index'][0]
+        # dt_start_t: float = dt_prev.timestamp()
+
+        # start_t: float = prev_r['time']
+        # assert (
+        #     dt_start_t
+        #     ==
+        #     start_t
+        # )
+
+        # TODO: implement px-col width measure
+        # and ensure at least as many px-cols
+        # shown per rect as configured by user.
+        # gap_w: float = abs((iend - istart))
+        # if gap_w < 6:
+        #     margin: float = 6
+        #     iend += margin
+        #     istart -= margin
+
+        rect_gap: float = BGM*3/8
+        opn: float = row['open'][0]
+        ro: tuple[float, float] = (
+            # dt_end_t,
+            iend + rect_gap + 1,
+            opn,
+        )
+        cls: float = prev_r['close'][0]
+        lc: tuple[float, float] = (
+            # dt_start_t,
+            istart - rect_gap,  # + 1 ,
+            cls,
+        )
+
+        color: str = 'dad_blue'
+        diff: float = cls - opn
+        sgn: float = copysign(1, diff)
+        color: str = {
+            -1: 'buy_green',
+            1: 'sell_red',
+        }[sgn]
+
+        rect_kwargs: dict[str, Any] = dict(
+            fqme=fqme,
+            timeframe=timeframe,
+            start_pos=lc,
+            end_pos=ro,
+            color=color,
+        )
+
+        aid: int = await actl.add_rect(**rect_kwargs)
+        assert aid
+        aids[aid] = rect_kwargs
+
+    # tell chart to redraw all its
+    # graphics view layers Bo
+    await actl.redraw(
+        fqme=fqme,
+        timeframe=timeframe,
+    )
+    return aids


@store.command()
def ldshm(
    fqme: str,
    write_parquet: bool = True,
    reload_parquet_to_shm: bool = True,
-    pdb: bool = False,  # --pdb passed?

) -> None:
    '''

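The `markup_gaps()` hunk above reduces each gap annotation to two corner points: the close of the bar just before the gap and the open of the bar just after it, padded by `BGM*3/8`, with the fill colour keyed off the sign of close-minus-open. A standalone sketch of just that geometry (not piker code; the function name and the stand-in `bgm` constant are hypothetical):

    from math import copysign

    def gap_rect(
        istart: int,        # index of the bar before the gap
        iend: int,          # index of the bar after the gap
        prev_close: float,
        next_open: float,
        bgm: float = 16,    # stand-in for the BGM bar-geometry constant
    ) -> dict:
        # pad both sides so the rect visually brackets the gap
        pad: float = bgm * 3 / 8
        start_pos = (istart - pad, prev_close)  # left corner at prior close
        end_pos = (iend + pad + 1, next_open)   # right corner at next open
        # green when price gapped up across the gap, red when it gapped down
        sgn = copysign(1, prev_close - next_open)
        color = {-1: 'buy_green', 1: 'sell_red'}[sgn]
        return dict(start_pos=start_pos, end_pos=end_pos, color=color)

    # example: the market closed at 100 and re-opened at 97 (a down gap)
    print(gap_rect(istart=41, iend=42, prev_close=100.0, next_open=97.0))
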
@@ -260,7 +377,7 @@ def ldshm(
        open_piker_runtime(
            'polars_boi',
            enable_modules=['piker.data._sharedmem'],
-            debug_mode=pdb,
+            debug_mode=True,
        ),
        open_storage_client() as (
            mod,

@@ -280,9 +397,6 @@ def ldshm(
        times: np.ndarray = shm.array['time']
        d1: float = float(times[-1] - times[-2])
        d2: float = 0
        # XXX, take a median sample rate if sufficient data
        if times.size > 2:
            d2: float = float(times[-2] - times[-3])
            med: float = np.median(np.diff(times))
        if (

@@ -293,6 +407,7 @@ def ldshm(
            raise ValueError(
                f'Something is wrong with time period for {shm}:\n{times}'
            )

        period_s: float = float(max(d1, d2, med))

        null_segs: tuple = tsp.get_null_segs(

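The two hunks above estimate the series' expected sample period from the last two step sizes plus the median step, then take the largest of the three. A self-contained numpy sketch of that estimate on toy timestamps (not piker's shm-backed array):

    import numpy as np

    # toy 1-second timestamps with one dropped sample near the end
    times = np.array([0., 1., 2., 3., 5., 6.])

    d1 = float(times[-1] - times[-2])            # most recent step
    d2 = float(times[-2] - times[-3]) if times.size > 2 else 0.0
    med = float(np.median(np.diff(times)))       # robust "typical" step

    # the largest of the recent and median steps becomes the expected
    # uniform sample period used for gap detection downstream.
    period_s = max(d1, d2, med)
    print(period_s)  # -> 2.0 here, since the 3 -> 5 jump lands in d2
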
@@ -302,8 +417,6 @@ def ldshm(
        # TODO: call null-seg fixer somehow?
        if null_segs:
            if tractor._state.is_debug_mode():
                await tractor.pause()
            # async with (
            #     trio.open_nursery() as tn,

@@ -328,35 +441,9 @@ def ldshm(
            wdts,
            deduped,
            diff,
-            valid_races,
-            dq_issues,
-        ) = tsp.dedupe_ohlcv_smart(
+        ) = tsp.dedupe(
            shm_df,
-        )
-
-        # Report duplicate analysis
-        if diff > 0:
-            log.info(
-                f'Removed {diff} duplicate timestamp(s)\n'
-            )
-            if valid_races is not None:
-                identical: int = (
-                    valid_races
-                    .filter(pl.col('identical_bars'))
-                    .height
-                )
-                monotonic: int = valid_races.height - identical
-                log.info(
-                    f'Valid race conditions: {valid_races.height}\n'
-                    f' - Identical bars: {identical}\n'
-                    f' - Volume monotonic: {monotonic}\n'
-                )
-
-            if dq_issues is not None:
-                log.warning(
-                    f'DATA QUALITY ISSUES from provider: '
-                    f'{dq_issues.height} timestamp(s)\n'
-                    f'{dq_issues}\n'
+            period=period_s,
        )

        # detect gaps from in expected (uniform OHLC) sample period

@@ -373,8 +460,7 @@ def ldshm(
            # TODO: actually pull the exact duration
            # expected for each venue operational period?
            # gap_dt_unit='day',
-            gap_dt_unit='day',
+            gap_dt_unit='days',
            gap_thresh=1,
        )

@@ -385,11 +471,8 @@ def ldshm(
        if (
            not venue_gaps.is_empty()
            or (
-                not step_gaps.is_empty()
-                # XXX, i presume i put this bc i was guarding
-                # for ib venue gaps?
-                # and
-                # period_s < 60
+                period_s < 60
+                and not step_gaps.is_empty()
            )
        ):
            # write repaired ts to parquet-file?

@@ -438,7 +521,7 @@ def ldshm(
            do_markup_gaps: bool = True
            if do_markup_gaps:
                new_df: pl.DataFrame = tsp.np2pl(new)
-                aids: dict = await tsp._annotate.markup_gaps(
+                aids: dict = await markup_gaps(
                    fqme,
                    period_s,
                    actl,

@@ -451,13 +534,8 @@ def ldshm(
                tf2aids[period_s] = aids

            else:
-                # No significant gaps to handle, but may have had
-                # duplicates removed (valid race conditions are ok)
-                if diff > 0 and dq_issues is not None:
-                    log.warning(
-                        'Found duplicates with data quality issues '
-                        'but no significant time gaps!\n'
-                    )
                # allow interaction even when no ts problems.
+                assert not diff
                await tractor.pause()

        log.info('Exiting TSP shm anal-izer!')

File diff suppressed because it is too large
@@ -578,22 +578,11 @@ def detect_time_gaps(
    # NOTE: this flag is to indicate that on this (sampling) time
    # scale we expect to only be filtering against larger venue
    # closures-scale time gaps.
    #
-    # Map to total_ method since `dt_diff` is a duration type,
-    # not datetime - modern polars requires `total_*` methods
-    # for duration types (e.g. `total_days()` not `day()`)
-    # Ensure plural form for polars API (e.g. 'day' -> 'days')
-    unit_plural: str = (
-        gap_dt_unit
-        if gap_dt_unit.endswith('s')
-        else f'{gap_dt_unit}s'
-    )
-    duration_method: str = f'total_{unit_plural}'
    return step_gaps.filter(
        # Second by an arbitrary dt-unit step size
        getattr(
            pl.col('dt_diff').dt,
-            duration_method,
+            gap_dt_unit,
        )().abs() > gap_thresh
    )

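The block removed above is about polars' duration accessor names: on a Duration column the modern accessors are `total_days()`, `total_hours()`, and so on, so a singular unit like 'day' has to be pluralized and prefixed before the `getattr` dispatch. A small self-contained sketch of that dispatch on a toy frame (not piker's `step_gaps` table):

    import polars as pl
    from datetime import datetime

    df = pl.DataFrame({
        'dt': [datetime(2024, 1, 1), datetime(2024, 1, 2), datetime(2024, 1, 9)],
    }).with_columns(
        dt_diff=pl.col('dt').diff(),  # a Duration column
    )

    gap_dt_unit: str = 'day'   # caller-facing singular unit
    gap_thresh: int = 1

    # map 'day' -> .dt.total_days(): polars Duration columns expose
    # total_* accessors (total_days, total_hours, ...), not day()/days().
    unit_plural = gap_dt_unit if gap_dt_unit.endswith('s') else f'{gap_dt_unit}s'
    duration_method = f'total_{unit_plural}'

    gaps = df.filter(
        getattr(pl.col('dt_diff').dt, duration_method)().abs() > gap_thresh
    )
    print(gaps)  # only the 7-day jump survives the filter
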
@@ -1,166 +0,0 @@
# piker: trading gear for hackers
# Copyright (C) 2018-present Tyler Goodlet (in stewardship of pikers)

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

"""
Time-series (remote) annotation APIs.

"""
from __future__ import annotations
from math import copysign
from typing import (
    Any,
    TYPE_CHECKING,
)

import polars as pl
import tractor

from piker.data._formatters import BGM
from piker.storage import log

if TYPE_CHECKING:
    from piker.ui._remote_ctl import AnnotCtl


async def markup_gaps(
    fqme: str,
    timeframe: float,
    actl: AnnotCtl,
    wdts: pl.DataFrame,
    gaps: pl.DataFrame,

) -> dict[int, dict]:
    '''
    Remote annotate time-gaps in a dt-fielded ts (normally OHLC)
    with rectangles.

    '''
    aids: dict[int] = {}
    for i in range(gaps.height):

        row: pl.DataFrame = gaps[i]

        # the gap's RIGHT-most bar's OPEN value
        # at that time (sample) step.
        iend: int = row['index'][0]
        # dt: datetime = row['dt'][0]
        # dt_prev: datetime = row['dt_prev'][0]
        # dt_end_t: float = dt.timestamp()

        # TODO: can we eventually remove this
        # once we figure out why the epoch cols
        # don't match?
        # TODO: FIX HOW/WHY these aren't matching
        # and are instead off by 4hours (EST
        # vs. UTC?!?!)
        # end_t: float = row['time']
        # assert (
        #     dt.timestamp()
        #     ==
        #     end_t
        # )

        # the gap's LEFT-most bar's CLOSE value
        # at that time (sample) step.
        prev_r: pl.DataFrame = wdts.filter(
            pl.col('index') == iend - 1
        )
        # XXX: probably a gap in the (newly sorted or de-duplicated)
        # dt-df, so we might need to re-index first..
        dt: pl.Series = row['dt']
        dt_prev: pl.Series = row['dt_prev']
        if prev_r.is_empty():

            # XXX, filter out any special ignore cases,
            # - UNIX-epoch stamped datums
            # - first row
            if (
                dt_prev.dt.epoch()[0] == 0
                or
                dt.dt.epoch()[0] == 0
            ):
                log.warning('Skipping row with UNIX epoch timestamp ??')
                continue

            if wdts[0]['index'][0] == iend:  # first row
                log.warning('Skipping first-row (has no previous obvi) !!')
                continue

            # XXX, if the previous-row by shm-index is missing,
            # meaning there is a missing sample (set), get the prior
            # row by df index and attempt to use it?
            i_wdts: pl.DataFrame = wdts.with_row_index(name='i')
            i_row: int = i_wdts.filter(pl.col('index') == iend)['i'][0]
            prev_row_by_i = wdts[i_row]
            prev_r: pl.DataFrame = prev_row_by_i

            # debug any missing pre-row
            if tractor._state.is_debug_mode():
                await tractor.pause()

        istart: int = prev_r['index'][0]

        # TODO: implement px-col width measure
        # and ensure at least as many px-cols
        # shown per rect as configured by user.
        # gap_w: float = abs((iend - istart))
        # if gap_w < 6:
        #     margin: float = 6
        #     iend += margin
        #     istart -= margin

        rect_gap: float = BGM*3/8
        opn: float = row['open'][0]
        ro: tuple[float, float] = (
            # dt_end_t,
            iend + rect_gap + 1,
            opn,
        )
        cls: float = prev_r['close'][0]
        lc: tuple[float, float] = (
            # dt_start_t,
            istart - rect_gap,  # + 1 ,
            cls,
        )

        color: str = 'dad_blue'
        diff: float = cls - opn
        sgn: float = copysign(1, diff)
        color: str = {
            -1: 'buy_green',
            1: 'sell_red',
        }[sgn]

        rect_kwargs: dict[str, Any] = dict(
            fqme=fqme,
            timeframe=timeframe,
            start_pos=lc,
            end_pos=ro,
            color=color,
        )

        aid: int = await actl.add_rect(**rect_kwargs)
        assert aid
        aids[aid] = rect_kwargs

    # tell chart to redraw all its
    # graphics view layers Bo
    await actl.redraw(
        fqme=fqme,
        timeframe=timeframe,
    )
    return aids

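The deleted annotation module above falls back, when the `index - 1` row is absent from the frame, to a lookup by dataframe position via `with_row_index`. A small standalone polars sketch of that positional-lookup idea on toy data (note the deleted code indexes `wdts[i_row]` directly; this sketch steps back one position to fetch the literal prior row):

    import polars as pl

    # 'index' is a shm-style absolute index with a missing sample (42 absent)
    wdts = pl.DataFrame({
        'index': [40, 41, 43, 44],
        'close': [1.0, 2.0, 3.0, 4.0],
    })
    iend: int = 43

    # direct lookup of the prior absolute index fails..
    prev_r = wdts.filter(pl.col('index') == iend - 1)
    assert prev_r.is_empty()

    # ..so fall back to the row *positionally* before `iend`:
    i_wdts = wdts.with_row_index(name='i')
    i_row: int = i_wdts.filter(pl.col('index') == iend)['i'][0]
    prev_r = wdts[i_row - 1]
    print(prev_r)  # the index-41 row
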
@@ -1,206 +0,0 @@
'''
Smart OHLCV deduplication with data quality validation.

Handles concurrent write conflicts by keeping the most complete bar
(highest volume) while detecting data quality anomalies.

'''
import polars as pl

from ._anal import with_dts


def dedupe_ohlcv_smart(
    src_df: pl.DataFrame,
    time_col: str = 'time',
    volume_col: str = 'volume',
    sort: bool = True,

) -> tuple[
    pl.DataFrame,  # with dts
    pl.DataFrame,  # deduped (keeping higher volume bars)
    int,  # count of dupes removed
    pl.DataFrame|None,  # valid race conditions
    pl.DataFrame|None,  # data quality violations
]:
    '''
    Smart OHLCV deduplication keeping most complete bars.

    For duplicate timestamps, keeps bar with highest volume under
    the assumption that higher volume indicates more complete/final
    data from backfill vs partial live updates.

    Returns
    -------
    Tuple of:
    - wdts: original dataframe with datetime columns added
    - deduped: deduplicated frame keeping highest-volume bars
    - diff: number of duplicate rows removed
    - valid_races: duplicates meeting expected race condition pattern
      (volume monotonic, OHLC ranges valid)
    - data_quality_issues: duplicates violating expected relationships
      indicating provider data problems

    '''
    wdts: pl.DataFrame = with_dts(src_df)

    # Find duplicate timestamps
    dupes: pl.DataFrame = wdts.filter(
        pl.col(time_col).is_duplicated()
    )

    if dupes.is_empty():
        # No duplicates, return as-is
        return (wdts, wdts, 0, None, None)

    # Analyze duplicate groups for validation
    dupe_analysis: pl.DataFrame = (
        dupes
        .sort([time_col, 'index'])
        .group_by(time_col, maintain_order=True)
        .agg([
            pl.col('index').alias('indices'),
            pl.col('volume').alias('volumes'),
            pl.col('high').alias('highs'),
            pl.col('low').alias('lows'),
            pl.col('open').alias('opens'),
            pl.col('close').alias('closes'),
            pl.col('dt').first().alias('dt'),
            pl.len().alias('count'),
        ])
    )

    # Validate OHLCV monotonicity for each duplicate group
    def check_ohlcv_validity(row) -> dict[str, bool]:
        '''
        Check if duplicate bars follow expected race condition pattern.

        For a valid live-update → backfill race:
        - volume should be monotonically increasing
        - high should be monotonically non-decreasing
        - low should be monotonically non-increasing
        - open should be identical (fixed at bar start)

        Returns dict of violation flags.

        '''
        vols: list = row['volumes']
        highs: list = row['highs']
        lows: list = row['lows']
        opens: list = row['opens']

        violations: dict[str, bool] = {
            'volume_non_monotonic': False,
            'high_decreased': False,
            'low_increased': False,
            'open_mismatch': False,
            'identical_bars': False,
        }

        # Check if all bars are identical (pure duplicate)
        if (
            len(set(vols)) == 1
            and len(set(highs)) == 1
            and len(set(lows)) == 1
            and len(set(opens)) == 1
        ):
            violations['identical_bars'] = True
            return violations

        # Check volume monotonicity
        for i in range(1, len(vols)):
            if vols[i] < vols[i-1]:
                violations['volume_non_monotonic'] = True
                break

        # Check high monotonicity (can only increase or stay same)
        for i in range(1, len(highs)):
            if highs[i] < highs[i-1]:
                violations['high_decreased'] = True
                break

        # Check low monotonicity (can only decrease or stay same)
        for i in range(1, len(lows)):
            if lows[i] > lows[i-1]:
                violations['low_increased'] = True
                break

        # Check open consistency (should be fixed)
        if len(set(opens)) > 1:
            violations['open_mismatch'] = True

        return violations

    # Apply validation
    dupe_analysis = dupe_analysis.with_columns([
        pl.struct(['volumes', 'highs', 'lows', 'opens'])
        .map_elements(
            check_ohlcv_validity,
            return_dtype=pl.Struct([
                pl.Field('volume_non_monotonic', pl.Boolean),
                pl.Field('high_decreased', pl.Boolean),
                pl.Field('low_increased', pl.Boolean),
                pl.Field('open_mismatch', pl.Boolean),
                pl.Field('identical_bars', pl.Boolean),
            ])
        )
        .alias('validity')
    ])

    # Unnest validity struct
    dupe_analysis = dupe_analysis.unnest('validity')

    # Separate valid races from data quality issues
    valid_races: pl.DataFrame|None = (
        dupe_analysis
        .filter(
            # Valid if no violations OR just identical bars
            ~pl.col('volume_non_monotonic')
            & ~pl.col('high_decreased')
            & ~pl.col('low_increased')
            & ~pl.col('open_mismatch')
        )
    )
    if valid_races.is_empty():
        valid_races = None

    data_quality_issues: pl.DataFrame|None = (
        dupe_analysis
        .filter(
            # Issues if any non-identical violation exists
            (
                pl.col('volume_non_monotonic')
                | pl.col('high_decreased')
                | pl.col('low_increased')
                | pl.col('open_mismatch')
            )
            & ~pl.col('identical_bars')
        )
    )
    if data_quality_issues.is_empty():
        data_quality_issues = None

    # Deduplicate: keep highest volume bar for each timestamp
    deduped: pl.DataFrame = (
        wdts
        .sort([time_col, volume_col])
        .unique(
            subset=[time_col],
            keep='last',
            maintain_order=False,
        )
    )

    # Re-sort by time or index
    if sort:
        deduped = deduped.sort(by=time_col)

    diff: int = wdts.height - deduped.height

    return (
        wdts,
        deduped,
        diff,
        valid_races,
        data_quality_issues,
    )

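The keep-highest-volume dedupe in the deleted module above boils down to a sort-then-unique on the time column. A toy sketch of just that core step in plain polars (not importing the module itself):

    import polars as pl

    # two bars share t=1: a partial live update (vol=3) and the
    # backfilled, more complete bar (vol=10)
    df = pl.DataFrame({
        'time':   [0, 1, 1, 2],
        'close':  [9.0, 9.5, 9.7, 9.6],
        'volume': [5.0, 3.0, 10.0, 4.0],
    })

    deduped = (
        df
        .sort(['time', 'volume'])
        .unique(subset=['time'], keep='last', maintain_order=False)
        .sort('time')
    )
    print(deduped)                     # the vol=10 bar survives for t=1
    print(df.height - deduped.height)  # 1 duplicate dropped
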
File diff suppressed because it is too large
@@ -237,8 +237,8 @@ class LevelLabel(YAxisLabel):
class L1Label(LevelLabel):

    text_flags = (
-        QtCore.Qt.TextFlag.TextDontClip
-        | QtCore.Qt.AlignmentFlag.AlignLeft
+        QtCore.Qt.TextDontClip
+        | QtCore.Qt.AlignLeft
    )

    def set_label_str(

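The two spellings in the hunk above differ only in enum scoping: Qt6-era bindings expose flags under their enum groups (Qt.TextFlag, Qt.AlignmentFlag) while the short names are the legacy Qt5-style aliases. A minimal sketch of the scoped access (assuming a PyQt6 install purely for illustration; this is not piker code):

    from PyQt6 import QtCore

    Qt = QtCore.Qt

    # Qt6-style scoped access: every flag lives under its enum group.
    align = Qt.AlignmentFlag.AlignLeft | Qt.AlignmentFlag.AlignTop
    clip = Qt.TextFlag.TextDontClip
    print(align, clip)

    # Qt5-style unscoped access (Qt.AlignLeft, Qt.TextDontClip) is what the
    # second spelling relies on; it only resolves on bindings that still
    # provide those aliases (e.g. PyQt5, or compatibility shims).
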
@@ -1,256 +0,0 @@
#!/usr/bin/env python
'''
Programmatic debugging helper for `pdbp` REPL human-like
interaction but built to allow `claude` to interact with
crashes and `tractor.pause()` breakpoints alongside a human dev.

Originally written by `clauded` during a backfiller inspection
session with @goodboy trying to resolve duplicate/gappy ohlcv ts
issues discovered while testing the new `nativedb` tsdb.

Allows `claude` to run `pdb` commands and capture output in an "offline"
manner but generating similar output as if it was interacting with
the debug REPL.

The use of `pexpect` is heavily based on tractor's REPL UX test
suite(s), namely various `tests/devx/test_debugger.py` patterns.

'''
import sys
import os
import time

import pexpect
from pexpect.exceptions import (
    TIMEOUT,
    EOF,
)


PROMPT: str = r'\(Pdb\+\)'


def expect(
    child: pexpect.spawn,
    patt: str,
    **kwargs,
) -> None:
    '''
    Expect wrapper that prints last console data before failing.

    '''
    try:
        child.expect(
            patt,
            **kwargs,
        )
    except TIMEOUT:
        before: str = (
            str(child.before.decode())
            if isinstance(child.before, bytes)
            else str(child.before)
        )
        print(
            f'TIMEOUT waiting for pattern: {patt}\n'
            f'Last seen output:\n{before}'
        )
        raise


def run_pdb_commands(
    commands: list[str],
    initial_cmd: str = 'piker store ldshm xmrusdt.usdtm.perp.binance',
    timeout: int = 30,
    print_output: bool = True,
) -> dict[str, str]:
    '''
    Spawn piker process, wait for pdb prompt, execute commands.

    Returns dict mapping command -> output.

    '''
    results: dict[str, str] = {}

    # Disable colored output for easier parsing
    os.environ['PYTHON_COLORS'] = '0'

    # Spawn the process
    if print_output:
        print(f'Spawning: {initial_cmd}')

    child: pexpect.spawn = pexpect.spawn(
        initial_cmd,
        timeout=timeout,
        encoding='utf-8',
        echo=False,
    )

    # Wait for pdb prompt
    try:
        expect(child, PROMPT, timeout=timeout)
        if print_output:
            print('Reached pdb prompt!')

        # Execute each command
        for cmd in commands:
            if print_output:
                print(f'\n>>> {cmd}')

            child.sendline(cmd)
            time.sleep(0.1)

            # Wait for next prompt
            expect(child, PROMPT, timeout=timeout)

            # Capture output (everything before the prompt)
            output: str = (
                str(child.before.decode())
                if isinstance(child.before, bytes)
                else str(child.before)
            )
            results[cmd] = output

            if print_output:
                print(output)

        # Quit debugger gracefully
        child.sendline('quit')
        try:
            child.expect(EOF, timeout=5)
        except (TIMEOUT, EOF):
            pass

    except TIMEOUT as e:
        print(f'Timeout: {e}')
        if child.before:
            before: str = (
                str(child.before.decode())
                if isinstance(child.before, bytes)
                else str(child.before)
            )
            print(f'Buffer:\n{before}')
        results['_error'] = str(e)

    finally:
        if child.isalive():
            child.close(force=True)

    return results


class InteractivePdbSession:
    '''
    Interactive pdb session manager for incremental debugging.

    '''
    def __init__(
        self,
        cmd: str = 'piker store ldshm xmrusdt.usdtm.perp.binance',
        timeout: int = 30,
    ):
        self.cmd: str = cmd
        self.timeout: int = timeout
        self.child: pexpect.spawn|None = None
        self.history: list[tuple[str, str]] = []

    def start(self) -> None:
        '''
        Start the piker process and wait for first prompt.

        '''
        os.environ['PYTHON_COLORS'] = '0'

        print(f'Starting: {self.cmd}')
        self.child = pexpect.spawn(
            self.cmd,
            timeout=self.timeout,
            encoding='utf-8',
            echo=False,
        )

        # Wait for initial prompt
        expect(self.child, PROMPT, timeout=self.timeout)
        print('Ready at pdb prompt!')

    def run(
        self,
        cmd: str,
        print_output: bool = True,
    ) -> str:
        '''
        Execute a single pdb command and return output.

        '''
        if not self.child or not self.child.isalive():
            raise RuntimeError('Session not started or dead')

        if print_output:
            print(f'\n>>> {cmd}')

        self.child.sendline(cmd)
        time.sleep(0.1)

        # Wait for next prompt
        expect(self.child, PROMPT, timeout=self.timeout)

        output: str = (
            str(self.child.before.decode())
            if isinstance(self.child.before, bytes)
            else str(self.child.before)
        )
        self.history.append((cmd, output))

        if print_output:
            print(output)

        return output

    def quit(self) -> None:
        '''
        Exit the debugger and cleanup.

        '''
        if self.child and self.child.isalive():
            self.child.sendline('quit')
            try:
                self.child.expect(EOF, timeout=5)
            except (TIMEOUT, EOF):
                pass
            self.child.close(force=True)

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.quit()


if __name__ == '__main__':
    # Example inspection commands
    inspect_cmds: list[str] = [
        'locals().keys()',
        'type(deduped)',
        'deduped.shape',
        (
            'step_gaps.shape '
            'if "step_gaps" in locals() '
            'else "N/A"'
        ),
        (
            'venue_gaps.shape '
            'if "venue_gaps" in locals() '
            'else "N/A"'
        ),
    ]

    # Allow commands from CLI args
    if len(sys.argv) > 1:
        inspect_cmds = sys.argv[1:]

    # Interactive session example
    with InteractivePdbSession() as session:
        for cmd in inspect_cmds:
            session.run(cmd)

    print('\n=== Session Complete ===')

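A minimal usage sketch of the batch helper above (the module filename `pdb_driver.py` is hypothetical; the target command is just the script's own default, i.e. any piker invocation that parks at a `(Pdb+)` prompt works):

    # assumes the helper above was saved as `pdb_driver.py`
    from pdb_driver import run_pdb_commands

    outputs: dict[str, str] = run_pdb_commands(
        commands=[
            'locals().keys()',
            'type(deduped)',
        ],
        initial_cmd='piker store ldshm xmrusdt.usdtm.perp.binance',
        timeout=60,
    )
    for cmd, out in outputs.items():
        print(f'{cmd!r} ->\n{out}')
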