Return the `.len()` diff from `dedupe()` instead
Since the `diff: int` serves as a predicate anyway (when `0` nothing duplicate was detected) might as well just return it directly since it's likely also useful for the caller when doing deeper anal. Also, handle the zero-diff case by just returning early with a copy of the input frame and a `diff=0`. CHERRY INTO #486distribute_dis
parent
8e4d1a48ed
commit
e8bf4c6e04
|
@ -262,7 +262,7 @@ def with_dts(
|
||||||
) -> pl.DataFrame:
|
) -> pl.DataFrame:
|
||||||
'''
|
'''
|
||||||
Insert datetime (casted) columns to a (presumably) OHLC sampled
|
Insert datetime (casted) columns to a (presumably) OHLC sampled
|
||||||
time series with an epoch-time column keyed by ``time_col``.
|
time series with an epoch-time column keyed by `time_col: str`.
|
||||||
|
|
||||||
'''
|
'''
|
||||||
return df.with_columns([
|
return df.with_columns([
|
||||||
|
@ -270,7 +270,9 @@ def with_dts(
|
||||||
pl.col(time_col).diff().alias('s_diff'),
|
pl.col(time_col).diff().alias('s_diff'),
|
||||||
pl.from_epoch(pl.col(time_col)).alias('dt'),
|
pl.from_epoch(pl.col(time_col)).alias('dt'),
|
||||||
]).with_columns([
|
]).with_columns([
|
||||||
pl.from_epoch(pl.col(f'{time_col}_prev')).alias('dt_prev'),
|
pl.from_epoch(
|
||||||
|
pl.col(f'{time_col}_prev')
|
||||||
|
).alias('dt_prev'),
|
||||||
pl.col('dt').diff().alias('dt_diff'),
|
pl.col('dt').diff().alias('dt_diff'),
|
||||||
]) #.with_columns(
|
]) #.with_columns(
|
||||||
# pl.col('dt').diff().dt.days().alias('days_dt_diff'),
|
# pl.col('dt').diff().dt.days().alias('days_dt_diff'),
|
||||||
|
@ -369,7 +371,7 @@ def dedupe(src_df: pl.DataFrame) -> tuple[
|
||||||
pl.DataFrame, # with dts
|
pl.DataFrame, # with dts
|
||||||
pl.DataFrame, # gaps
|
pl.DataFrame, # gaps
|
||||||
pl.DataFrame, # with deduplicated dts (aka gap/repeat removal)
|
pl.DataFrame, # with deduplicated dts (aka gap/repeat removal)
|
||||||
bool,
|
int, # len diff between input and deduped
|
||||||
]:
|
]:
|
||||||
'''
|
'''
|
||||||
Check for time series gaps and if found
|
Check for time series gaps and if found
|
||||||
|
@ -380,12 +382,26 @@ def dedupe(src_df: pl.DataFrame) -> tuple[
|
||||||
'''
|
'''
|
||||||
df: pl.DataFrame = with_dts(src_df)
|
df: pl.DataFrame = with_dts(src_df)
|
||||||
gaps: pl.DataFrame = detect_time_gaps(df)
|
gaps: pl.DataFrame = detect_time_gaps(df)
|
||||||
if not gaps.is_empty():
|
|
||||||
|
# if no gaps detected just return carbon copies
|
||||||
|
# and no len diff.
|
||||||
|
if gaps.is_empty():
|
||||||
|
return (
|
||||||
|
df,
|
||||||
|
gaps,
|
||||||
|
df,
|
||||||
|
0,
|
||||||
|
)
|
||||||
|
|
||||||
# remove duplicated datetime samples/sections
|
# remove duplicated datetime samples/sections
|
||||||
deduped: pl.DataFrame = dedup_dt(df)
|
deduped: pl.DataFrame = dedup_dt(df)
|
||||||
deduped_gaps = detect_time_gaps(deduped)
|
deduped_gaps = detect_time_gaps(deduped)
|
||||||
|
|
||||||
|
diff: int = (
|
||||||
|
df.height
|
||||||
|
-
|
||||||
|
deduped.height
|
||||||
|
)
|
||||||
log.warning(
|
log.warning(
|
||||||
f'Gaps found:\n{gaps}\n'
|
f'Gaps found:\n{gaps}\n'
|
||||||
f'deduped Gaps found:\n{deduped_gaps}'
|
f'deduped Gaps found:\n{deduped_gaps}'
|
||||||
|
@ -394,23 +410,33 @@ def dedupe(src_df: pl.DataFrame) -> tuple[
|
||||||
# ndarray to detect and remove?
|
# ndarray to detect and remove?
|
||||||
# null_gaps = detect_null_time_gap()
|
# null_gaps = detect_null_time_gap()
|
||||||
|
|
||||||
diff: int = (
|
|
||||||
df.height
|
|
||||||
-
|
|
||||||
deduped.height
|
|
||||||
)
|
|
||||||
was_deduped: bool = False
|
|
||||||
if diff:
|
|
||||||
was_deduped: bool = True
|
|
||||||
|
|
||||||
return (
|
return (
|
||||||
df,
|
df,
|
||||||
gaps,
|
gaps,
|
||||||
deduped,
|
deduped,
|
||||||
was_deduped,
|
diff,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def sort_diff(
|
||||||
|
src_df: pl.DataFrame,
|
||||||
|
col: str = 'time',
|
||||||
|
|
||||||
|
) -> tuple[
|
||||||
|
pl.DataFrame, # with dts
|
||||||
|
pl.DataFrame, # sorted
|
||||||
|
list[int], # indices of segments that are out-of-order
|
||||||
|
]:
|
||||||
|
ser: pl.Series = src_df[col]
|
||||||
|
|
||||||
|
diff: pl.Series = ser.diff()
|
||||||
|
sortd: pl.DataFrame = ser.sort()
|
||||||
|
sortd_diff: pl.Series = sortd.diff()
|
||||||
|
i_step_diff = (diff != sortd_diff).arg_true()
|
||||||
|
if i_step_diff.len():
|
||||||
|
import pdbp
|
||||||
|
pdbp.set_trace()
|
||||||
|
|
||||||
# NOTE: thanks to this SO answer for the below conversion routines
|
# NOTE: thanks to this SO answer for the below conversion routines
|
||||||
# to go from numpy struct-arrays to polars dataframes and back:
|
# to go from numpy struct-arrays to polars dataframes and back:
|
||||||
# https://stackoverflow.com/a/72054819
|
# https://stackoverflow.com/a/72054819
|
||||||
|
|
Loading…
Reference in New Issue