Return the `.len()` diff from `dedupe()` instead

Since the `diff: int` serves as a predicate anyway (when `0`, no
duplicates were detected) we might as well return it directly, since
it's likely also useful to the caller when doing deeper analysis.

Also, handle the zero-diff case by just returning early with a copy of
the input frame and a `diff=0`.

CHERRY INTO #486
distribute_dis
Tyler Goodlet 2023-12-12 16:48:56 -05:00
parent 8e4d1a48ed
commit e8bf4c6e04
1 changed file with 53 additions and 27 deletions

View File

@ -262,7 +262,7 @@ def with_dts(
) -> pl.DataFrame: ) -> pl.DataFrame:
''' '''
Insert datetime (casted) columns to a (presumably) OHLC sampled Insert datetime (casted) columns to a (presumably) OHLC sampled
time series with an epoch-time column keyed by ``time_col``. time series with an epoch-time column keyed by `time_col: str`.
''' '''
return df.with_columns([ return df.with_columns([
@ -270,7 +270,9 @@ def with_dts(
pl.col(time_col).diff().alias('s_diff'), pl.col(time_col).diff().alias('s_diff'),
pl.from_epoch(pl.col(time_col)).alias('dt'), pl.from_epoch(pl.col(time_col)).alias('dt'),
]).with_columns([ ]).with_columns([
pl.from_epoch(pl.col(f'{time_col}_prev')).alias('dt_prev'), pl.from_epoch(
pl.col(f'{time_col}_prev')
).alias('dt_prev'),
pl.col('dt').diff().alias('dt_diff'), pl.col('dt').diff().alias('dt_diff'),
]) #.with_columns( ]) #.with_columns(
# pl.col('dt').diff().dt.days().alias('days_dt_diff'), # pl.col('dt').diff().dt.days().alias('days_dt_diff'),
@ -369,7 +371,7 @@ def dedupe(src_df: pl.DataFrame) -> tuple[
pl.DataFrame, # with dts pl.DataFrame, # with dts
pl.DataFrame, # gaps pl.DataFrame, # gaps
pl.DataFrame, # with deduplicated dts (aka gap/repeat removal) pl.DataFrame, # with deduplicated dts (aka gap/repeat removal)
bool, int, # len diff between input and deduped
]: ]:
''' '''
Check for time series gaps and if found Check for time series gaps and if found
@ -380,12 +382,26 @@ def dedupe(src_df: pl.DataFrame) -> tuple[
''' '''
df: pl.DataFrame = with_dts(src_df) df: pl.DataFrame = with_dts(src_df)
gaps: pl.DataFrame = detect_time_gaps(df) gaps: pl.DataFrame = detect_time_gaps(df)
if not gaps.is_empty():
# if no gaps detected just return carbon copies
# and no len diff.
if gaps.is_empty():
return (
df,
gaps,
df,
0,
)
# remove duplicated datetime samples/sections # remove duplicated datetime samples/sections
deduped: pl.DataFrame = dedup_dt(df) deduped: pl.DataFrame = dedup_dt(df)
deduped_gaps = detect_time_gaps(deduped) deduped_gaps = detect_time_gaps(deduped)
diff: int = (
df.height
-
deduped.height
)
log.warning( log.warning(
f'Gaps found:\n{gaps}\n' f'Gaps found:\n{gaps}\n'
f'deduped Gaps found:\n{deduped_gaps}' f'deduped Gaps found:\n{deduped_gaps}'
@ -394,23 +410,33 @@ def dedupe(src_df: pl.DataFrame) -> tuple[
# ndarray to detect and remove? # ndarray to detect and remove?
# null_gaps = detect_null_time_gap() # null_gaps = detect_null_time_gap()
diff: int = (
df.height
-
deduped.height
)
was_deduped: bool = False
if diff:
was_deduped: bool = True
return ( return (
df, df,
gaps, gaps,
deduped, deduped,
was_deduped, diff,
) )
def sort_diff(
    src_df: pl.DataFrame,
    col: str = 'time',
) -> tuple[
    pl.DataFrame,  # input frame (unmodified)
    pl.DataFrame,  # frame sorted by `col`
    list[int],  # indices of segments that are out-of-order
]:
    '''
    Detect out-of-order rows in `src_df` by comparing the stepwise
    diff of the `col` column against the diff of its sorted version.

    Any index where the two diff series disagree marks a segment
    that is not monotonically ordered; an empty index list means
    the input is already sorted on `col`.

    '''
    ser: pl.Series = src_df[col]

    # stepwise diffs of the column as-is vs. after sorting; the
    # first entry of each is null and compares equal-as-null.
    diff: pl.Series = ser.diff()
    sortd: pl.Series = ser.sort()
    sortd_diff: pl.Series = sortd.diff()

    # indices where the unsorted diff deviates from the sorted one,
    # i.e. the boundaries of out-of-order segments.
    i_step_diff: pl.Series = (diff != sortd_diff).arg_true()

    # NOTE: previously this dropped into a `pdbp.set_trace()`
    # breakpoint here and never returned; now we always deliver
    # the declared 3-tuple so callers can inspect the mismatches.
    return (
        src_df,
        src_df.sort(col),
        i_step_diff.to_list(),
    )
# NOTE: thanks to this SO answer for the below conversion routines # NOTE: thanks to this SO answer for the below conversion routines
# to go from numpy struct-arrays to polars dataframes and back: # to go from numpy struct-arrays to polars dataframes and back:
# https://stackoverflow.com/a/72054819 # https://stackoverflow.com/a/72054819