Drop gap detection from `dedupe()`, expect caller to handle it

distribute_dis
Tyler Goodlet 2023-12-28 10:40:08 -05:00
parent 07331a160e
commit 5702e422d8
1 changed file with 8 additions and 33 deletions

View File

@@ -380,10 +380,6 @@ def get_null_segs(
None, # backfilled on next iter None, # backfilled on next iter
]) ])
# row = zero_t[fi]
# absi_pre_zseg = row['index'][0] - 1
# absi_pre_zseg = absi - 1
# final iter case, backfill FINAL end iabs! # final iter case, backfill FINAL end iabs!
if (i + 1) == fi_zgaps.size: if (i + 1) == fi_zgaps.size:
absi_zsegs[-1][1] = absi_zeros[-1] + 1 absi_zsegs[-1][1] = absi_zeros[-1] + 1
@@ -623,8 +619,9 @@ def detect_price_gaps(
def dedupe( def dedupe(
src_df: pl.DataFrame, src_df: pl.DataFrame,
sort: bool = True,
time_gaps: pl.DataFrame | None = None,
sort: bool = True,
period: float = 60, period: float = 60,
) -> tuple[ ) -> tuple[
@@ -641,49 +638,27 @@ def dedupe(
''' '''
wdts: pl.DataFrame = with_dts(src_df) wdts: pl.DataFrame = with_dts(src_df)
src_gaps: pl.DataFrame = detect_time_gaps(
wdts,
expect_period=period,
gap_dt_unit=None if period < 60 else 'days',
)
# if no gaps detected just return carbon copies # maybe sort on any time field
# and no len diff. if sort:
if src_gaps.is_empty(): wdts = wdts.sort(by='time')
return ( # TODO: detect out-of-order segments which were corrected!
wdts, # -[ ] report in log msg
src_gaps, # -[ ] possibly return segment sections which were moved?
wdts,
0,
)
# remove duplicated datetime samples/sections # remove duplicated datetime samples/sections
deduped: pl.DataFrame = wdts.unique( deduped: pl.DataFrame = wdts.unique(
subset=['dt'], subset=['dt'],
maintain_order=True, maintain_order=True,
) )
if sort:
deduped = deduped.sort(by='time')
deduped_gaps: pl.DataFrame = detect_time_gaps(
deduped,
expect_period=period,
gap_dt_unit=None if period < 60 else 'days',
)
diff: int = ( diff: int = (
wdts.height wdts.height
- -
deduped.height deduped.height
) )
log.warning(
f'TIME GAPs FOUND:\n'
# f'{gaps}\n'
f'deduped Gaps found:\n{deduped_gaps}'
)
return ( return (
wdts, wdts,
deduped_gaps,
deduped, deduped,
diff, diff,
) )