Drop gap detection from `dedupe()`, expect caller to handle it

Tyler Goodlet 2023-12-28 10:40:08 -05:00
parent 07331a160e
commit 5702e422d8
1 changed file with 8 additions and 33 deletions


@@ -380,10 +380,6 @@ def get_null_segs(
             None,  # backfilled on next iter
         ])
 
-        # row = zero_t[fi]
-        # absi_pre_zseg = row['index'][0] - 1
-        # absi_pre_zseg = absi - 1
-
     # final iter case, backfill FINAL end iabs!
     if (i + 1) == fi_zgaps.size:
         absi_zsegs[-1][1] = absi_zeros[-1] + 1
@@ -623,8 +619,9 @@ def detect_price_gaps(
 
 def dedupe(
     src_df: pl.DataFrame,
-    sort: bool = True,
+    time_gaps: pl.DataFrame | None = None,
+    sort: bool = True,
     period: float = 60,
 
 ) -> tuple[
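
For readers skimming the flattened hunks: here is a minimal sketch of the `dedupe()` signature after this commit, assembled from the diff above. The body and the full return annotation are elided in the hunk, so the `tuple[...]` contents below are inferred from the updated `return` statement in the final hunk, not copied from the source.

```python
import polars as pl

def dedupe(
    src_df: pl.DataFrame,
    time_gaps: pl.DataFrame | None = None,  # new: caller-supplied gap table
    sort: bool = True,
    period: float = 60,  # expected sample period in seconds
) -> tuple[
    pl.DataFrame,  # dt-annotated source frame (wdts)
    pl.DataFrame,  # deduped frame
    int,           # row-count diff: wdts.height - deduped.height
]:
    ...
```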
@@ -641,49 +638,27 @@ def dedupe(
     '''
     wdts: pl.DataFrame = with_dts(src_df)
-    src_gaps: pl.DataFrame = detect_time_gaps(
-        wdts,
-        expect_period=period,
-        gap_dt_unit=None if period < 60 else 'days',
-    )
-
-    # if no gaps detected just return carbon copies
-    # and no len diff.
-    if src_gaps.is_empty():
-        return (
-            wdts,
-            src_gaps,
-            wdts,
-            0,
-        )
 
     # maybe sort on any time field
     if sort:
         wdts = wdts.sort(by='time')
         # TODO: detect out-of-order segments which were corrected!
         # -[ ] report in log msg
         # -[ ] possibly return segment sections which were moved?
 
     # remove duplicated datetime samples/sections
     deduped: pl.DataFrame = wdts.unique(
         subset=['dt'],
         maintain_order=True,
     )
+    if sort:
+        deduped = deduped.sort(by='time')
-    deduped_gaps: pl.DataFrame = detect_time_gaps(
-        deduped,
-        expect_period=period,
-        gap_dt_unit=None if period < 60 else 'days',
-    )
 
     diff: int = (
         wdts.height
         -
         deduped.height
     )
-    log.warning(
-        f'TIME GAPs FOUND:\n'
-        # f'{gaps}\n'
-        f'deduped Gaps found:\n{deduped_gaps}'
-    )
     return (
         wdts,
-        deduped_gaps,
         deduped,
         diff,
     )
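
Since gap detection is now the caller's responsibility, a caller that wants the old inline behavior back has to run `detect_time_gaps()` itself around the `dedupe()` call. A minimal sketch of that flow, assuming the `detect_time_gaps()` signature shown in the removed lines above, and that `src_df`, `period`, and a module-level `log` (like the one the dropped warning used) are already in scope:

```python
import polars as pl

# hypothetical caller-side flow after this commit: dedupe first,
# then run the gap scan on the deduped output, mirroring the logic
# that dedupe() used to do inline.
wdts, deduped, diff = dedupe(
    src_df,
    sort=True,
    period=period,
)
deduped_gaps: pl.DataFrame = detect_time_gaps(
    deduped,
    expect_period=period,
    gap_dt_unit=None if period < 60 else 'days',
)
if not deduped_gaps.is_empty():
    log.warning(
        f'deduped Gaps found:\n{deduped_gaps}'
    )
```

The new `time_gaps` parameter presumably lets such a caller hand a precomputed gap table back into `dedupe()`; how the body consumes it is not visible in these hunks.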