Drop gap detection from `dedupe()`, expect caller to handle it
parent
07331a160e
commit
5702e422d8
|
@ -380,10 +380,6 @@ def get_null_segs(
|
||||||
None, # backfilled on next iter
|
None, # backfilled on next iter
|
||||||
])
|
])
|
||||||
|
|
||||||
# row = zero_t[fi]
|
|
||||||
# absi_pre_zseg = row['index'][0] - 1
|
|
||||||
# absi_pre_zseg = absi - 1
|
|
||||||
|
|
||||||
# final iter case, backfill FINAL end iabs!
|
# final iter case, backfill FINAL end iabs!
|
||||||
if (i + 1) == fi_zgaps.size:
|
if (i + 1) == fi_zgaps.size:
|
||||||
absi_zsegs[-1][1] = absi_zeros[-1] + 1
|
absi_zsegs[-1][1] = absi_zeros[-1] + 1
|
||||||
|
@ -623,8 +619,9 @@ def detect_price_gaps(
|
||||||
|
|
||||||
def dedupe(
|
def dedupe(
|
||||||
src_df: pl.DataFrame,
|
src_df: pl.DataFrame,
|
||||||
sort: bool = True,
|
|
||||||
|
|
||||||
|
time_gaps: pl.DataFrame | None = None,
|
||||||
|
sort: bool = True,
|
||||||
period: float = 60,
|
period: float = 60,
|
||||||
|
|
||||||
) -> tuple[
|
) -> tuple[
|
||||||
|
@ -641,49 +638,27 @@ def dedupe(
|
||||||
|
|
||||||
'''
|
'''
|
||||||
wdts: pl.DataFrame = with_dts(src_df)
|
wdts: pl.DataFrame = with_dts(src_df)
|
||||||
src_gaps: pl.DataFrame = detect_time_gaps(
|
|
||||||
wdts,
|
|
||||||
expect_period=period,
|
|
||||||
gap_dt_unit=None if period < 60 else 'days',
|
|
||||||
)
|
|
||||||
|
|
||||||
# if no gaps detected just return carbon copies
|
# maybe sort on any time field
|
||||||
# and no len diff.
|
if sort:
|
||||||
if src_gaps.is_empty():
|
wdts = wdts.sort(by='time')
|
||||||
return (
|
# TODO: detect out-of-order segments which were corrected!
|
||||||
wdts,
|
# -[ ] report in log msg
|
||||||
src_gaps,
|
# -[ ] possibly return segment sections which were moved?
|
||||||
wdts,
|
|
||||||
0,
|
|
||||||
)
|
|
||||||
|
|
||||||
# remove duplicated datetime samples/sections
|
# remove duplicated datetime samples/sections
|
||||||
deduped: pl.DataFrame = wdts.unique(
|
deduped: pl.DataFrame = wdts.unique(
|
||||||
subset=['dt'],
|
subset=['dt'],
|
||||||
maintain_order=True,
|
maintain_order=True,
|
||||||
)
|
)
|
||||||
if sort:
|
|
||||||
deduped = deduped.sort(by='time')
|
|
||||||
|
|
||||||
deduped_gaps: pl.DataFrame = detect_time_gaps(
|
|
||||||
deduped,
|
|
||||||
expect_period=period,
|
|
||||||
gap_dt_unit=None if period < 60 else 'days',
|
|
||||||
)
|
|
||||||
|
|
||||||
diff: int = (
|
diff: int = (
|
||||||
wdts.height
|
wdts.height
|
||||||
-
|
-
|
||||||
deduped.height
|
deduped.height
|
||||||
)
|
)
|
||||||
log.warning(
|
|
||||||
f'TIME GAPs FOUND:\n'
|
|
||||||
# f'{gaps}\n'
|
|
||||||
f'deduped Gaps found:\n{deduped_gaps}'
|
|
||||||
)
|
|
||||||
return (
|
return (
|
||||||
wdts,
|
wdts,
|
||||||
deduped_gaps,
|
|
||||||
deduped,
|
deduped,
|
||||||
diff,
|
diff,
|
||||||
)
|
)
|
||||||
|
|
Loading…
Reference in New Issue