diff --git a/src/image_prediction/pipeline.py b/src/image_prediction/pipeline.py
index 4a8a62d..8219a9d 100644
--- a/src/image_prediction/pipeline.py
+++ b/src/image_prediction/pipeline.py
@@ -1,6 +1,7 @@
 import os
 from functools import lru_cache, partial
 from itertools import chain, tee
+from typing import Iterable, Any
 
 from funcy import rcompose, first, compose, second, chunks, identity, rpartial
 from kn_utils.logging import logger
@@ -54,7 +55,7 @@ class Pipeline:
         join = compose(starlift(lambda prd, rpr, mdt: {"classification": prd, **mdt, "representation": rpr}), star(zip))
 
         #                        />--classify--\
-        # --extract-->--split--+->--encode---->+--join-->reformat
+        # --extract-->--split--+->--encode---->+--join-->reformat-->filter_duplicates
         #                        \>--identity--/
 
         self.pipe = rcompose(
@@ -63,6 +64,7 @@
             pairwise_apply(classify, represent, identity),  # ... apply functions to the streams pairwise
             join,  # ... the streams by zipping
             reformat,  # ... the items
+            filter_duplicates,  # ... filter out duplicate images
         )
 
     def __call__(self, pdf: bytes, page_range: range = None):
@@ -72,3 +74,27 @@
             unit=" images",
             disable=not self.verbose,
         )
+
+
+def filter_duplicates(metadata: Iterable[dict[str, Any]]) -> Iterable[dict[str, Any]]:
+    """Filter out duplicate images, keyed on `representation` (perceptual hash) and `position` (image coordinates and page number).
+
+    See RED-10765 (RM-241), "Removed redactions reappear", for why this is necessary.
+    """
+    seen = set()
+    for item in metadata:
+        key = (
+            item["representation"],
+            item["position"]["x1"],
+            item["position"]["x2"],
+            item["position"]["y1"],
+            item["position"]["y2"],
+            item["position"]["pageNumber"],
+        )
+        if key not in seen:
+            seen.add(key)
+            yield item
+        else:
+            logger.warning(
+                f"Duplicate image found: representation={key[0]}, x1={key[1]}, x2={key[2]}, y1={key[3]}, y2={key[4]}, pageNumber={key[5]}"
+            )
diff --git a/test/data.dvc b/test/data.dvc
index 7d21269..8033c20 100644
--- a/test/data.dvc
+++ b/test/data.dvc
@@ -1,5 +1,5 @@
 outs:
-- md5: ab352d3b2c62ce2293cafb57c1b41b01.dir
-  size: 7469082
-  nfiles: 6
+- md5: 08bf8a63f04b3f19f859008556699708.dir
+  size: 7979836
+  nfiles: 7
   path: data
diff --git a/test/regressions_tests/image_deduplication_test.py b/test/regressions_tests/image_deduplication_test.py
new file mode 100644
index 0000000..ca43a3a
--- /dev/null
+++ b/test/regressions_tests/image_deduplication_test.py
@@ -0,0 +1,31 @@
+from pathlib import Path
+
+from image_prediction.config import CONFIG
+from image_prediction.pipeline import load_pipeline
+
+
+def test_all_duplicate_images_are_filtered():
+    """Regression test for RED-10765 (RM-241): Removed redactions reappear."""
+    pdf_path = (
+        Path(__file__).parents[1]
+        / "data"
+        / "RED-10765"
+        / "RM-241-461c90d6d6dc0416ad5f0b05feef4dfc.UNTOUCHED_shortened.pdf"
+    )
+    pdf_bytes = pdf_path.read_bytes()
+
+    pipeline = load_pipeline(verbose=True, batch_size=CONFIG.service.batch_size)
+    predictions = list(pipeline(pdf_bytes))
+
+    seen = set()
+    for prediction in predictions:
+        key = (
+            prediction["representation"],
+            prediction["position"]["x1"],
+            prediction["position"]["x2"],
+            prediction["position"]["y1"],
+            prediction["position"]["y2"],
+            prediction["position"]["pageNumber"],
+        )
+        assert key not in seen, f"Duplicate found: {key}"
+        seen.add(key)
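
For reference, a minimal sketch of how the new filter_duplicates stage behaves, assuming the item shape that the join/reformat steps emit above; the concrete values (classification label, hash, coordinates) are illustrative assumptions, not taken from the patch:

    from image_prediction.pipeline import filter_duplicates

    item = {
        "classification": "picture",            # hypothetical label
        "representation": "a3f0c1d2e4b5a6f7",   # hypothetical perceptual hash
        "position": {"x1": 10, "x2": 110, "y1": 20, "y2": 120, "pageNumber": 1},
    }
    duplicate = dict(item)  # same hash, same coordinates, same page

    # The generator yields the first occurrence and drops (and logs) the second.
    assert len(list(filter_duplicates([item, duplicate]))) == 1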