From 0027421628aca079506ab5c56d82ef5369231f9e Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Fri, 31 Jan 2025 12:59:46 +0100 Subject: [PATCH] feat: RED-10765: ignore perceptual hash for image deduplication and prefer to keep the ones with `allPassed` set to `True` --- src/image_prediction/pipeline.py | 37 +++++++++++-------- .../image_deduplication_test.py | 19 +++++++++- 2 files changed, 38 insertions(+), 18 deletions(-) diff --git a/src/image_prediction/pipeline.py b/src/image_prediction/pipeline.py index 8219a9d..913ed6a 100644 --- a/src/image_prediction/pipeline.py +++ b/src/image_prediction/pipeline.py @@ -77,24 +77,29 @@ class Pipeline: def filter_duplicates(metadata: Iterable[dict[str, Any]]) -> Iterable[dict[str, Any]]: - """Filter out duplicate images from the `position` (image coordinates), `page` and `representation` (perceptual hash). + """Filter out duplicate images from the `position` (image coordinates) and `page`, preferring the one with + `allPassed` set to True. See RED-10765 (RM-241): Removed redactions reappear for why this is necessary. """ - seen = set() - for item in metadata: - key = ( - item["representation"], - item["position"]["x1"], - item["position"]["x2"], - item["position"]["y1"], - item["position"]["y2"], - item["position"]["pageNumber"], + keep = dict() + for image_meta in metadata: + key: tuple[int, int, int, int, int] = ( + image_meta["position"]["x1"], + image_meta["position"]["x2"], + image_meta["position"]["y1"], + image_meta["position"]["y2"], + image_meta["position"]["pageNumber"], ) - if key not in seen: - seen.add(key) - yield item - else: + if key in keep: logger.warning( - f"Duplicate image found: representation={key[0]}, x1={key[1]}, x2={key[2]}, y1={key[3]}, y2={key[4]}, pageNumber={key[5]}" + f"Duplicate image found: x1={key[0]}, x2={key[1]}, y1={key[2]}, y2={key[3]}, pageNumber={key[4]}" ) - continue + if image_meta["filters"]["allPassed"]: + logger.warning("Setting the image with allPassed flag set to True") + keep[key] = image_meta + else: + logger.warning("Keeping the previous image since the current image has allPassed flag set to False") + else: + keep[key] = image_meta + + yield from keep.values() diff --git a/test/regressions_tests/image_deduplication_test.py b/test/regressions_tests/image_deduplication_test.py index ca43a3a..027ef3e 100644 --- a/test/regressions_tests/image_deduplication_test.py +++ b/test/regressions_tests/image_deduplication_test.py @@ -6,7 +6,12 @@ from image_prediction.pipeline import load_pipeline def test_all_duplicate_images_are_filtered(): """See RED-10765 (RM-241): Removed redactions reappear.""" - pdf_path = Path(__file__).parents[1] / "data" / "RED-10765" / "RM-241-461c90d6d6dc0416ad5f0b05feef4dfc.UNTOUCHED_shortened.pdf" + pdf_path = ( + Path(__file__).parents[1] + / "data" + / "RED-10765" + / "RM-241-461c90d6d6dc0416ad5f0b05feef4dfc.UNTOUCHED_shortened.pdf" + ) pdf_bytes = pdf_path.read_bytes() pipeline = load_pipeline(verbose=True, batch_size=CONFIG.service.batch_size) @@ -14,7 +19,17 @@ def test_all_duplicate_images_are_filtered(): seen = set() for prediction in predictions: - key = (prediction['representation'], prediction['position']['x1'], prediction['position']['x2'], prediction['position']['y1'], prediction['position']['y2'], prediction['position']['pageNumber']) + key = ( + prediction["position"]["x1"], + prediction["position"]["x2"], + prediction["position"]["y1"], + prediction["position"]["y2"], + prediction["position"]["pageNumber"], + ) assert key not in seen, f"Duplicate found: {key}" seen.add(key) + all_passed = sum(1 for prediction in predictions if prediction["filters"]["allPassed"]) + assert all_passed == 1, f"Expected 1 image with allPassed flag set to True, but got {all_passed}" + + assert len(predictions) == 177, f"Expected 177 images, but got {len(predictions)}"