chore: try to fix custom build by pushing again

feat: RED-10765: filter out classifications for 'duplicate' images present in the document
2025-01-30 14:26:51 +01:00 · 2025-01-30 12:56:50 +01:00
6 changed files with 1286 additions and 1736 deletions
--- a/config/settings.toml
+++ b/config/settings.toml
@@ -4,7 +4,7 @@ level = "INFO"
 [service]
 # Print document processing progress to stdout
 verbose = false
-batch_size = 6
+batch_size = 16
 image_stiching_tolerance = 1  # in pixels
 mlflow_run_id = "fabfb1f192c745369b88cab34471aba7"

@@ -36,7 +36,4 @@ max = 10
 [filters.overrides.signature.image_to_page_quotient]
 max = 0.4

-[filters.overrides.logo.image_to_page_quotient]
-min = 0.06
-

--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "image-classification-service"
-version = "2.17.0"
+version = "2.15.1"
 description = ""
 authors = ["Team Research <research@knecon.com>"]
 readme = "README.md"
@@ -10,8 +10,8 @@ packages = [{ include = "image_prediction", from = "src" }]
 python = ">=3.10,<3.11"
 # FIXME: This should be recent pyinfra, but the recent protobuf packages are not compatible with tensorflow 2.9.0, also
 #  see RED-9948.
-pyinfra = { version = "3.4.2", source = "gitlab-research" }
-kn-utils = { version = ">=0.4.0", source = "gitlab-research" }
+pyinfra = { version = "3.2.11", source = "gitlab-research" }
+kn-utils = { version = "^0.2.7", source = "gitlab-research" }
 dvc = "^2.34.0"
 dvc-ssh = "^2.20.0"
 dvc-azure = "^2.21.2"
--- a/scripts/run_pipeline.py
+++ b/scripts/run_pipeline.py
@@ -10,8 +10,6 @@ from image_prediction.utils.pdf_annotation import annotate_pdf

 logger = get_logger()

-logger.setLevel("DEBUG")
-

 def parse_args():
    parser = argparse.ArgumentParser()
--- a/src/image_prediction/pipeline.py
+++ b/src/image_prediction/pipeline.py
@@ -77,29 +77,30 @@ class Pipeline:


 def filter_duplicates(metadata: Iterable[dict[str, Any]]) -> Iterable[dict[str, Any]]:
-    """Filter out duplicate images from the `position` (image coordinates) and `page`, preferring the one with
-    `allPassed` set to True.
+    """Filter out duplicate images from the `position` (image coordinates), `page` and `representation` (perceptual hash).
    See RED-10765 (RM-241): Removed redactions reappear for why this is necessary.
-    """
-    keep = dict()
-    for image_meta in metadata:
-        key: tuple[int, int, int, int, int] = (
-            image_meta["position"]["x1"],
-            image_meta["position"]["x2"],
-            image_meta["position"]["y1"],
-            image_meta["position"]["y2"],
-            image_meta["position"]["pageNumber"],
-        )
-        if key in keep:
-            logger.warning(
-                f"Duplicate image found: x1={key[0]}, x2={key[1]}, y1={key[2]}, y2={key[3]}, pageNumber={key[4]}"
-            )
-            if image_meta["filters"]["allPassed"]:
-                logger.warning("Setting the image with allPassed flag set to True")
-                keep[key] = image_meta
-            else:
-                logger.warning("Keeping the previous image since the current image has allPassed flag set to False")
-        else:
-            keep[key] = image_meta

-    yield from keep.values()
+    Args:
+        metadata: Iterable of image metadata dicts.
+
+    Returns:
+        Iterable of image metadata dicts with duplicates removed.
+    """
+    seen = set()
+    for item in metadata:
+        key = (
+            item["representation"],
+            item["position"]["x1"],
+            item["position"]["x2"],
+            item["position"]["y1"],
+            item["position"]["y2"],
+            item["position"]["pageNumber"],
+        )
+        if key not in seen:
+            seen.add(key)
+            yield item
+        else:
+            logger.warning(
+                f"Duplicate image found: representation={key[0]}, x1={key[1]}, x2={key[2]}, y1={key[3]}, y2={key[4]}, pageNumber={key[5]}"
+            )
+            continue
--- a/test/regressions_tests/image_deduplication_test.py
+++ b/test/regressions_tests/image_deduplication_test.py
@@ -6,12 +6,7 @@ from image_prediction.pipeline import load_pipeline

 def test_all_duplicate_images_are_filtered():
    """See RED-10765 (RM-241): Removed redactions reappear."""
-    pdf_path = (
-        Path(__file__).parents[1]
-        / "data"
-        / "RED-10765"
-        / "RM-241-461c90d6d6dc0416ad5f0b05feef4dfc.UNTOUCHED_shortened.pdf"
-    )
+    pdf_path = Path(__file__).parents[1] / "data" / "RED-10765" / "RM-241-461c90d6d6dc0416ad5f0b05feef4dfc.UNTOUCHED_shortened.pdf"
    pdf_bytes = pdf_path.read_bytes()

    pipeline = load_pipeline(verbose=True, batch_size=CONFIG.service.batch_size)
@@ -19,17 +14,7 @@ def test_all_duplicate_images_are_filtered():

    seen = set()
    for prediction in predictions:
-        key = (
-            prediction["position"]["x1"],
-            prediction["position"]["x2"],
-            prediction["position"]["y1"],
-            prediction["position"]["y2"],
-            prediction["position"]["pageNumber"],
-        )
+        key = (prediction['representation'], prediction['position']['x1'], prediction['position']['x2'], prediction['position']['y1'], prediction['position']['y2'], prediction['position']['pageNumber'])
        assert key not in seen, f"Duplicate found: {key}"
        seen.add(key)

-    all_passed = sum(1 for prediction in predictions if prediction["filters"]["allPassed"])
-    assert all_passed == 1, f"Expected 1 image with allPassed flag set to True, but got {all_passed}"
-
-    assert len(predictions) == 177, f"Expected 177 images, but got {len(predictions)}"
Author	SHA1	Message	Date
Julius Unverfehrt	528ae3fafe	chore: try to fix custom build by pushing again	2025-01-30 14:26:51 +01:00
Julius Unverfehrt	e20a5623e6	feat: RED-10765: filter out classifications for 'duplicate' images present in the document	2025-01-30 12:56:50 +01:00