feat: RED-10765: filter out classifications for 'duplicate' images present in the document

Julius Unverfehrt 2025-01-30 12:42:41 +01:00
parent 373f9f2d01
commit a3d79eb9af
3 changed files with 50 additions and 4 deletions

@ -1,6 +1,7 @@
import os
from functools import lru_cache, partial
from itertools import chain, tee
from typing import Iterable, Any
from funcy import rcompose, first, compose, second, chunks, identity, rpartial
from kn_utils.logging import logger
@ -54,7 +55,7 @@ class Pipeline:
        join = compose(starlift(lambda prd, rpr, mdt: {"classification": prd, **mdt, "representation": rpr}), star(zip))
        # />--classify--\
        # --extract-->--split--+->--encode---->+--join-->reformat
        # --extract-->--split--+->--encode---->+--join-->reformat-->filter_duplicates
        # \>--identity--/
        self.pipe = rcompose(
@ -63,6 +64,7 @@ class Pipeline:
            pairwise_apply(classify, represent, identity), # ... apply functions to the streams pairwise
            join, # ... the streams by zipping
            reformat, # ... the items
            filter_duplicates, # ... filter out duplicate images
        )
    def __call__(self, pdf: bytes, page_range: range = None):
@ -72,3 +74,27 @@ class Pipeline:
            unit=" images",
            disable=not self.verbose,
        )
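
The pipe above is built with funcy's rcompose, which chains stages left to right: rcompose(f, g)(x) == g(f(x)). A minimal sketch of the split/apply/zip shape drawn in the diagram, using funcy and itertools directly — the helpers split, apply_branches, and join here are illustrative stand-ins, not the repo's starlift/pairwise_apply:

from itertools import tee
from funcy import rcompose

def split(items):
    # fan one stream out into two independent iterators
    return tee(items, 2)

def apply_branches(streams):
    # map a different function over each branch
    left, right = streams
    return map(str.upper, left), map(len, right)

def join(streams):
    # zip the branches back into one stream of tuples
    return zip(*streams)

pipe = rcompose(split, apply_branches, join)  # runs stages left to right
assert list(pipe(["cat", "horse"])) == [("CAT", 3), ("HORSE", 5)]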
def filter_duplicates(metadata: Iterable[dict[str, Any]]) -> Iterable[dict[str, Any]]:
    """Filter out duplicate images, keyed by `representation` (perceptual hash) and `position` (image coordinates and page number).

    See RED-10765 (RM-241) "Removed redactions reappear" for why this is necessary.
    """
    seen = set()
    for item in metadata:
        key = (
            item["representation"],
            item["position"]["x1"],
            item["position"]["x2"],
            item["position"]["y1"],
            item["position"]["y2"],
            item["position"]["pageNumber"],
        )
        if key not in seen:
            seen.add(key)
            yield item
        else:
            logger.warning(
                f"Duplicate image found: representation={key[0]}, x1={key[1]}, x2={key[2]}, "
                f"y1={key[3]}, y2={key[4]}, pageNumber={key[5]}"
            )
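
For illustration, a hypothetical call to the new filter (not part of the commit; it assumes filter_duplicates is importable from image_prediction.pipeline, as the test file's imports suggest, and the "signature" classification value is made up). Two items carry the same perceptual hash and coordinates, so only the first survives:

from image_prediction.pipeline import filter_duplicates

position = {"x1": 0, "x2": 10, "y1": 0, "y2": 10, "pageNumber": 1}
items = [
    {"classification": "signature", "representation": "abc123", "position": position},
    {"classification": "signature", "representation": "abc123", "position": dict(position)},
]
deduplicated = list(filter_duplicates(items))  # the second item logs a warning and is dropped
assert len(deduplicated) == 1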

@ -1,5 +1,5 @@
outs:
- md5: ab352d3b2c62ce2293cafb57c1b41b01.dir
  size: 7469082
  nfiles: 6
- md5: 08bf8a63f04b3f19f859008556699708.dir
  size: 7979836
  nfiles: 7
  path: data

@ -0,0 +1,20 @@
from pathlib import Path
from image_prediction.config import CONFIG
from image_prediction.pipeline import load_pipeline
def test_all_duplicate_images_are_filtered():
    """See RED-10765 (RM-241): Removed redactions reappear."""
    pdf_path = Path(__file__).parents[1] / "data" / "RED-10765" / "RM-241-461c90d6d6dc0416ad5f0b05feef4dfc.UNTOUCHED_shortened.pdf"
    pdf_bytes = pdf_path.read_bytes()
    pipeline = load_pipeline(verbose=True, batch_size=CONFIG.service.batch_size)
    predictions = list(pipeline(pdf_bytes))
    seen = set()
    for prediction in predictions:
        key = (
            prediction["representation"],
            prediction["position"]["x1"],
            prediction["position"]["x2"],
            prediction["position"]["y1"],
            prediction["position"]["y2"],
            prediction["position"]["pageNumber"],
        )
        assert key not in seen, f"Duplicate found: {key}"
        seen.add(key)