feat: RED-10765: filter out classifications for 'duplicate' images present in the document

Julius Unverfehrt 2025-01-30 12:42:41 +01:00
parent 373f9f2d01
commit a3d79eb9af
3 changed files with 50 additions and 4 deletions

View File

@@ -1,6 +1,7 @@
 import os
 from functools import lru_cache, partial
 from itertools import chain, tee
+from typing import Iterable, Any

 from funcy import rcompose, first, compose, second, chunks, identity, rpartial
 from kn_utils.logging import logger
@@ -54,7 +55,7 @@ class Pipeline:
         join = compose(starlift(lambda prd, rpr, mdt: {"classification": prd, **mdt, "representation": rpr}), star(zip))

         # />--classify--\
-        # --extract-->--split--+->--encode---->+--join-->reformat
+        # --extract-->--split--+->--encode---->+--join-->reformat-->filter_duplicates
         # \>--identity--/

         self.pipe = rcompose(
@@ -63,6 +64,7 @@ class Pipeline:
             pairwise_apply(classify, represent, identity),  # ... apply functions to the streams pairwise
             join,  # ... the streams by zipping
             reformat,  # ... the items
+            filter_duplicates,  # ... filter out duplicate images
         )

     def __call__(self, pdf: bytes, page_range: range = None):
@@ -72,3 +74,27 @@ class Pipeline:
             unit=" images",
             disable=not self.verbose,
         )
+
+
+def filter_duplicates(metadata: Iterable[dict[str, Any]]) -> Iterable[dict[str, Any]]:
+    """Filter out duplicate images, keyed on `representation` (perceptual hash) and `position` (coordinates and page number).
+    See RED-10765 (RM-241) "Removed redactions reappear" for why this is necessary.
+    """
+    seen = set()
+    for item in metadata:
+        key = (
+            item["representation"],
+            item["position"]["x1"],
+            item["position"]["x2"],
+            item["position"]["y1"],
+            item["position"]["y2"],
+            item["position"]["pageNumber"],
+        )
+        if key not in seen:
+            seen.add(key)
+            yield item
+        else:
+            logger.warning(
+                f"Duplicate image found: representation={key[0]}, x1={key[1]}, x2={key[2]}, "
+                f"y1={key[3]}, y2={key[4]}, pageNumber={key[5]}"
+            )
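As a quick illustration of the new stage (not part of the commit; the field values below are made up), passing filter_duplicates the same metadata item twice yields it only once and logs a warning for the repeat:

    # Hypothetical metadata item; only the fields that feed the dedup key matter here.
    item = {
        "classification": {"label": "stamp"},  # placeholder classification payload
        "representation": "d1f2e3a4",          # placeholder perceptual hash
        "position": {"x1": 10, "x2": 110, "y1": 20, "y2": 120, "pageNumber": 1},
    }

    kept = list(filter_duplicates([item, dict(item)]))
    assert kept == [item]  # the identical second item is dropped and only logged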

View File

@@ -1,5 +1,5 @@
 outs:
-- md5: ab352d3b2c62ce2293cafb57c1b41b01.dir
-  size: 7469082
-  nfiles: 6
+- md5: 08bf8a63f04b3f19f859008556699708.dir
+  size: 7979836
+  nfiles: 7
   path: data

View File

@@ -0,0 +1,20 @@
+from pathlib import Path
+
+from image_prediction.config import CONFIG
+from image_prediction.pipeline import load_pipeline
+
+
+def test_all_duplicate_images_are_filtered():
+    """See RED-10765 (RM-241): Removed redactions reappear."""
+    pdf_path = Path(__file__).parents[1] / "data" / "RED-10765" / "RM-241-461c90d6d6dc0416ad5f0b05feef4dfc.UNTOUCHED_shortened.pdf"
+    pdf_bytes = pdf_path.read_bytes()
+
+    pipeline = load_pipeline(verbose=True, batch_size=CONFIG.service.batch_size)
+    predictions = list(pipeline(pdf_bytes))
+
+    seen = set()
+    for prediction in predictions:
+        position = prediction["position"]
+        key = (prediction["representation"], position["x1"], position["x2"], position["y1"], position["y2"], position["pageNumber"])
+        assert key not in seen, f"Duplicate found: {key}"
+        seen.add(key)
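Note that the test deliberately rebuilds the same six-field key as filter_duplicates, so if the key definition ever changes, both places must change in lockstep. A possible follow-up (purely a sketch, the helper name is made up and not part of this commit) would be to share the key construction:

    def dedup_key(item: dict) -> tuple:
        # Hypothetical shared helper: the six fields currently duplicated in
        # filter_duplicates and in the test.
        position = item["position"]
        return (
            item["representation"],
            position["x1"],
            position["x2"],
            position["y1"],
            position["y2"],
            position["pageNumber"],
        )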