Compare commits

...

2 Commits

Author             SHA1        Message                                                                                      Date
Julius Unverfehrt  528ae3fafe  chore: try to fix custom build by pushing again                                              2025-01-30 14:26:51 +01:00
Julius Unverfehrt  e20a5623e6  feat: RED-10765: filter out classifications for 'duplicate' images present in the document  2025-01-30 12:56:50 +01:00
4 changed files with 57 additions and 5 deletions

View File

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "image-classification-service"
-version = "2.15.0"
+version = "2.15.1"
 description = ""
 authors = ["Team Research <research@knecon.com>"]
 readme = "README.md"

View File

@@ -1,6 +1,7 @@
 import os
 from functools import lru_cache, partial
 from itertools import chain, tee
+from typing import Iterable, Any
 
 from funcy import rcompose, first, compose, second, chunks, identity, rpartial
 from kn_utils.logging import logger
@@ -54,7 +55,7 @@ class Pipeline:
         join = compose(starlift(lambda prd, rpr, mdt: {"classification": prd, **mdt, "representation": rpr}), star(zip))
 
         # />--classify--\
-        # --extract-->--split--+->--encode---->+--join-->reformat
+        # --extract-->--split--+->--encode---->+--join-->reformat-->filter_duplicates
         # \>--identity--/
         self.pipe = rcompose(
@@ -63,6 +64,7 @@ class Pipeline:
             pairwise_apply(classify, represent, identity),  # ... apply functions to the streams pairwise
             join,  # ... the streams by zipping
             reformat,  # ... the items
+            filter_duplicates,  # ... filter out duplicate images
         )
 
     def __call__(self, pdf: bytes, page_range: range = None):
@@ -72,3 +74,33 @@ class Pipeline:
             unit=" images",
             disable=not self.verbose,
         )
+
+
+def filter_duplicates(metadata: Iterable[dict[str, Any]]) -> Iterable[dict[str, Any]]:
+    """Filter out duplicate images based on `position` (image coordinates), page number, and `representation` (perceptual hash).
+
+    See RED-10765 (RM-241), "Removed redactions reappear", for why this is necessary.
+
+    Args:
+        metadata: Iterable of image metadata dicts.
+
+    Returns:
+        Iterable of image metadata dicts with duplicates removed.
+    """
+    seen = set()
+    for item in metadata:
+        key = (
+            item["representation"],
+            item["position"]["x1"],
+            item["position"]["x2"],
+            item["position"]["y1"],
+            item["position"]["y2"],
+            item["position"]["pageNumber"],
+        )
+        if key not in seen:
+            seen.add(key)
+            yield item
+        else:
+            logger.warning(
+                f"Duplicate image found: representation={key[0]}, x1={key[1]}, x2={key[2]}, y1={key[3]}, y2={key[4]}, pageNumber={key[5]}"
+            )
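
For illustration (not part of the diff), a minimal sketch of how the new stage behaves in isolation: entries that share a perceptual hash but sit on different pages are both kept, while an exact repeat is dropped and logged. The import path and all dict values below are assumptions; only the keys mirror what the pipeline emits.

    from image_prediction.pipeline import filter_duplicates  # assumed import path

    logo_p1 = {"representation": "a1b2c3", "position": {"x1": 10, "x2": 110, "y1": 20, "y2": 120, "pageNumber": 1}}
    logo_p2 = {**logo_p1, "position": {**logo_p1["position"], "pageNumber": 2}}  # same hash, different page

    kept = list(filter_duplicates([logo_p1, logo_p2, dict(logo_p1)]))
    assert kept == [logo_p1, logo_p2]  # the exact repeat of logo_p1 is filtered out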

View File

@@ -1,5 +1,5 @@
 outs:
-- md5: ab352d3b2c62ce2293cafb57c1b41b01.dir
-  size: 7469082
-  nfiles: 6
+- md5: 08bf8a63f04b3f19f859008556699708.dir
+  size: 7979836
+  nfiles: 7
   path: data

View File

@@ -0,0 +1,20 @@
+from pathlib import Path
+
+from image_prediction.config import CONFIG
+from image_prediction.pipeline import load_pipeline
+
+
+def test_all_duplicate_images_are_filtered():
+    """See RED-10765 (RM-241): Removed redactions reappear."""
+
+    pdf_path = Path(__file__).parents[1] / "data" / "RED-10765" / "RM-241-461c90d6d6dc0416ad5f0b05feef4dfc.UNTOUCHED_shortened.pdf"
+    pdf_bytes = pdf_path.read_bytes()
+
+    pipeline = load_pipeline(verbose=True, batch_size=CONFIG.service.batch_size)
+    predictions = list(pipeline(pdf_bytes))
+
+    seen = set()
+    for prediction in predictions:
+        key = (prediction['representation'], prediction['position']['x1'], prediction['position']['x2'], prediction['position']['y1'], prediction['position']['y2'], prediction['position']['pageNumber'])
+        assert key not in seen, f"Duplicate found: {key}"
+        seen.add(key)
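
The test above exercises the full pipeline against the checked-in PDF fixture. A narrower unit test could hit filter_duplicates directly without any fixture; a sketch, assuming the function is importable from image_prediction.pipeline and using invented metadata values:

    from image_prediction.pipeline import filter_duplicates  # assumed import path

    def test_filter_duplicates_drops_exact_repeats():
        image = {
            "representation": "deadbeef",  # stand-in perceptual hash
            "position": {"x1": 0, "x2": 50, "y1": 0, "y2": 50, "pageNumber": 3},
        }
        # Passing identical metadata twice should yield it only once.
        assert len(list(filter_duplicates([image, dict(image)]))) == 1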