Compare commits

...

17 Commits

Author SHA1 Message Date
Julius Unverfehrt
0027421628 feat: RED-10765: ignore perceptual hash for image deduplication and prefer to keep the ones with allPassed set to True 2025-01-31 12:59:59 +01:00
Julius Unverfehrt
00740c91b8 Merge branch 'feat/RED-10765/filter-duplicate-images' into 'master'
feat: RED-10765: filter out classifications for 'duplicate' images present in the document

Closes RED-10765

See merge request redactmanager/image-classification-service!23
2025-01-30 13:20:19 +01:00
Julius Unverfehrt
a3d79eb9af feat: RED-10765: filter out classifications for 'duplicate' images present in the document 2025-01-30 12:42:41 +01:00
Jonathan Kössler
373f9f2d01 Merge branch 'bugfix/RED-10722' into 'master'
RED-10722: fix dead letter queue

Closes RED-10722

See merge request redactmanager/image-classification-service!22
2025-01-16 09:29:11 +01:00
Jonathan Kössler
2429d90dd5 chore: update pyinfra to v3.4.2 2025-01-15 13:39:16 +01:00
Julius Unverfehrt
2b85999258 Merge branch 'fix/RM-227' into 'master'
fix: RM-227: set minimum permissable value for logos

Closes RM-227 and RED-10686

See merge request redactmanager/image-classification-service!21
2024-12-18 12:39:44 +01:00
Julius Unverfehrt
4b15d2c2ca fix: RED-10686: set minimum permissable value for logos
Reference the jira ticket for more information. This change can
introduce unwanted behavior.
2024-12-18 11:47:54 +01:00
Jonathan Kössler
bf1ca8d6f9 Merge branch 'feature/RED-10441' into 'master'
RED-10441: fix abandoned queues

Closes RED-10441

See merge request redactmanager/image-classification-service!20
2024-11-13 17:32:27 +01:00
Jonathan Kössler
9a4b8cad2b chore: update pyinfra to v3.3.5 2024-11-13 17:21:58 +01:00
Jonathan Kössler
28adb50330 chore: update pyinfra to v3.3.4 2024-11-13 16:39:49 +01:00
Jonathan Kössler
7a3fdf8fa4 chore: update pyinfra to v3.3.3 2024-11-13 14:54:29 +01:00
Jonathan Kössler
3fbcd65e9b chore: update pyinfra to v3.3.2 2024-11-13 09:56:55 +01:00
Jonathan Kössler
90a60b4b7c Merge branch 'chore/update_pyinfra' into 'master'
RES-858: fix graceful shutdown

See merge request redactmanager/image-classification-service!19
2024-09-30 11:01:24 +02:00
Jonathan Kössler
526de8984c chore: update pyinfra to v3.2.11 2024-09-30 10:12:40 +02:00
Jonathan Kössler
99cbf3c9bf Merge branch 'feature/RED-10017-fix-config' into 'master'
RED-10017: fix pyinfra config

Closes RED-10017

See merge request redactmanager/image-classification-service!18
2024-09-27 08:22:00 +02:00
Jonathan Kössler
986137e729 chore: update pyinfra to v3.2.10 2024-09-26 13:40:49 +02:00
Jonathan Kössler
f950b96cfb fix: pyinfra config 2024-09-24 14:31:10 +02:00
8 changed files with 1838 additions and 1306 deletions

View File

@ -1,4 +1,7 @@
[asyncio]
max_concurrent_tasks = 10
[dynamic_tenant_queues]
enabled = true

View File

@ -4,7 +4,7 @@ level = "INFO"
[service]
# Print document processing progress to stdout
verbose = false
batch_size = 16
batch_size = 6
image_stiching_tolerance = 1 # in pixels
mlflow_run_id = "fabfb1f192c745369b88cab34471aba7"
@ -36,4 +36,7 @@ max = 10
[filters.overrides.signature.image_to_page_quotient]
max = 0.4
[filters.overrides.logo.image_to_page_quotient]
min = 0.06

3054
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,6 @@
[tool.poetry]
name = "image-classification-service"
version = "2.13.0"
version = "2.17.0"
description = ""
authors = ["Team Research <research@knecon.com>"]
readme = "README.md"
@ -10,8 +10,8 @@ packages = [{ include = "image_prediction", from = "src" }]
python = ">=3.10,<3.11"
# FIXME: This should be recent pyinfra, but the recent protobuf packages are not compatible with tensorflow 2.9.0, also
# see RED-9948.
pyinfra = { version = "3.2.8", source = "gitlab-research" }
kn-utils = { version = "^0.2.7", source = "gitlab-research" }
pyinfra = { version = "3.4.2", source = "gitlab-research" }
kn-utils = { version = ">=0.4.0", source = "gitlab-research" }
dvc = "^2.34.0"
dvc-ssh = "^2.20.0"
dvc-azure = "^2.21.2"

View File

@ -10,6 +10,8 @@ from image_prediction.utils.pdf_annotation import annotate_pdf
logger = get_logger()
logger.setLevel("DEBUG")
def parse_args():
parser = argparse.ArgumentParser()

View File

@ -1,6 +1,7 @@
import os
from functools import lru_cache, partial
from itertools import chain, tee
from typing import Iterable, Any
from funcy import rcompose, first, compose, second, chunks, identity, rpartial
from kn_utils.logging import logger
@ -54,7 +55,7 @@ class Pipeline:
join = compose(starlift(lambda prd, rpr, mdt: {"classification": prd, **mdt, "representation": rpr}), star(zip))
# />--classify--\
# --extract-->--split--+->--encode---->+--join-->reformat
# --extract-->--split--+->--encode---->+--join-->reformat-->filter_duplicates
# \>--identity--/
self.pipe = rcompose(
@ -63,6 +64,7 @@ class Pipeline:
pairwise_apply(classify, represent, identity), # ... apply functions to the streams pairwise
join, # ... the streams by zipping
reformat, # ... the items
filter_duplicates, # ... filter out duplicate images
)
def __call__(self, pdf: bytes, page_range: range = None):
@ -72,3 +74,32 @@ class Pipeline:
unit=" images",
disable=not self.verbose,
)
def filter_duplicates(metadata: Iterable[dict[str, Any]]) -> Iterable[dict[str, Any]]:
"""Filter out duplicate images from the `position` (image coordinates) and `page`, preferring the one with
`allPassed` set to True.
See RED-10765 (RM-241): Removed redactions reappear for why this is necessary.
"""
keep = dict()
for image_meta in metadata:
key: tuple[int, int, int, int, int] = (
image_meta["position"]["x1"],
image_meta["position"]["x2"],
image_meta["position"]["y1"],
image_meta["position"]["y2"],
image_meta["position"]["pageNumber"],
)
if key in keep:
logger.warning(
f"Duplicate image found: x1={key[0]}, x2={key[1]}, y1={key[2]}, y2={key[3]}, pageNumber={key[4]}"
)
if image_meta["filters"]["allPassed"]:
logger.warning("Setting the image with allPassed flag set to True")
keep[key] = image_meta
else:
logger.warning("Keeping the previous image since the current image has allPassed flag set to False")
else:
keep[key] = image_meta
yield from keep.values()

View File

@ -1,5 +1,5 @@
outs:
- md5: ab352d3b2c62ce2293cafb57c1b41b01.dir
size: 7469082
nfiles: 6
- md5: 08bf8a63f04b3f19f859008556699708.dir
size: 7979836
nfiles: 7
path: data

View File

@ -0,0 +1,35 @@
from pathlib import Path
from image_prediction.config import CONFIG
from image_prediction.pipeline import load_pipeline
def test_all_duplicate_images_are_filtered():
"""See RED-10765 (RM-241): Removed redactions reappear."""
pdf_path = (
Path(__file__).parents[1]
/ "data"
/ "RED-10765"
/ "RM-241-461c90d6d6dc0416ad5f0b05feef4dfc.UNTOUCHED_shortened.pdf"
)
pdf_bytes = pdf_path.read_bytes()
pipeline = load_pipeline(verbose=True, batch_size=CONFIG.service.batch_size)
predictions = list(pipeline(pdf_bytes))
seen = set()
for prediction in predictions:
key = (
prediction["position"]["x1"],
prediction["position"]["x2"],
prediction["position"]["y1"],
prediction["position"]["y2"],
prediction["position"]["pageNumber"],
)
assert key not in seen, f"Duplicate found: {key}"
seen.add(key)
all_passed = sum(1 for prediction in predictions if prediction["filters"]["allPassed"])
assert all_passed == 1, f"Expected 1 image with allPassed flag set to True, but got {all_passed}"
assert len(predictions) == 177, f"Expected 177 images, but got {len(predictions)}"