Compare commits

...

12 Commits

Author SHA1 Message Date
Julius Unverfehrt
0027421628 feat: RED-10765: ignore perceptual hash for image deduplication and prefer to keep the ones with allPassed set to True 2025-01-31 12:59:59 +01:00
Julius Unverfehrt
00740c91b8 Merge branch 'feat/RED-10765/filter-duplicate-images' into 'master'
feat: RED-10765: filter out classifications for 'duplicate' images present in the document

Closes RED-10765

See merge request redactmanager/image-classification-service!23
2025-01-30 13:20:19 +01:00
Julius Unverfehrt
a3d79eb9af feat: RED-10765: filter out classifications for 'duplicate' images present in the document 2025-01-30 12:42:41 +01:00
Jonathan Kössler
373f9f2d01 Merge branch 'bugfix/RED-10722' into 'master'
RED-10722: fix dead letter queue

Closes RED-10722

See merge request redactmanager/image-classification-service!22
2025-01-16 09:29:11 +01:00
Jonathan Kössler
2429d90dd5 chore: update pyinfra to v3.4.2 2025-01-15 13:39:16 +01:00
Julius Unverfehrt
2b85999258 Merge branch 'fix/RM-227' into 'master'
fix: RM-227: set minimum permissible value for logos

Closes RM-227 and RED-10686

See merge request redactmanager/image-classification-service!21
2024-12-18 12:39:44 +01:00
Julius Unverfehrt
4b15d2c2ca fix: RED-10686: set minimum permissible value for logos
See the Jira ticket for more information. This change can
introduce unwanted behavior.
2024-12-18 11:47:54 +01:00
Jonathan Kössler
bf1ca8d6f9 Merge branch 'feature/RED-10441' into 'master'
RED-10441: fix abandoned queues

Closes RED-10441

See merge request redactmanager/image-classification-service!20
2024-11-13 17:32:27 +01:00
Jonathan Kössler
9a4b8cad2b chore: update pyinfra to v3.3.5 2024-11-13 17:21:58 +01:00
Jonathan Kössler
28adb50330 chore: update pyinfra to v3.3.4 2024-11-13 16:39:49 +01:00
Jonathan Kössler
7a3fdf8fa4 chore: update pyinfra to v3.3.3 2024-11-13 14:54:29 +01:00
Jonathan Kössler
3fbcd65e9b chore: update pyinfra to v3.3.2 2024-11-13 09:56:55 +01:00
7 changed files with 1765 additions and 1263 deletions

View File

@@ -4,7 +4,7 @@ level = "INFO"
 [service]
 # Print document processing progress to stdout
 verbose = false
-batch_size = 16
+batch_size = 6
 image_stiching_tolerance = 1 # in pixels
 mlflow_run_id = "fabfb1f192c745369b88cab34471aba7"
@@ -36,4 +36,7 @@ max = 10
 [filters.overrides.signature.image_to_page_quotient]
 max = 0.4
+
+[filters.overrides.logo.image_to_page_quotient]
+min = 0.06
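The new override introduces a lower bound on the logo image-to-page area quotient. As a minimal sketch of how such an override might be applied: passes_size_filter and its call site are hypothetical, only the config shape ([filters.overrides.<label>.image_to_page_quotient] with min/max keys) comes from the diff above.

def passes_size_filter(label: str, image_area: float, page_area: float, overrides: dict) -> bool:
    """Check an image's area relative to its page against per-label min/max bounds."""
    bounds = overrides.get(label, {}).get("image_to_page_quotient", {})
    quotient = image_area / page_area
    if "min" in bounds and quotient < bounds["min"]:
        return False  # e.g. a logo covering less than 6% of the page is rejected
    if "max" in bounds and quotient > bounds["max"]:
        return False  # e.g. a signature covering more than 40% of the page is rejected
    return True

# Hypothetical usage mirroring the new [filters.overrides.logo.image_to_page_quotient] entry:
overrides = {"logo": {"image_to_page_quotient": {"min": 0.06}}}
assert not passes_size_filter("logo", image_area=1_000, page_area=500_000, overrides=overrides)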

poetry.lock (generated)

File diff suppressed because it is too large

View File

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "image-classification-service"
-version = "2.15.0"
+version = "2.17.0"
 description = ""
 authors = ["Team Research <research@knecon.com>"]
 readme = "README.md"
@@ -10,8 +10,8 @@ packages = [{ include = "image_prediction", from = "src" }]
 python = ">=3.10,<3.11"
 # FIXME: This should be recent pyinfra, but the recent protobuf packages are not compatible with tensorflow 2.9.0, also
 # see RED-9948.
-pyinfra = { version = "3.2.11", source = "gitlab-research" }
-kn-utils = { version = "^0.2.7", source = "gitlab-research" }
+pyinfra = { version = "3.4.2", source = "gitlab-research" }
+kn-utils = { version = ">=0.4.0", source = "gitlab-research" }
 dvc = "^2.34.0"
 dvc-ssh = "^2.20.0"
 dvc-azure = "^2.21.2"

View File

@@ -10,6 +10,8 @@ from image_prediction.utils.pdf_annotation import annotate_pdf
 logger = get_logger()
+logger.setLevel("DEBUG")
+
 
 def parse_args():
     parser = argparse.ArgumentParser()

View File

@@ -1,6 +1,7 @@
 import os
 from functools import lru_cache, partial
 from itertools import chain, tee
+from typing import Iterable, Any
 
 from funcy import rcompose, first, compose, second, chunks, identity, rpartial
 from kn_utils.logging import logger
@@ -54,7 +55,7 @@ class Pipeline:
         join = compose(starlift(lambda prd, rpr, mdt: {"classification": prd, **mdt, "representation": rpr}), star(zip))
 
         #               />--classify--\
-        # --extract-->--split--+->--encode---->+--join-->reformat
+        # --extract-->--split--+->--encode---->+--join-->reformat-->filter_duplicates
         #               \>--identity--/
 
         self.pipe = rcompose(
@@ -63,6 +64,7 @@ class Pipeline:
             pairwise_apply(classify, represent, identity),  # ... apply functions to the streams pairwise
             join,  # ... the streams by zipping
             reformat,  # ... the items
+            filter_duplicates,  # ... filter out duplicate images
         )
 
     def __call__(self, pdf: bytes, page_range: range = None):
@@ -72,3 +74,32 @@ class Pipeline:
             unit=" images",
             disable=not self.verbose,
         )
+
+
+def filter_duplicates(metadata: Iterable[dict[str, Any]]) -> Iterable[dict[str, Any]]:
+    """Filter out duplicate images, keyed by `position` (image coordinates) and page number,
+    preferring the one with `allPassed` set to True.
+
+    See RED-10765 (RM-241), "Removed redactions reappear", for why this is necessary.
+    """
+    keep = dict()
+    for image_meta in metadata:
+        key: tuple[int, int, int, int, int] = (
+            image_meta["position"]["x1"],
+            image_meta["position"]["x2"],
+            image_meta["position"]["y1"],
+            image_meta["position"]["y2"],
+            image_meta["position"]["pageNumber"],
+        )
+        if key in keep:
+            logger.warning(
+                f"Duplicate image found: x1={key[0]}, x2={key[1]}, y1={key[2]}, y2={key[3]}, pageNumber={key[4]}"
+            )
+            if image_meta["filters"]["allPassed"]:
+                logger.warning("Keeping the current image since its allPassed flag is set to True")
+                keep[key] = image_meta
+            else:
+                logger.warning("Keeping the previous image since its allPassed flag is set to False")
+        else:
+            keep[key] = image_meta
+    yield from keep.values()
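For illustration, a toy run of the new filter_duplicates (metadata values are invented; the dict shape follows the function above). The first two entries share the same position/page key, so only the one with allPassed set to True survives:

from image_prediction.pipeline import filter_duplicates  # module path as used elsewhere in this diff

def meta(x1, x2, y1, y2, page, all_passed):
    # helper for this sketch only
    return {
        "position": {"x1": x1, "x2": x2, "y1": y1, "y2": y2, "pageNumber": page},
        "filters": {"allPassed": all_passed},
    }

items = [
    meta(0, 100, 0, 50, 1, all_passed=False),   # duplicate key, replaced
    meta(0, 100, 0, 50, 1, all_passed=True),    # same key, kept
    meta(10, 40, 10, 40, 2, all_passed=False),  # unique key, kept
]
result = list(filter_duplicates(items))
assert len(result) == 2
assert result[0]["filters"]["allPassed"] is True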

View File

@@ -1,5 +1,5 @@
 outs:
-- md5: ab352d3b2c62ce2293cafb57c1b41b01.dir
-  size: 7469082
-  nfiles: 6
+- md5: 08bf8a63f04b3f19f859008556699708.dir
+  size: 7979836
+  nfiles: 7
   path: data
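The nfiles bump from 6 to 7 presumably corresponds to the new RED-10765 test PDF referenced in the test below. Assuming the repository's DVC remote is configured, the tracked data can be fetched locally with:

dvc pull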

View File

@@ -0,0 +1,35 @@
+from pathlib import Path
+
+from image_prediction.config import CONFIG
+from image_prediction.pipeline import load_pipeline
+
+
+def test_all_duplicate_images_are_filtered():
+    """See RED-10765 (RM-241): Removed redactions reappear."""
+
+    pdf_path = (
+        Path(__file__).parents[1]
+        / "data"
+        / "RED-10765"
+        / "RM-241-461c90d6d6dc0416ad5f0b05feef4dfc.UNTOUCHED_shortened.pdf"
+    )
+    pdf_bytes = pdf_path.read_bytes()
+
+    pipeline = load_pipeline(verbose=True, batch_size=CONFIG.service.batch_size)
+    predictions = list(pipeline(pdf_bytes))
+
+    seen = set()
+    for prediction in predictions:
+        key = (
+            prediction["position"]["x1"],
+            prediction["position"]["x2"],
+            prediction["position"]["y1"],
+            prediction["position"]["y2"],
+            prediction["position"]["pageNumber"],
+        )
+        assert key not in seen, f"Duplicate found: {key}"
+        seen.add(key)
+
+    all_passed = sum(1 for prediction in predictions if prediction["filters"]["allPassed"])
+    assert all_passed == 1, f"Expected 1 image with allPassed flag set to True, but got {all_passed}"
+    assert len(predictions) == 177, f"Expected 177 images, but got {len(predictions)}"
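A hypothetical local invocation of the new test (the test file's path is not visible in this diff, so selection by test name is used):

poetry run pytest -k test_all_duplicate_images_are_filtered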