Compare commits
12 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0027421628 | ||
|
|
00740c91b8 | ||
|
|
a3d79eb9af | ||
|
|
373f9f2d01 | ||
|
|
2429d90dd5 | ||
|
|
2b85999258 | ||
|
|
4b15d2c2ca | ||
|
|
bf1ca8d6f9 | ||
|
|
9a4b8cad2b | ||
|
|
28adb50330 | ||
|
|
7a3fdf8fa4 | ||
|
|
3fbcd65e9b |
@ -4,7 +4,7 @@ level = "INFO"
|
|||||||
[service]
|
[service]
|
||||||
# Print document processing progress to stdout
|
# Print document processing progress to stdout
|
||||||
verbose = false
|
verbose = false
|
||||||
batch_size = 16
|
batch_size = 6
|
||||||
image_stiching_tolerance = 1 # in pixels
|
image_stiching_tolerance = 1 # in pixels
|
||||||
mlflow_run_id = "fabfb1f192c745369b88cab34471aba7"
|
mlflow_run_id = "fabfb1f192c745369b88cab34471aba7"
|
||||||
|
|
||||||
@ -36,4 +36,7 @@ max = 10
|
|||||||
[filters.overrides.signature.image_to_page_quotient]
|
[filters.overrides.signature.image_to_page_quotient]
|
||||||
max = 0.4
|
max = 0.4
|
||||||
|
|
||||||
|
[filters.overrides.logo.image_to_page_quotient]
|
||||||
|
min = 0.06
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
2941
poetry.lock
generated
2941
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "image-classification-service"
|
name = "image-classification-service"
|
||||||
version = "2.15.0"
|
version = "2.17.0"
|
||||||
description = ""
|
description = ""
|
||||||
authors = ["Team Research <research@knecon.com>"]
|
authors = ["Team Research <research@knecon.com>"]
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
@ -10,8 +10,8 @@ packages = [{ include = "image_prediction", from = "src" }]
|
|||||||
python = ">=3.10,<3.11"
|
python = ">=3.10,<3.11"
|
||||||
# FIXME: This should be recent pyinfra, but the recent protobuf packages are not compatible with tensorflow 2.9.0, also
|
# FIXME: This should be recent pyinfra, but the recent protobuf packages are not compatible with tensorflow 2.9.0, also
|
||||||
# see RED-9948.
|
# see RED-9948.
|
||||||
pyinfra = { version = "3.2.11", source = "gitlab-research" }
|
pyinfra = { version = "3.4.2", source = "gitlab-research" }
|
||||||
kn-utils = { version = "^0.2.7", source = "gitlab-research" }
|
kn-utils = { version = ">=0.4.0", source = "gitlab-research" }
|
||||||
dvc = "^2.34.0"
|
dvc = "^2.34.0"
|
||||||
dvc-ssh = "^2.20.0"
|
dvc-ssh = "^2.20.0"
|
||||||
dvc-azure = "^2.21.2"
|
dvc-azure = "^2.21.2"
|
||||||
|
|||||||
@ -10,6 +10,8 @@ from image_prediction.utils.pdf_annotation import annotate_pdf
|
|||||||
|
|
||||||
logger = get_logger()
|
logger = get_logger()
|
||||||
|
|
||||||
|
logger.setLevel("DEBUG")
|
||||||
|
|
||||||
|
|
||||||
def parse_args():
|
def parse_args():
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
import os
|
import os
|
||||||
from functools import lru_cache, partial
|
from functools import lru_cache, partial
|
||||||
from itertools import chain, tee
|
from itertools import chain, tee
|
||||||
|
from typing import Iterable, Any
|
||||||
|
|
||||||
from funcy import rcompose, first, compose, second, chunks, identity, rpartial
|
from funcy import rcompose, first, compose, second, chunks, identity, rpartial
|
||||||
from kn_utils.logging import logger
|
from kn_utils.logging import logger
|
||||||
@ -54,7 +55,7 @@ class Pipeline:
|
|||||||
join = compose(starlift(lambda prd, rpr, mdt: {"classification": prd, **mdt, "representation": rpr}), star(zip))
|
join = compose(starlift(lambda prd, rpr, mdt: {"classification": prd, **mdt, "representation": rpr}), star(zip))
|
||||||
|
|
||||||
# />--classify--\
|
# />--classify--\
|
||||||
# --extract-->--split--+->--encode---->+--join-->reformat
|
# --extract-->--split--+->--encode---->+--join-->reformat-->filter_duplicates
|
||||||
# \>--identity--/
|
# \>--identity--/
|
||||||
|
|
||||||
self.pipe = rcompose(
|
self.pipe = rcompose(
|
||||||
@ -63,6 +64,7 @@ class Pipeline:
|
|||||||
pairwise_apply(classify, represent, identity), # ... apply functions to the streams pairwise
|
pairwise_apply(classify, represent, identity), # ... apply functions to the streams pairwise
|
||||||
join, # ... the streams by zipping
|
join, # ... the streams by zipping
|
||||||
reformat, # ... the items
|
reformat, # ... the items
|
||||||
|
filter_duplicates, # ... filter out duplicate images
|
||||||
)
|
)
|
||||||
|
|
||||||
def __call__(self, pdf: bytes, page_range: range = None):
|
def __call__(self, pdf: bytes, page_range: range = None):
|
||||||
@ -72,3 +74,32 @@ class Pipeline:
|
|||||||
unit=" images",
|
unit=" images",
|
||||||
disable=not self.verbose,
|
disable=not self.verbose,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def filter_duplicates(metadata: Iterable[dict[str, Any]]) -> Iterable[dict[str, Any]]:
    """Yield a single metadata entry for every distinct image location.

    Two entries are treated as duplicates when the `x1`, `x2`, `y1`, `y2` and `pageNumber` values of their
    `position` mapping are all equal. The first entry seen for a location is stored; a later duplicate replaces
    it only when that duplicate's `filters.allPassed` flag is truthy, otherwise the stored entry is kept.

    The whole input is consumed before anything is yielded. Entries are yielded in the order in which their
    location was first encountered (replacing a stored entry does not change its position in the output).

    See RED-10765 (RM-241): Removed redactions reappear for why this is necessary.
    """
    position_fields = ("x1", "x2", "y1", "y2", "pageNumber")
    unique_images: dict[tuple, dict[str, Any]] = {}

    for entry in metadata:
        position = entry["position"]
        location = tuple(position[field] for field in position_fields)

        # First time this location shows up: remember the entry and move on.
        if location not in unique_images:
            unique_images[location] = entry
            continue

        logger.warning(
            f"Duplicate image found: x1={location[0]}, x2={location[1]}, y1={location[2]}, y2={location[3]}, pageNumber={location[4]}"
        )

        # A duplicate that did not pass all filters never displaces the stored entry.
        if not entry["filters"]["allPassed"]:
            logger.warning("Keeping the previous image since the current image has allPassed flag set to False")
            continue

        logger.warning("Setting the image with allPassed flag set to True")
        unique_images[location] = entry

    yield from unique_images.values()
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
outs:
|
outs:
|
||||||
- md5: ab352d3b2c62ce2293cafb57c1b41b01.dir
|
- md5: 08bf8a63f04b3f19f859008556699708.dir
|
||||||
size: 7469082
|
size: 7979836
|
||||||
nfiles: 6
|
nfiles: 7
|
||||||
path: data
|
path: data
|
||||||
|
|||||||
35
test/regressions_tests/image_deduplication_test.py
Normal file
35
test/regressions_tests/image_deduplication_test.py
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from image_prediction.config import CONFIG
|
||||||
|
from image_prediction.pipeline import load_pipeline
|
||||||
|
|
||||||
|
|
||||||
|
def test_all_duplicate_images_are_filtered():
    """Regression test for RED-10765 (RM-241): Removed redactions reappear.

    Runs the pipeline on the shortened RM-241 fixture PDF and checks that no two predictions share the same
    position and page number, that exactly one prediction has `allPassed` set, and that 177 predictions are
    returned in total.
    """
    fixture_dir = Path(__file__).parents[1] / "data" / "RED-10765"
    pdf_path = fixture_dir / "RM-241-461c90d6d6dc0416ad5f0b05feef4dfc.UNTOUCHED_shortened.pdf"
    pdf_bytes = pdf_path.read_bytes()

    pipeline = load_pipeline(verbose=True, batch_size=CONFIG.service.batch_size)
    predictions = list(pipeline(pdf_bytes))

    # Every (x1, x2, y1, y2, pageNumber) combination may occur at most once.
    position_fields = ("x1", "x2", "y1", "y2", "pageNumber")
    seen = set()
    for prediction in predictions:
        position = prediction["position"]
        key = tuple(position[field] for field in position_fields)
        assert key not in seen, f"Duplicate found: {key}"
        seen.add(key)

    passing = [prediction for prediction in predictions if prediction["filters"]["allPassed"]]
    all_passed = len(passing)
    assert all_passed == 1, f"Expected 1 image with allPassed flag set to True, but got {all_passed}"

    assert len(predictions) == 177, f"Expected 177 images, but got {len(predictions)}"
|
||||||
Loading…
x
Reference in New Issue
Block a user