Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
528ae3fafe | ||
|
|
e20a5623e6 |
@ -4,7 +4,7 @@ level = "INFO"
|
||||
[service]
|
||||
# Print document processing progress to stdout
|
||||
verbose = false
|
||||
batch_size = 6
|
||||
batch_size = 16
|
||||
image_stiching_tolerance = 1 # in pixels
|
||||
mlflow_run_id = "fabfb1f192c745369b88cab34471aba7"
|
||||
|
||||
@ -36,7 +36,4 @@ max = 10
|
||||
[filters.overrides.signature.image_to_page_quotient]
|
||||
max = 0.4
|
||||
|
||||
[filters.overrides.logo.image_to_page_quotient]
|
||||
min = 0.06
|
||||
|
||||
|
||||
|
||||
2941
poetry.lock
generated
2941
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -1,6 +1,6 @@
|
||||
[tool.poetry]
|
||||
name = "image-classification-service"
|
||||
version = "2.17.0"
|
||||
version = "2.15.1"
|
||||
description = ""
|
||||
authors = ["Team Research <research@knecon.com>"]
|
||||
readme = "README.md"
|
||||
@ -10,8 +10,8 @@ packages = [{ include = "image_prediction", from = "src" }]
|
||||
python = ">=3.10,<3.11"
|
||||
# FIXME: This should be recent pyinfra, but the recent protobuf packages are not compatible with tensorflow 2.9.0, also
|
||||
# see RED-9948.
|
||||
pyinfra = { version = "3.4.2", source = "gitlab-research" }
|
||||
kn-utils = { version = ">=0.4.0", source = "gitlab-research" }
|
||||
pyinfra = { version = "3.2.11", source = "gitlab-research" }
|
||||
kn-utils = { version = "^0.2.7", source = "gitlab-research" }
|
||||
dvc = "^2.34.0"
|
||||
dvc-ssh = "^2.20.0"
|
||||
dvc-azure = "^2.21.2"
|
||||
|
||||
@ -10,8 +10,6 @@ from image_prediction.utils.pdf_annotation import annotate_pdf
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
logger.setLevel("DEBUG")
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
@ -77,29 +77,30 @@ class Pipeline:
|
||||
|
||||
|
||||
def filter_duplicates(metadata: Iterable[dict[str, Any]]) -> Iterable[dict[str, Any]]:
|
||||
"""Filter out duplicate images from the `position` (image coordinates) and `page`, preferring the one with
|
||||
`allPassed` set to True.
|
||||
"""Filter out duplicate images from the `position` (image coordinates), `page` and `representation` (perceptual hash).
|
||||
See RED-10765 (RM-241): Removed redactions reappear for why this is necessary.
|
||||
"""
|
||||
keep = dict()
|
||||
for image_meta in metadata:
|
||||
key: tuple[int, int, int, int, int] = (
|
||||
image_meta["position"]["x1"],
|
||||
image_meta["position"]["x2"],
|
||||
image_meta["position"]["y1"],
|
||||
image_meta["position"]["y2"],
|
||||
image_meta["position"]["pageNumber"],
|
||||
)
|
||||
if key in keep:
|
||||
logger.warning(
|
||||
f"Duplicate image found: x1={key[0]}, x2={key[1]}, y1={key[2]}, y2={key[3]}, pageNumber={key[4]}"
|
||||
)
|
||||
if image_meta["filters"]["allPassed"]:
|
||||
logger.warning("Setting the image with allPassed flag set to True")
|
||||
keep[key] = image_meta
|
||||
else:
|
||||
logger.warning("Keeping the previous image since the current image has allPassed flag set to False")
|
||||
else:
|
||||
keep[key] = image_meta
|
||||
|
||||
yield from keep.values()
|
||||
Args:
|
||||
metadata: Iterable of image metadata dicts.
|
||||
|
||||
Returns:
|
||||
Iterable of image metadata dicts with duplicates removed.
|
||||
"""
|
||||
seen = set()
|
||||
for item in metadata:
|
||||
key = (
|
||||
item["representation"],
|
||||
item["position"]["x1"],
|
||||
item["position"]["x2"],
|
||||
item["position"]["y1"],
|
||||
item["position"]["y2"],
|
||||
item["position"]["pageNumber"],
|
||||
)
|
||||
if key not in seen:
|
||||
seen.add(key)
|
||||
yield item
|
||||
else:
|
||||
logger.warning(
|
||||
f"Duplicate image found: representation={key[0]}, x1={key[1]}, x2={key[2]}, y1={key[3]}, y2={key[4]}, pageNumber={key[5]}"
|
||||
)
|
||||
continue
|
||||
|
||||
@ -6,12 +6,7 @@ from image_prediction.pipeline import load_pipeline
|
||||
|
||||
def test_all_duplicate_images_are_filtered():
|
||||
"""See RED-10765 (RM-241): Removed redactions reappear."""
|
||||
pdf_path = (
|
||||
Path(__file__).parents[1]
|
||||
/ "data"
|
||||
/ "RED-10765"
|
||||
/ "RM-241-461c90d6d6dc0416ad5f0b05feef4dfc.UNTOUCHED_shortened.pdf"
|
||||
)
|
||||
pdf_path = Path(__file__).parents[1] / "data" / "RED-10765" / "RM-241-461c90d6d6dc0416ad5f0b05feef4dfc.UNTOUCHED_shortened.pdf"
|
||||
pdf_bytes = pdf_path.read_bytes()
|
||||
|
||||
pipeline = load_pipeline(verbose=True, batch_size=CONFIG.service.batch_size)
|
||||
@ -19,17 +14,7 @@ def test_all_duplicate_images_are_filtered():
|
||||
|
||||
seen = set()
|
||||
for prediction in predictions:
|
||||
key = (
|
||||
prediction["position"]["x1"],
|
||||
prediction["position"]["x2"],
|
||||
prediction["position"]["y1"],
|
||||
prediction["position"]["y2"],
|
||||
prediction["position"]["pageNumber"],
|
||||
)
|
||||
key = (prediction['representation'], prediction['position']['x1'], prediction['position']['x2'], prediction['position']['y1'], prediction['position']['y2'], prediction['position']['pageNumber'])
|
||||
assert key not in seen, f"Duplicate found: {key}"
|
||||
seen.add(key)
|
||||
|
||||
all_passed = sum(1 for prediction in predictions if prediction["filters"]["allPassed"])
|
||||
assert all_passed == 1, f"Expected 1 image with allPassed flag set to True, but got {all_passed}"
|
||||
|
||||
assert len(predictions) == 177, f"Expected 177 images, but got {len(predictions)}"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user