Compare commits

..

No commits in common. "master" and "2.12.0" have entirely different histories.

9 changed files with 1683 additions and 2238 deletions

View File

@ -1 +1 @@
3.10 3.10.12

View File

@ -1,7 +1,4 @@
[asyncio]
max_concurrent_tasks = 10
[dynamic_tenant_queues] [dynamic_tenant_queues]
enabled = true enabled = true

View File

@ -4,7 +4,7 @@ level = "INFO"
[service] [service]
# Print document processing progress to stdout # Print document processing progress to stdout
verbose = false verbose = false
batch_size = 6 batch_size = 16
image_stiching_tolerance = 1 # in pixels image_stiching_tolerance = 1 # in pixels
mlflow_run_id = "fabfb1f192c745369b88cab34471aba7" mlflow_run_id = "fabfb1f192c745369b88cab34471aba7"
@ -36,7 +36,4 @@ max = 10
[filters.overrides.signature.image_to_page_quotient] [filters.overrides.signature.image_to_page_quotient]
max = 0.4 max = 0.4
[filters.overrides.logo.image_to_page_quotient]
min = 0.06

3829
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "image-classification-service" name = "image-classification-service"
version = "2.17.0" version = "2.11.0"
description = "" description = ""
authors = ["Team Research <research@knecon.com>"] authors = ["Team Research <research@knecon.com>"]
readme = "README.md" readme = "README.md"
@ -10,8 +10,8 @@ packages = [{ include = "image_prediction", from = "src" }]
python = ">=3.10,<3.11" python = ">=3.10,<3.11"
# FIXME: This should be recent pyinfra, but the recent protobuf packages are not compatible with tensorflow 2.9.0, also # FIXME: This should be recent pyinfra, but the recent protobuf packages are not compatible with tensorflow 2.9.0, also
# see RED-9948. # see RED-9948.
pyinfra = { version = "3.4.2", source = "gitlab-research" } pyinfra = { version = "3.2.6.dev263", source = "gitlab-research" }
kn-utils = { version = ">=0.4.0", source = "gitlab-research" } kn-utils = { version = "0.2.7", source = "gitlab-research" }
dvc = "^2.34.0" dvc = "^2.34.0"
dvc-ssh = "^2.20.0" dvc-ssh = "^2.20.0"
dvc-azure = "^2.21.2" dvc-azure = "^2.21.2"

View File

@ -10,8 +10,6 @@ from image_prediction.utils.pdf_annotation import annotate_pdf
logger = get_logger() logger = get_logger()
logger.setLevel("DEBUG")
def parse_args(): def parse_args():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()

View File

@ -1,7 +1,6 @@
import os import os
from functools import lru_cache, partial from functools import lru_cache, partial
from itertools import chain, tee from itertools import chain, tee
from typing import Iterable, Any
from funcy import rcompose, first, compose, second, chunks, identity, rpartial from funcy import rcompose, first, compose, second, chunks, identity, rpartial
from kn_utils.logging import logger from kn_utils.logging import logger
@ -55,7 +54,7 @@ class Pipeline:
join = compose(starlift(lambda prd, rpr, mdt: {"classification": prd, **mdt, "representation": rpr}), star(zip)) join = compose(starlift(lambda prd, rpr, mdt: {"classification": prd, **mdt, "representation": rpr}), star(zip))
# />--classify--\ # />--classify--\
# --extract-->--split--+->--encode---->+--join-->reformat-->filter_duplicates # --extract-->--split--+->--encode---->+--join-->reformat
# \>--identity--/ # \>--identity--/
self.pipe = rcompose( self.pipe = rcompose(
@ -64,7 +63,6 @@ class Pipeline:
pairwise_apply(classify, represent, identity), # ... apply functions to the streams pairwise pairwise_apply(classify, represent, identity), # ... apply functions to the streams pairwise
join, # ... the streams by zipping join, # ... the streams by zipping
reformat, # ... the items reformat, # ... the items
filter_duplicates, # ... filter out duplicate images
) )
def __call__(self, pdf: bytes, page_range: range = None): def __call__(self, pdf: bytes, page_range: range = None):
@ -74,32 +72,3 @@ class Pipeline:
unit=" images", unit=" images",
disable=not self.verbose, disable=not self.verbose,
) )
def filter_duplicates(metadata: Iterable[dict[str, Any]]) -> Iterable[dict[str, Any]]:
    """Yield one image per unique (x1, x2, y1, y2, pageNumber) position.

    When several images share the same position on the same page, the most
    recent one whose ``filters.allPassed`` flag is truthy wins; if none of the
    duplicates passed, the first one encountered is kept. Results are yielded
    in the order in which each position was first seen, and only after the
    whole input has been consumed.

    See RED-10765 (RM-241): "Removed redactions reappear" for the motivation.
    """
    keep: dict[tuple, dict[str, Any]] = {}

    for image_meta in metadata:
        position = image_meta["position"]
        key = (
            position["x1"],
            position["x2"],
            position["y1"],
            position["y2"],
            position["pageNumber"],
        )

        # First time this position shows up: remember it and move on.
        if key not in keep:
            keep[key] = image_meta
            continue

        logger.warning(
            f"Duplicate image found: x1={key[0]}, x2={key[1]}, y1={key[2]}, y2={key[3]}, pageNumber={key[4]}"
        )

        # A duplicate that did not pass the filters never displaces the stored image.
        if not image_meta["filters"]["allPassed"]:
            logger.warning("Keeping the previous image since the current image has allPassed flag set to False")
            continue

        logger.warning("Setting the image with allPassed flag set to True")
        keep[key] = image_meta

    yield from keep.values()

View File

@ -1,5 +1,5 @@
outs: outs:
- md5: 08bf8a63f04b3f19f859008556699708.dir - md5: ab352d3b2c62ce2293cafb57c1b41b01.dir
size: 7979836 size: 7469082
nfiles: 7 nfiles: 6
path: data path: data

View File

@ -1,35 +0,0 @@
from pathlib import Path
from image_prediction.config import CONFIG
from image_prediction.pipeline import load_pipeline
def test_all_duplicate_images_are_filtered():
    """Regression test for RED-10765 (RM-241): Removed redactions reappear.

    Runs the pipeline on the shortened RM-241 fixture and checks that no two
    predictions share the same (x1, x2, y1, y2, pageNumber) position, that
    exactly one prediction has ``allPassed`` set, and that 177 predictions
    are returned in total.
    """
    fixture = Path(__file__).parents[1].joinpath(
        "data",
        "RED-10765",
        "RM-241-461c90d6d6dc0416ad5f0b05feef4dfc.UNTOUCHED_shortened.pdf",
    )
    pdf_bytes = fixture.read_bytes()

    pipeline = load_pipeline(verbose=True, batch_size=CONFIG.service.batch_size)
    predictions = list(pipeline(pdf_bytes))

    # Every position/page combination may occur at most once in the output.
    seen = set()
    for prediction in predictions:
        position = prediction["position"]
        key = (
            position["x1"],
            position["x2"],
            position["y1"],
            position["y2"],
            position["pageNumber"],
        )
        assert key not in seen, f"Duplicate found: {key}"
        seen.add(key)

    all_passed = len([prediction for prediction in predictions if prediction["filters"]["allPassed"]])
    assert all_passed == 1, f"Expected 1 image with allPassed flag set to True, but got {all_passed}"

    assert len(predictions) == 177, f"Expected 177 images, but got {len(predictions)}"