Compare commits
4 Commits
master
...
release/1.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2a62c4fca2 | ||
|
|
f48455c496 | ||
|
|
ee48f141f8 | ||
|
|
c03913e088 |
@ -18,6 +18,10 @@ filters:
|
||||
image_to_page_quotient: # Image size to page size ratio (ratio of geometric means of areas)
|
||||
min: $MIN_REL_IMAGE_SIZE|0.05 # Minimum permissible
|
||||
max: $MAX_REL_IMAGE_SIZE|0.75 # Maximum permissible
|
||||
customized: # Customized settings per class (RED-5202)
|
||||
max:
|
||||
signature: $MAX_REL_SIGNATURE_SIZE|0.4
|
||||
|
||||
|
||||
image_width_to_height_quotient: # Image width to height ratio
|
||||
min: $MIN_IMAGE_FORMAT|0.1 # Minimum permissible
|
||||
|
||||
@ -15,6 +15,12 @@ class DotIndexable:
|
||||
def __init__(self, x):
    # Store the wrapped container; get()/__getattr__ below delegate item
    # access to it via _get_item_and_maybe_make_dotindexable.
    self.x = x
|
||||
|
||||
def get(self, item, default=None):
    """Dict-style lookup on the wrapped container.

    Returns the item (wrapped as DotIndexable when it is itself a container),
    or *default* when the key is absent.
    """
    try:
        result = _get_item_and_maybe_make_dotindexable(self.x, item)
    except KeyError:
        # Absent key: mirror dict.get semantics instead of raising.
        result = default
    return result
|
||||
|
||||
def __getattr__(self, item):
    # Attribute access falls through to item access on the wrapped container,
    # enabling dot-notation like config.filters.max (raises KeyError, not
    # AttributeError, when the item is missing — by design of the helper).
    return _get_item_and_maybe_make_dotindexable(self.x, item)
|
||||
|
||||
|
||||
@ -1,14 +1,17 @@
|
||||
import atexit
|
||||
import io
|
||||
import json
|
||||
import traceback
|
||||
from functools import partial, lru_cache
|
||||
from itertools import chain, starmap, filterfalse
|
||||
from operator import itemgetter
|
||||
from operator import itemgetter, truth
|
||||
from typing import List
|
||||
|
||||
import fitz
|
||||
from PIL import Image
|
||||
from funcy import rcompose, merge, pluck, curry, compose
|
||||
|
||||
from image_prediction.formatter.formatters.enum import EnumFormatter
|
||||
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
|
||||
from image_prediction.info import Info
|
||||
from image_prediction.stitching.stitching import stitch_pairs
|
||||
@ -46,10 +49,28 @@ class ParsablePDFImageExtractor(ImageExtractor):
|
||||
clear_caches()
|
||||
|
||||
image_metadata_pairs = starmap(ImageMetadataPair, filter(all, zip(images, metadata)))
|
||||
# TODO: In the future, consider to introduce an image validator as a pipeline component rather than doing the
|
||||
# validation here. Invalid images can then be split into a different stream and joined with the intact images
|
||||
# again for the formatting step.
|
||||
image_metadata_pairs = self.__filter_valid_images(image_metadata_pairs)
|
||||
image_metadata_pairs = stitch_pairs(list(image_metadata_pairs), tolerance=self.tolerance)
|
||||
|
||||
yield from image_metadata_pairs
|
||||
|
||||
@staticmethod
def __filter_valid_images(image_metadata_pairs):
    """Drop pairs whose image PIL cannot actually decode/process.

    Each surviving pair is re-wrapped as an ImageMetadataPair; invalid
    images are logged with their metadata and filtered out.
    """
    def validate(image: Image.Image, metadata: dict):
        try:
            # TODO: stand-in heuristic for testing if image is valid => find cleaner solution
            image.resize((100, 100)).convert("RGB")
            return ImageMetadataPair(image, metadata)
        except Exception:
            # Was `except (OSError, Exception)` — redundant, OSError is an
            # Exception subclass. Broad catch is deliberate: any decode
            # failure means "invalid image", best-effort skip with a warning.
            formatted_metadata = json.dumps(EnumFormatter()(metadata), indent=2)
            logger.warning(
                f"Invalid image encountered. Image metadata:\n{formatted_metadata}\n\n{traceback.format_exc()}"
            )
            return None

    # truth() drops the None sentinels returned for invalid images.
    return filter(truth, starmap(validate, image_metadata_pairs))
|
||||
|
||||
|
||||
def extract_pages(doc, page_range):
|
||||
page_range = range(page_range.start + 1, page_range.stop + 1)
|
||||
|
||||
@ -20,6 +20,13 @@ def build_image_info(data: dict) -> dict:
|
||||
image_area_sqrt = math.sqrt(abs(x2 - x1) * abs(y2 - y1))
|
||||
return image_area_sqrt / page_area_sqrt
|
||||
|
||||
def is_max_image_to_page_quotient_breached(quotient, label):
    """Return True when *quotient* exceeds the maximum allowed for *label*.

    The maximum is looked up in the per-class ("customized") config section
    (RED-5202); labels without an entry fall back to the global maximum.
    """
    default_max_quotient = CONFIG.filters.image_to_page_quotient.max
    customized_entries = CONFIG.filters.image_to_page_quotient.customized.max
    # .get already falls back when the label has no customized entry.
    max_quotient = customized_entries.get(label, default_max_quotient)
    # NOTE(review): falsy check additionally replaces an explicit null/0 entry
    # with the default — presumably a guard against empty YAML values; confirm
    # that 0 is never a legitimate customized maximum.
    max_quotient = max_quotient if max_quotient else default_max_quotient
    return bool(quotient > max_quotient)
|
||||
|
||||
page_width, page_height, x1, x2, y1, y2, width, height, alpha = itemgetter(
|
||||
"page_width", "page_height", "x1", "x2", "y1", "y2", "width", "height", "alpha"
|
||||
)(data)
|
||||
@ -27,7 +34,9 @@ def build_image_info(data: dict) -> dict:
|
||||
quotient = round(compute_geometric_quotient(), 4)
|
||||
|
||||
min_image_to_page_quotient_breached = bool(quotient < CONFIG.filters.image_to_page_quotient.min)
|
||||
max_image_to_page_quotient_breached = bool(quotient > CONFIG.filters.image_to_page_quotient.max)
|
||||
max_image_to_page_quotient_breached = is_max_image_to_page_quotient_breached(
|
||||
quotient, data["classification"]["label"]
|
||||
)
|
||||
min_image_width_to_height_quotient_breached = bool(
|
||||
width / height < CONFIG.filters.image_width_to_height_quotient.min
|
||||
)
|
||||
|
||||
@ -10,7 +10,6 @@ from image_prediction.utils.banner import show_banner
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
def predict(pdf):
|
||||
# Keras service_estimator.predict stalls when service_estimator was loaded in different process;
|
||||
# therefore, we re-load the model (part of the pipeline) every time we process a new document.
|
||||
|
||||
@ -17,7 +17,7 @@ pytest_plugins = [
|
||||
"test.fixtures.parameters",
|
||||
"test.fixtures.pdf",
|
||||
"test.fixtures.target",
|
||||
"test.unit_tests.image_stitching_test"
|
||||
"test.unit_tests.image_stitching_test",
|
||||
]
|
||||
|
||||
|
||||
|
||||
@ -3,7 +3,7 @@ from funcy import rcompose, chunks
|
||||
|
||||
|
||||
def test_rcompose():
    """funcy.rcompose applies left-to-right: 3 -> 9 -> "9" -> "99"."""
    # Diff-residue duplicate assignment removed: the scrape contained both the
    # old (`x ** 2`) and new (`x**2`) line of the same statement; keep one.
    f = rcompose(lambda x: x**2, str, lambda x: x * 2)
    assert f(3) == "99"
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user