From f6dbfcab43416fb2e4f3ea8617ef63abb69f4e4f Mon Sep 17 00:00:00 2001
From: Matthias Bisping
Date: Mon, 6 Feb 2023 11:31:43 +0100
Subject: [PATCH] Add test for handling of bad xrefs

---
 config.yaml                                   |  2 +-
 image_prediction/exceptions.py                |  4 +++
 .../image_extractor/extractors/parsable.py    | 28 +++++++++++--------
 image_prediction/locations.py                 |  7 ++---
 4 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/config.yaml b/config.yaml
index 6a6111a..d0f3c96 100644
--- a/config.yaml
+++ b/config.yaml
@@ -3,7 +3,7 @@ webserver:
   port: $SERVER_PORT|5000 # webserver port
 
 service:
-  logging_level: $LOGGING_LEVEL_ROOT|INFO # Logging level for service logger
+  logging_level: $LOGGING_LEVEL_ROOT|DEBUG # Logging level for service logger
   verbose: $VERBOSE|True # Service prints document processing progress to stdout
   batch_size: $BATCH_SIZE|16 # Number of images in memory simultaneously
   mlflow_run_id: $MLFLOW_RUN_ID|fabfb1f192c745369b88cab34471aba7 # The ID of the mlflow run to load the service_estimator from
diff --git a/image_prediction/exceptions.py b/image_prediction/exceptions.py
index f03b42a..9c9ca49 100644
--- a/image_prediction/exceptions.py
+++ b/image_prediction/exceptions.py
@@ -36,3 +36,7 @@ class InvalidBox(Exception):
 
 class ParsingError(Exception):
     pass
+
+
+class BadXref(ValueError):
+    pass
diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py
index 97b908e..102a9a4 100644
--- a/image_prediction/image_extractor/extractors/parsable.py
+++ b/image_prediction/image_extractor/extractors/parsable.py
@@ -12,7 +12,7 @@ import numpy as np
 from PIL import Image
 from funcy import merge, compose, rcompose, keep
 
-from image_prediction.exceptions import InvalidBox
+from image_prediction.exceptions import InvalidBox, BadXref
 from image_prediction.formatter.formatters.enum import EnumFormatter
 from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
 from image_prediction.info import Info
@@ -118,23 +118,29 @@ def get_metadata_for_images_on_page(page: fitz.Page):
 
 @lru_cache(maxsize=None)
 def xref_to_maybe_image(doc, xref) -> Union[Image.Image, None]:
-    def extrac_image(xref):
-        pixmap = fitz.Pixmap(doc, xref)
-        array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
-        array = normalize_channels(array)
-        return Image.fromarray(array)
-
     try:
-        return extrac_image(xref)
-    except ValueError:
-        logger.debug(f"Xref {xref} is invalid, skipping extraction ...")
-        return
+        return extract_image(doc, xref)
+    except BadXref:
+        return None
 
 
 def make_maybe_image_metadata_pair(image, metadata):
     return ImageMetadataPair(image, metadata) if image and metadata else None
 
 
+def extract_image(doc, xref) -> Image.Image:
+    try:
+        pixmap = fitz.Pixmap(doc, xref)
+    except ValueError as err:
+        msg = f"Xref {xref} is invalid, skipping extraction ..."
+        logger.debug(msg)
+        raise BadXref(msg) from err
+
+    array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape((pixmap.h, pixmap.w, pixmap.n))
+    array = normalize_channels(array)
+    return Image.fromarray(array)
+
+
 def has_alpha_channel(doc, xref):
     maybe_image = load_image_handle_from_xref(doc, xref)
 
diff --git a/image_prediction/locations.py b/image_prediction/locations.py
index 1f14c1a..9374ace 100644
--- a/image_prediction/locations.py
+++ b/image_prediction/locations.py
@@ -3,15 +3,14 @@ from pathlib import Path
 
 
 MODULE_DIR = Path(__file__).resolve().parents[0]
-
 PACKAGE_ROOT_DIR = MODULE_DIR.parents[0]
 
 CONFIG_FILE = PACKAGE_ROOT_DIR / "config.yaml"
-
 BANNER_FILE = PACKAGE_ROOT_DIR / "banner.txt"
 
 DATA_DIR = PACKAGE_ROOT_DIR / "data"
-
 MLRUNS_DIR = str(DATA_DIR / "mlruns")
 
-TEST_DATA_DIR = PACKAGE_ROOT_DIR / "test" / "data"
+TEST_DIR = PACKAGE_ROOT_DIR / "test"
+TEST_DATA_DIR = TEST_DIR / "data"
+TEST_DATA_DIR_DVC = TEST_DIR / "data.dvc"