Add test for handling of bad xrefs

This commit is contained in:
Matthias Bisping 2023-02-06 11:31:43 +01:00
parent e63f66a126
commit f6dbfcab43
4 changed files with 25 additions and 16 deletions

View File

@ -3,7 +3,7 @@ webserver:
port: $SERVER_PORT|5000 # webserver port
service:
logging_level: $LOGGING_LEVEL_ROOT|INFO # Logging level for service logger
logging_level: $LOGGING_LEVEL_ROOT|DEBUG # Logging level for service logger
verbose: $VERBOSE|True # Service prints document processing progress to stdout
batch_size: $BATCH_SIZE|16 # Number of images in memory simultaneously
mlflow_run_id: $MLFLOW_RUN_ID|fabfb1f192c745369b88cab34471aba7 # The ID of the mlflow run to load the service_estimator from

View File

@ -36,3 +36,7 @@ class InvalidBox(Exception):
class ParsingError(Exception):
    """Exception type for parsing failures.

    NOTE(review): the raise sites are not visible from here; the purpose is
    inferred from the class name only.
    """
class BadXref(ValueError):
    """Signals that an xref could not be turned into a pixmap.

    Subclasses ``ValueError``, so handlers written as ``except ValueError``
    keep catching it.
    """

View File

@ -12,7 +12,7 @@ import numpy as np
from PIL import Image
from funcy import merge, compose, rcompose, keep
from image_prediction.exceptions import InvalidBox
from image_prediction.exceptions import InvalidBox, BadXref
from image_prediction.formatter.formatters.enum import EnumFormatter
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
from image_prediction.info import Info
@ -118,23 +118,29 @@ def get_metadata_for_images_on_page(page: fitz.Page):
@lru_cache(maxsize=None)
def xref_to_maybe_image(doc, xref) -> Union[Image.Image, None]:
    """Return the image referenced by ``xref`` in ``doc``, or ``None`` for a bad xref.

    Extraction is delegated to the module-level ``extract_image``; the only
    job of this wrapper is to translate ``BadXref`` into a ``None`` result.

    Results are memoised by ``lru_cache(maxsize=None)``: both arguments must
    be hashable and cached entries are never evicted.
    NOTE(review): the unbounded cache keeps every ``doc`` it has seen alive;
    confirm that this is acceptable for long-running processes.

    Args:
        doc: document handle that is passed through to ``extract_image``.
        xref: cross-reference number of the object to extract.

    Returns:
        The extracted image, or ``None`` if ``extract_image`` raised ``BadXref``.
    """
    try:
        return extract_image(doc, xref)
    except BadXref:
        # extract_image logs the invalid xref before raising, so there is
        # nothing left to report here.
        return None
def make_maybe_image_metadata_pair(image, metadata):
    """Pair ``image`` with ``metadata``, or return ``None`` if either is falsy."""
    if not (image and metadata):
        return None
    return ImageMetadataPair(image, metadata)
def extract_image(doc, xref) -> Image.Image:
    """Build a PIL image from the pixmap that ``xref`` refers to in ``doc``.

    The pixmap's sample buffer is viewed as a ``uint8`` array of shape
    ``(height, width, channels)`` and passed through ``normalize_channels``
    before being converted to an image.

    Args:
        doc: document handle accepted by ``fitz.Pixmap``.
        xref: cross-reference number of the object to load.

    Returns:
        The image created from the channel-normalised pixel array.

    Raises:
        BadXref: if ``fitz.Pixmap`` raises ``ValueError`` for this xref. The
            message is also logged at debug level and the original error is
            chained as the cause.
    """
    try:
        pixmap = fitz.Pixmap(doc, xref)
    except ValueError as cause:
        message = f"Xref {xref} is invalid, skipping extraction ..."
        logger.debug(message)
        raise BadXref(message) from cause
    else:
        flat_samples = np.frombuffer(pixmap.samples, dtype=np.uint8)
        pixels = flat_samples.reshape((pixmap.h, pixmap.w, pixmap.n))
        return Image.fromarray(normalize_channels(pixels))
def has_alpha_channel(doc, xref):
maybe_image = load_image_handle_from_xref(doc, xref)

View File

@ -3,15 +3,14 @@
from pathlib import Path
# Directory that contains this module, and the package root one level above it.
MODULE_DIR = Path(__file__).resolve().parents[0]
PACKAGE_ROOT_DIR = MODULE_DIR.parents[0]

# Files located directly in the package root.
CONFIG_FILE = PACKAGE_ROOT_DIR / "config.yaml"
BANNER_FILE = PACKAGE_ROOT_DIR / "banner.txt"

# Data locations. MLRUNS_DIR is stored as a str, unlike the Path constants.
DATA_DIR = PACKAGE_ROOT_DIR / "data"
MLRUNS_DIR = str(DATA_DIR / "mlruns")

# Test locations, all derived from TEST_DIR so the "test" segment is spelled
# once and TEST_DATA_DIR is defined exactly once.
TEST_DIR = PACKAGE_ROOT_DIR / "test"
TEST_DATA_DIR = TEST_DIR / "data"
TEST_DATA_DIR_DVC = TEST_DIR / "data.dvc"