Add test for handling of bad xrefs
This commit is contained in:
parent
e63f66a126
commit
f6dbfcab43
@ -3,7 +3,7 @@ webserver:
|
|||||||
port: $SERVER_PORT|5000 # webserver port
|
port: $SERVER_PORT|5000 # webserver port
|
||||||
|
|
||||||
service:
|
service:
|
||||||
logging_level: $LOGGING_LEVEL_ROOT|INFO # Logging level for service logger
|
logging_level: $LOGGING_LEVEL_ROOT|DEBUG # Logging level for service logger
|
||||||
verbose: $VERBOSE|True # Service prints document processing progress to stdout
|
verbose: $VERBOSE|True # Service prints document processing progress to stdout
|
||||||
batch_size: $BATCH_SIZE|16 # Number of images in memory simultaneously
|
batch_size: $BATCH_SIZE|16 # Number of images in memory simultaneously
|
||||||
mlflow_run_id: $MLFLOW_RUN_ID|fabfb1f192c745369b88cab34471aba7 # The ID of the mlflow run to load the service_estimator from
|
mlflow_run_id: $MLFLOW_RUN_ID|fabfb1f192c745369b88cab34471aba7 # The ID of the mlflow run to load the service_estimator from
|
||||||
|
|||||||
@ -36,3 +36,7 @@ class InvalidBox(Exception):
|
|||||||
|
|
||||||
class ParsingError(Exception):
|
class ParsingError(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class BadXref(ValueError):
|
||||||
|
pass
|
||||||
|
|||||||
@ -12,7 +12,7 @@ import numpy as np
|
|||||||
from PIL import Image
|
from PIL import Image
|
||||||
from funcy import merge, compose, rcompose, keep
|
from funcy import merge, compose, rcompose, keep
|
||||||
|
|
||||||
from image_prediction.exceptions import InvalidBox
|
from image_prediction.exceptions import InvalidBox, BadXref
|
||||||
from image_prediction.formatter.formatters.enum import EnumFormatter
|
from image_prediction.formatter.formatters.enum import EnumFormatter
|
||||||
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
|
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
|
||||||
from image_prediction.info import Info
|
from image_prediction.info import Info
|
||||||
@ -118,23 +118,29 @@ def get_metadata_for_images_on_page(page: fitz.Page):
|
|||||||
|
|
||||||
@lru_cache(maxsize=None)
|
@lru_cache(maxsize=None)
|
||||||
def xref_to_maybe_image(doc, xref) -> Union[Image.Image, None]:
|
def xref_to_maybe_image(doc, xref) -> Union[Image.Image, None]:
|
||||||
def extrac_image(xref):
|
|
||||||
pixmap = fitz.Pixmap(doc, xref)
|
|
||||||
array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
|
|
||||||
array = normalize_channels(array)
|
|
||||||
return Image.fromarray(array)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return extrac_image(xref)
|
return extract_image(doc, xref)
|
||||||
except ValueError:
|
except BadXref:
|
||||||
logger.debug(f"Xref {xref} is invalid, skipping extraction ...")
|
return None
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
def make_maybe_image_metadata_pair(image, metadata):
|
def make_maybe_image_metadata_pair(image, metadata):
|
||||||
return ImageMetadataPair(image, metadata) if image and metadata else None
|
return ImageMetadataPair(image, metadata) if image and metadata else None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_image(doc, xref) -> Image.Image:
|
||||||
|
try:
|
||||||
|
pixmap = fitz.Pixmap(doc, xref)
|
||||||
|
except ValueError as err:
|
||||||
|
msg = f"Xref {xref} is invalid, skipping extraction ..."
|
||||||
|
logger.debug(msg)
|
||||||
|
raise BadXref(msg) from err
|
||||||
|
|
||||||
|
array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape((pixmap.h, pixmap.w, pixmap.n))
|
||||||
|
array = normalize_channels(array)
|
||||||
|
return Image.fromarray(array)
|
||||||
|
|
||||||
|
|
||||||
def has_alpha_channel(doc, xref):
|
def has_alpha_channel(doc, xref):
|
||||||
|
|
||||||
maybe_image = load_image_handle_from_xref(doc, xref)
|
maybe_image = load_image_handle_from_xref(doc, xref)
|
||||||
|
|||||||
@ -3,15 +3,14 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
MODULE_DIR = Path(__file__).resolve().parents[0]
|
MODULE_DIR = Path(__file__).resolve().parents[0]
|
||||||
|
|
||||||
PACKAGE_ROOT_DIR = MODULE_DIR.parents[0]
|
PACKAGE_ROOT_DIR = MODULE_DIR.parents[0]
|
||||||
|
|
||||||
CONFIG_FILE = PACKAGE_ROOT_DIR / "config.yaml"
|
CONFIG_FILE = PACKAGE_ROOT_DIR / "config.yaml"
|
||||||
|
|
||||||
BANNER_FILE = PACKAGE_ROOT_DIR / "banner.txt"
|
BANNER_FILE = PACKAGE_ROOT_DIR / "banner.txt"
|
||||||
|
|
||||||
DATA_DIR = PACKAGE_ROOT_DIR / "data"
|
DATA_DIR = PACKAGE_ROOT_DIR / "data"
|
||||||
|
|
||||||
MLRUNS_DIR = str(DATA_DIR / "mlruns")
|
MLRUNS_DIR = str(DATA_DIR / "mlruns")
|
||||||
|
|
||||||
TEST_DATA_DIR = PACKAGE_ROOT_DIR / "test" / "data"
|
TEST_DIR = PACKAGE_ROOT_DIR / "test"
|
||||||
|
TEST_DATA_DIR = TEST_DIR / "data"
|
||||||
|
TEST_DATA_DIR_DVC = TEST_DIR / "data.dvc"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user