diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index 27e0f33..303021b 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -10,14 +10,15 @@ from typing import Iterable, Iterator, List import fitz import numpy as np from PIL import Image -from funcy import merge, pluck, curry, compose, rcompose, remove +from funcy import merge, pluck, curry, compose, rcompose, remove, keep from image_prediction.config import CONFIG +from image_prediction.exceptions import InvalidBox from image_prediction.formatter.formatters.enum import EnumFormatter from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair from image_prediction.info import Info from image_prediction.stitching.stitching import stitch_pairs -from image_prediction.stitching.utils import validate_box_coords, validate_box_size +from image_prediction.stitching.utils import validate_box_coords, validate_box_size, validate_box from image_prediction.transformer.transformers.response import compute_geometric_quotient from image_prediction.utils import get_logger from image_prediction.utils.generic import lift @@ -101,9 +102,7 @@ def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page): def get_metadata_for_images_on_page(page: fitz.Page): - metadata = map(get_image_metadata, get_image_infos(page)) - metadata = add_page_metadata(page, metadata) yield from metadata @@ -111,15 +110,23 @@ def get_metadata_for_images_on_page(page: fitz.Page): def filter_valid_metadata(metadata): yield from compose( - # filter_out_page_sized_images, TODO: Link concept for extraction toggling and reclassification endpoint. - filter_invalid_metadata, # TODO: this doesn't filter but raises if images are invalid, maybe should filter - filter_out_tiny_images, # FIXME: this implicitly filters invalid metadata, e.g. 
for zero height images, - # This should be done in filter_invalid_metadata. + # TODO: Disabled for now, since atm the backend needs the metadata and the hash of every image, even + scanned pages. In the future, this should be resolved differently, e.g. by filtering all page-sized images + and giving the user the ability to reclassify false positives with a separate call. + # filter_out_page_sized_images, + filter_out_tiny_images, + filter_invalid_metadata, )(metadata) def filter_invalid_metadata(metadata): - return compose(validate_size_and_passthrough, validate_coords_and_passthrough)(metadata) + def invalid_box_filter(box): + try: + return validate_box(box) + except InvalidBox as e: + logger.debug(f"Dropping invalid metadatum, reason: {e}") + + yield from keep(invalid_box_filter, metadata) def filter_out_page_sized_images(metadata): @@ -142,6 +149,7 @@ def xref_to_image(doc, xref) -> Image: pixmap = fitz.Pixmap(doc, xref) except ValueError: # FIXME: Invalid xrefs occur here, this shouldn't be the case. + logger.debug(f"Xref {xref} is invalid, skipping extraction ...") return array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n) # TODO: Find a better solution: PIL.Image.fromarray doesn't take grayscale images of the shape (h, w, 1) but (h, w)