adjust behavior of filtering of invalid images

This commit is contained in:
Julius Unverfehrt 2023-02-03 09:04:02 +01:00
parent c478333111
commit eff1bb4124

View File

@ -10,14 +10,15 @@ from typing import Iterable, Iterator, List
import fitz
import numpy as np
from PIL import Image
from funcy import merge, pluck, curry, compose, rcompose, remove
from funcy import merge, pluck, curry, compose, rcompose, remove, keep
from image_prediction.config import CONFIG
from image_prediction.exceptions import InvalidBox
from image_prediction.formatter.formatters.enum import EnumFormatter
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
from image_prediction.info import Info
from image_prediction.stitching.stitching import stitch_pairs
from image_prediction.stitching.utils import validate_box_coords, validate_box_size
from image_prediction.stitching.utils import validate_box_coords, validate_box_size, validate_box
from image_prediction.transformer.transformers.response import compute_geometric_quotient
from image_prediction.utils import get_logger
from image_prediction.utils.generic import lift
@ -101,9 +102,7 @@ def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page):
def get_metadata_for_images_on_page(page: fitz.Page):
    """Yield the metadata of the images found on ``page``.

    The image infos of the page are mapped through ``get_image_metadata`` and
    the result is handed, together with the page itself, to
    ``add_page_metadata``; whatever that produces is yielded item by item.
    """
    image_infos = get_image_infos(page)
    per_image_metadata = map(get_image_metadata, image_infos)
    yield from add_page_metadata(page, per_image_metadata)
@ -111,15 +110,23 @@ def get_metadata_for_images_on_page(page: fitz.Page):
def filter_valid_metadata(metadata):
    """Yield only the metadata that survives every active filter stage.

    ``funcy.compose`` applies its arguments right to left, so the stages listed
    below run bottom-up: ``filter_invalid_metadata`` sees the metadata first
    and ``filter_out_tiny_images`` only receives what it lets through.
    """
    yield from compose(
        # TODO: Disabled for now because the backend currently needs the metadata and the hash of every image,
        #  even for scanned pages. In the future this should be resolved differently, e.g. by filtering all
        #  page-sized images and giving the user the ability to reclassify false positives with a separate call.
        # filter_out_page_sized_images,
        filter_out_tiny_images,
        filter_invalid_metadata,
    )(metadata)
def filter_invalid_metadata(metadata):
    """Yield the validated metadata, dropping entries that fail validation.

    Each metadatum is passed to ``validate_box``. If that raises
    ``InvalidBox``, the reason is logged at debug level and the metadatum is
    dropped instead of aborting the whole extraction.
    """

    def invalid_box_filter(box):
        # Returns None for an invalid box; `keep` discards falsy results.
        try:
            # NOTE(review): this relies on validate_box returning the (truthy)
            # metadatum on success -- confirm against stitching.utils.
            return validate_box(box)
        except InvalidBox as e:
            logger.debug(f"Dropping invalid metadatum, reason: {e}")

    yield from keep(invalid_box_filter, metadata)
def filter_out_page_sized_images(metadata):
@ -142,6 +149,7 @@ def xref_to_image(doc, xref) -> Image:
pixmap = fitz.Pixmap(doc, xref)
except ValueError:
# FIXME: Invalid xrefs occur here, this shouldn't be the case.
logger.debug(f"Xref {xref} is invalid, skipping extraction ...")
return
array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
# TODO: Find a better solution: PIL.Image.fromarray doesn't take grayscale images of the shape (h, w, 1) but (h, w)