Adjust the filtering behavior for invalid images
This commit is contained in:
parent
c478333111
commit
eff1bb4124
@ -10,14 +10,15 @@ from typing import Iterable, Iterator, List
|
||||
import fitz
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from funcy import merge, pluck, curry, compose, rcompose, remove
|
||||
from funcy import merge, pluck, curry, compose, rcompose, remove, keep
|
||||
|
||||
from image_prediction.config import CONFIG
|
||||
from image_prediction.exceptions import InvalidBox
|
||||
from image_prediction.formatter.formatters.enum import EnumFormatter
|
||||
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
|
||||
from image_prediction.info import Info
|
||||
from image_prediction.stitching.stitching import stitch_pairs
|
||||
from image_prediction.stitching.utils import validate_box_coords, validate_box_size
|
||||
from image_prediction.stitching.utils import validate_box_coords, validate_box_size, validate_box
|
||||
from image_prediction.transformer.transformers.response import compute_geometric_quotient
|
||||
from image_prediction.utils import get_logger
|
||||
from image_prediction.utils.generic import lift
|
||||
@ -101,9 +102,7 @@ def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page):
|
||||
|
||||
|
||||
def get_metadata_for_images_on_page(page: fitz.Page):
|
||||
|
||||
metadata = map(get_image_metadata, get_image_infos(page))
|
||||
|
||||
metadata = add_page_metadata(page, metadata)
|
||||
|
||||
yield from metadata
|
||||
@ -111,15 +110,23 @@ def get_metadata_for_images_on_page(page: fitz.Page):
|
||||
|
||||
def filter_valid_metadata(metadata):
|
||||
yield from compose(
|
||||
# filter_out_page_sized_images, TODO: Link concept for extraction toggling and reclassification endpoint.
|
||||
filter_invalid_metadata, # TODO: this doesn't filter but raises if images are invalid, maybe should filter
|
||||
filter_out_tiny_images, # FIXME: this implicitly filters invalid metadata, e.g. for zero height images,
|
||||
# This should be done in filter_invalid_metadata.
|
||||
# TODO: Disabled for now, since the backend currently needs the metadata and the hash of every image, even
|
||||
# scanned pages. In the future, this should be resolved differently, e.g. by filtering all page-sized images
|
||||
# and giving the user the ability to reclassify false positives with a separate call.
|
||||
# filter_out_page_sized_images,
|
||||
filter_out_tiny_images,
|
||||
filter_invalid_metadata,
|
||||
)(metadata)
|
||||
|
||||
|
||||
def filter_invalid_metadata(metadata):
|
||||
return compose(validate_size_and_passthrough, validate_coords_and_passthrough)(metadata)
|
||||
def invalid_box_filter(box):
|
||||
try:
|
||||
return validate_box(box)
|
||||
except InvalidBox as e:
|
||||
logger.debug(f"Dropping invalid metadatum, reason: {e}")
|
||||
|
||||
yield from keep(invalid_box_filter, metadata)
|
||||
|
||||
|
||||
def filter_out_page_sized_images(metadata):
|
||||
@ -142,6 +149,7 @@ def xref_to_image(doc, xref) -> Image:
|
||||
pixmap = fitz.Pixmap(doc, xref)
|
||||
except ValueError:
|
||||
# FIXME: Invalid xrefs occur here, this shouldn't be the case.
|
||||
logger.debug(f"Xref {xref} is invalid, skipping extraction ...")
|
||||
return
|
||||
array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
|
||||
# TODO: Find a better solution: PIL.Image.fromarray doesn't accept grayscale images of shape (h, w, 1), only (h, w)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user