refactoring

This commit is contained in:
Julius Unverfehrt 2023-02-03 11:14:14 +01:00
parent eff1bb4124
commit 2995d5ee48

View File

@ -5,7 +5,7 @@ from _operator import itemgetter
from functools import partial, lru_cache
from itertools import chain, starmap, filterfalse
from operator import itemgetter, truth
from typing import Iterable, Iterator, List
from typing import Iterable, Iterator, List, Union
import fitz
import numpy as np
@ -18,7 +18,7 @@ from image_prediction.formatter.formatters.enum import EnumFormatter
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
from image_prediction.info import Info
from image_prediction.stitching.stitching import stitch_pairs
from image_prediction.stitching.utils import validate_box_coords, validate_box_size, validate_box
from image_prediction.stitching.utils import validate_box
from image_prediction.transformer.transformers.response import compute_geometric_quotient
from image_prediction.utils import get_logger
from image_prediction.utils.generic import lift
@ -95,7 +95,7 @@ def get_images_on_page(doc, metadata):
# Runs a composed pipeline over `page` and returns its result. The stages handed to `compose`
# are `list`, `add_alpha_channel_info` (partially applied), `filter_valid_metadata` and
# `get_metadata_for_images_on_page`.
# NOTE(review): presumably `compose` applies right-to-left, i.e. the page metadata is fetched
# first and `list` materialises the result last -- confirm against the `compose` in use.
# NOTE(review): this is a diff rendering without +/- markers, so both the `partial(..., doc, page)`
# line and the `partial(..., doc)` line appear; only one of them belongs to the current code.
def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page):
return compose(
list,
partial(add_alpha_channel_info, doc, page),
partial(add_alpha_channel_info, doc),
filter_valid_metadata,
get_metadata_for_images_on_page,
)(page)
@ -115,18 +115,18 @@ def filter_valid_metadata(metadata):
# and giving the user the ability to reclassify false positives with a separate call.
# filter_out_page_sized_images,
filter_out_tiny_images,
filter_invalid_metadata,
filter_out_invalid_metadata,
)(metadata)
# Generator that yields from `keep(<inner helper>, metadata)`. The inner helper returns
# `validate_box(box)`; when `validate_box` raises `InvalidBox` the reason is logged at debug
# level and the helper falls through, implicitly returning None.
# NOTE(review): presumably `keep` drops falsy helper results, which is what removes the
# invalid entries -- confirm against the `keep` implementation in use.
# NOTE(review): this is a diff rendering without +/- markers, so two spellings are interleaved:
# `filter_invalid_metadata` / `invalid_box_filter` / `e` and
# `filter_out_invalid_metadata` / `__validate_box` / `err`. Only one set is current.
def filter_invalid_metadata(metadata):
def invalid_box_filter(box):
def filter_out_invalid_metadata(metadata):
def __validate_box(box):
try:
return validate_box(box)
except InvalidBox as e:
logger.debug(f"Dropping invalid metadatum, reason: {e}")
except InvalidBox as err:
logger.debug(f"Dropping invalid metadatum, reason: {err}")
yield from keep(invalid_box_filter, metadata)
yield from keep(__validate_box, metadata)
def filter_out_page_sized_images(metadata):
@ -143,18 +143,17 @@ def get_image_infos(page: fitz.Page) -> List[dict]:
# Builds an image for the object `xref` of `doc`: a `fitz.Pixmap` is created, its `samples`
# buffer is viewed as a uint8 array reshaped to (pixmap.h, pixmap.w, pixmap.n), an array whose
# last axis has length 1 is reduced to 2-D, and the array is handed to `Image.fromarray`.
# When `ValueError` is caught, the xref is logged at debug level and None is returned.
# Results are cached per (doc, xref) without a size limit via `lru_cache(maxsize=None)`.
# NOTE(review): this is a diff rendering without +/- markers. Both return annotations
# (`Image` and `Union[Image.Image, None]`) and both placements of the array-conversion lines
# (directly after the Pixmap call, and after the `except` clause) are shown; only one
# arrangement is current -- confirm which before relying on the exact scope of the `try`.
@lru_cache(maxsize=None)
def xref_to_image(doc, xref) -> Image:
def xref_to_image(doc, xref) -> Union[Image.Image, None]:
# NOTE: image extraction is done via pixmap to array, as this method is twice as fast as extraction via bytestream
try:
pixmap = fitz.Pixmap(doc, xref)
array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
# TODO: Find a better solution: PIL.Image.fromarray doesn't take grayscale images of the shape (h, w, 1) but (h, w)
array = array[:, :, 0] if array.shape[2] == 1 else array
return Image.fromarray(array)
except ValueError:
# FIXME: Invalid xrefs occur here, this shouldn't be the case.
logger.debug(f"Xref {xref} is invalid, skipping extraction ...")
return
array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
# TODO: Find a better solution: PIL.Image.fromarray doesn't take grayscale images of the shape (h, w, 1) but (h, w)
array = array[:, :, 0] if array.shape[2] == 1 else array
return Image.fromarray(array)
def get_image_metadata(image_info):
@ -176,29 +175,19 @@ def get_image_metadata(image_info):
}
def validate_coords_and_passthrough(metadata):
    """Lazily apply ``validate_box_coords`` to every item of *metadata*.

    Yields whatever ``validate_box_coords`` returns for each item, in input
    order; nothing is evaluated until the generator is consumed.
    """
    for metadatum in metadata:
        yield validate_box_coords(metadatum)
def validate_size_and_passthrough(metadata):
    """Lazily apply ``validate_box_size`` to every item of *metadata*.

    Yields whatever ``validate_box_size`` returns for each item, in input
    order; nothing is evaluated until the generator is consumed.
    """
    for metadatum in metadata:
        yield validate_box_size(metadatum)
def add_page_metadata(page, metadata):
    """Lazily merge the metadata of *page* with every item of *metadata*.

    ``get_page_metadata(page)`` is computed once, when iteration starts, and
    passed as the first argument to ``merge`` together with each item.
    """
    page_metadata = get_page_metadata(page)
    for metadatum in metadata:
        yield merge(page_metadata, metadatum)
# Generator that attaches an `Info.ALPHA` entry to each element of `metadata`, using
# `has_alpha_channel(doc, xref)` as the source of the alpha value.
# NOTE(review): this is a diff rendering without +/- markers; two implementations with
# different signatures are interleaved below and only one is current:
#   * (doc, page, metadata): appears to derive the xrefs from `get_image_infos(page)`, build one
#     {Info.ALPHA: ...} dict per image and merge it pairwise with `metadata` via `zip`/`starmap`;
#   * (doc, metadata): appears to read each metadatum's own `Info.XREF`, look up its alpha value
#     and yield `{**metadatum, Info.ALPHA: alpha}`.
# NOTE(review): the descriptions above assume `compose` applies right-to-left and that `lift`
# maps a function over an iterable -- neither helper is visible here, confirm before relying on it.
def add_alpha_channel_info(doc, page, metadata):
def add_alpha_channel_info(doc, metadata):
def add_alpha_value_to_metadatum(metadatum):
alpha = metadatum_to_alpha_value(metadatum)
return {**metadatum, Info.ALPHA: alpha}
page_to_xrefs = compose(curry(pluck)("xref"), get_image_infos)
xref_to_alpha = partial(has_alpha_channel, doc)
page_to_alpha_value_per_image = compose(lift(xref_to_alpha), page_to_xrefs)
alpha_to_dict = compose(dict, lambda a: [(Info.ALPHA, a)])
page_to_alpha_mapping_per_image = compose(lift(alpha_to_dict), page_to_alpha_value_per_image)
metadatum_to_alpha_value = compose(xref_to_alpha, itemgetter(Info.XREF))
metadata = starmap(merge, zip(metadata, page_to_alpha_mapping_per_image(page)))
yield from metadata
yield from map(add_alpha_value_to_metadatum, metadata)
@lru_cache(maxsize=None)