refactoring

This commit is contained in:
Julius Unverfehrt 2023-02-03 11:14:14 +01:00
parent eff1bb4124
commit 2995d5ee48

View File

@@ -5,7 +5,7 @@ from _operator import itemgetter
from functools import partial, lru_cache from functools import partial, lru_cache
from itertools import chain, starmap, filterfalse from itertools import chain, starmap, filterfalse
from operator import itemgetter, truth from operator import itemgetter, truth
from typing import Iterable, Iterator, List from typing import Iterable, Iterator, List, Union
import fitz import fitz
import numpy as np import numpy as np
@@ -18,7 +18,7 @@ from image_prediction.formatter.formatters.enum import EnumFormatter
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
from image_prediction.info import Info from image_prediction.info import Info
from image_prediction.stitching.stitching import stitch_pairs from image_prediction.stitching.stitching import stitch_pairs
from image_prediction.stitching.utils import validate_box_coords, validate_box_size, validate_box from image_prediction.stitching.utils import validate_box
from image_prediction.transformer.transformers.response import compute_geometric_quotient from image_prediction.transformer.transformers.response import compute_geometric_quotient
from image_prediction.utils import get_logger from image_prediction.utils import get_logger
from image_prediction.utils.generic import lift from image_prediction.utils.generic import lift
@@ -95,7 +95,7 @@ def get_images_on_page(doc, metadata):
def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page):
    """Return a list of the valid image metadata found on `page`.

    The pipeline is built with `compose`, which is assumed to apply its
    functions right-to-left (as funcy/toolz do; the import is outside this
    view), so the steps read bottom-up:

    1. `get_metadata_for_images_on_page` collects metadata from the page.
    2. `filter_valid_metadata` drops entries that fail validation.
    3. `add_alpha_channel_info` (bound to `doc`) attaches alpha-channel info.
    4. `list` materializes the resulting iterator.

    Args:
        doc: The document that owns `page`; forwarded to `add_alpha_channel_info`.
        page: The page whose images are inspected.
    """
    return compose(
        list,
        # Only `doc` is bound here: the alpha value is looked up per metadatum,
        # so the page itself is no longer needed by this step.
        partial(add_alpha_channel_info, doc),
        filter_valid_metadata,
        get_metadata_for_images_on_page,
    )(page)
@@ -115,18 +115,18 @@ def filter_valid_metadata(metadata):
# and giving the user the ability to reclassify false positives with a separate call. # and giving the user the ability to reclassify false positives with a separate call.
# filter_out_page_sized_images, # filter_out_page_sized_images,
filter_out_tiny_images, filter_out_tiny_images,
filter_invalid_metadata, filter_out_invalid_metadata,
)(metadata) )(metadata)
def filter_out_invalid_metadata(metadata):
    """Yield validated metadata, dropping entries whose box is invalid.

    Each metadatum is passed to `validate_box`. Entries for which it raises
    `InvalidBox` are logged at debug level and left out of the output.

    NOTE(review): `keep` is assumed to be funcy's `keep`, i.e. it maps the
    function over `metadata` and discards falsy results -- confirm against
    the import at the top of the file.

    Args:
        metadata: Iterable of metadata entries to validate.

    Yields:
        The truthy return values of `validate_box`.
    """

    def __validate_box(box):
        try:
            return validate_box(box)
        except InvalidBox as err:
            logger.debug(f"Dropping invalid metadatum, reason: {err}")
            # Implicitly returns None, which `keep` is expected to filter out.

    yield from keep(__validate_box, metadata)
def filter_out_page_sized_images(metadata): def filter_out_page_sized_images(metadata):
@@ -143,18 +143,17 @@ def get_image_infos(page: fitz.Page) -> List[dict]:
@lru_cache(maxsize=None)
def xref_to_image(doc, xref) -> Union[Image.Image, None]:
    """Extract the image referenced by `xref` from `doc` as a PIL image.

    Results are memoized per `(doc, xref)` pair by the unbounded `lru_cache`.

    Args:
        doc: Document handed to `fitz.Pixmap`.
        xref: Cross-reference number of the image inside `doc`.

    Returns:
        The extracted image, or None when `fitz.Pixmap` raises `ValueError`
        for the given xref.
    """
    # NOTE: image extraction is done via pixmap to array, as this method is twice as fast as extraction via bytestream
    # Only the pixmap construction is guarded, so a ValueError raised by the
    # array conversion below is not mistaken for an invalid xref.
    try:
        pixmap = fitz.Pixmap(doc, xref)
    except ValueError:
        logger.debug(f"Xref {xref} is invalid, skipping extraction ...")
        return None

    array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
    # TODO: Find a better solution: PIL.Image.fromarray doesn't take grayscale images of the shape (h, w, 1) but (h, w)
    array = array[:, :, 0] if array.shape[2] == 1 else array
    return Image.fromarray(array)
def get_image_metadata(image_info): def get_image_metadata(image_info):
@@ -176,29 +175,19 @@ def get_image_metadata(image_info):
} }
def validate_coords_and_passthrough(metadata):
    """Yield `validate_box_coords(metadatum)` for every metadatum in `metadata`.

    NOTE(review): this helper appears only on the removed side of the diff.
    """
    for metadatum in metadata:
        yield validate_box_coords(metadatum)
def validate_size_and_passthrough(metadata):
    """Yield `validate_box_size(metadatum)` for every metadatum in `metadata`.

    NOTE(review): this helper appears only on the removed side of the diff.
    """
    for metadatum in metadata:
        yield validate_box_size(metadatum)
def add_page_metadata(page, metadata):
    """Yield each metadatum merged with the metadata of `page`.

    `get_page_metadata(page)` is evaluated once, when iteration starts, and
    passed as the first argument to `merge` for every metadatum.

    Args:
        page: Page whose metadata is added to every entry.
        metadata: Iterable of per-image metadata entries.
    """
    page_metadata = get_page_metadata(page)
    for metadatum in metadata:
        yield merge(page_metadata, metadatum)
def add_alpha_channel_info(doc, metadata):
    """Yield each metadatum extended with an `Info.ALPHA` entry.

    The alpha value is looked up from the metadatum's own `Info.XREF`, so the
    result does not depend on the order or number of images on the page. The
    earlier approach zipped `metadata` with alpha values computed for every
    image on the page, which pairs entries with the wrong image once
    `metadata` has been filtered.

    Args:
        doc: Document passed as the first argument to `has_alpha_channel`.
        metadata: Iterable of metadata mappings, each holding an `Info.XREF` key.

    Yields:
        A new dict per metadatum: the original items plus `Info.ALPHA`.
    """

    def add_alpha_value_to_metadatum(metadatum):
        alpha = metadatum_to_alpha_value(metadatum)
        # Build a new dict instead of mutating the caller's metadatum.
        return {**metadatum, Info.ALPHA: alpha}

    xref_to_alpha = partial(has_alpha_channel, doc)
    # Assumes `compose` applies right-to-left: xref lookup first, then alpha check.
    metadatum_to_alpha_value = compose(xref_to_alpha, itemgetter(Info.XREF))

    yield from map(add_alpha_value_to_metadatum, metadata)
@lru_cache(maxsize=None) @lru_cache(maxsize=None)