From 2995d5ee4827f5233cced0d469efc764407e60ea Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Fri, 3 Feb 2023 11:14:14 +0100 Subject: [PATCH] refactoring --- .../image_extractor/extractors/parsable.py | 51 ++++++++----------- 1 file changed, 20 insertions(+), 31 deletions(-) diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index 303021b..75c98f2 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -5,7 +5,7 @@ from _operator import itemgetter from functools import partial, lru_cache from itertools import chain, starmap, filterfalse from operator import itemgetter, truth -from typing import Iterable, Iterator, List +from typing import Iterable, Iterator, List, Union import fitz import numpy as np @@ -18,7 +18,7 @@ from image_prediction.formatter.formatters.enum import EnumFormatter from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair from image_prediction.info import Info from image_prediction.stitching.stitching import stitch_pairs -from image_prediction.stitching.utils import validate_box_coords, validate_box_size, validate_box +from image_prediction.stitching.utils import validate_box from image_prediction.transformer.transformers.response import compute_geometric_quotient from image_prediction.utils import get_logger from image_prediction.utils.generic import lift @@ -95,7 +95,7 @@ def get_images_on_page(doc, metadata): def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page): return compose( list, - partial(add_alpha_channel_info, doc, page), + partial(add_alpha_channel_info, doc), filter_valid_metadata, get_metadata_for_images_on_page, )(page) @@ -115,18 +115,18 @@ def filter_valid_metadata(metadata): # and giving the user the ability to reclassify false positives with a separate call. # filter_out_page_sized_images, filter_out_tiny_images, - filter_invalid_metadata, + filter_out_invalid_metadata, )(metadata) -def filter_invalid_metadata(metadata): - def invalid_box_filter(box): +def filter_out_invalid_metadata(metadata): + def __validate_box(box): try: return validate_box(box) - except InvalidBox as e: - logger.debug(f"Dropping invalid metadatum, reason: {e}") + except InvalidBox as err: + logger.debug(f"Dropping invalid metadatum, reason: {err}") - yield from keep(invalid_box_filter, metadata) + yield from keep(__validate_box, metadata) def filter_out_page_sized_images(metadata): @@ -143,18 +143,17 @@ def get_image_infos(page: fitz.Page) -> List[dict]: @lru_cache(maxsize=None) -def xref_to_image(doc, xref) -> Image: +def xref_to_image(doc, xref) -> Union[Image.Image, None]: # NOTE: image extraction is done via pixmap to array, as this method is twice as fast as extraction via bytestream try: pixmap = fitz.Pixmap(doc, xref) + array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n) + # TODO: Find a better solution: PIL.Image.fromarray doesn't take grayscale images of the shape (h, w, 1) but (h, w) + array = array[:, :, 0] if array.shape[2] == 1 else array + return Image.fromarray(array) except ValueError: - # FIXME: Invalid xrefs occur here, this shouldn't be the case. logger.debug(f"Xref {xref} is invalid, skipping extraction ...") return - array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n) - # TODO: Find a better solution: PIL.Image.fromarray doesn't take grayscale images of the shape (h, w, 1) but (h, w) - array = array[:, :, 0] if array.shape[2] == 1 else array - return Image.fromarray(array) def get_image_metadata(image_info): @@ -176,29 +175,19 @@ def get_image_metadata(image_info): } -def validate_coords_and_passthrough(metadata): - yield from map(validate_box_coords, metadata) - - -def validate_size_and_passthrough(metadata): - yield from map(validate_box_size, metadata) - - def add_page_metadata(page, metadata): yield from map(partial(merge, get_page_metadata(page)), metadata) -def add_alpha_channel_info(doc, page, metadata): +def add_alpha_channel_info(doc, metadata): + def add_alpha_value_to_metadatum(metadatum): + alpha = metadatum_to_alpha_value(metadatum) + return {**metadatum, Info.ALPHA: alpha} - page_to_xrefs = compose(curry(pluck)("xref"), get_image_infos) xref_to_alpha = partial(has_alpha_channel, doc) - page_to_alpha_value_per_image = compose(lift(xref_to_alpha), page_to_xrefs) - alpha_to_dict = compose(dict, lambda a: [(Info.ALPHA, a)]) - page_to_alpha_mapping_per_image = compose(lift(alpha_to_dict), page_to_alpha_value_per_image) + metadatum_to_alpha_value = compose(xref_to_alpha, itemgetter(Info.XREF)) - metadata = starmap(merge, zip(metadata, page_to_alpha_mapping_per_image(page))) - - yield from metadata + yield from map(add_alpha_value_to_metadatum, metadata) @lru_cache(maxsize=None)