refactoring

2023-02-03 11:14:14 +01:00 · 2023-02-03 11:14:14 +01:00 · 2995d5ee48
commit 2995d5ee48
parent eff1bb4124
1 changed file with 20 additions and 31 deletions
--- a/image_prediction/image_extractor/extractors/parsable.py
+++ b/image_prediction/image_extractor/extractors/parsable.py
@@ -5,7 +5,7 @@ from _operator import itemgetter
 from functools import partial, lru_cache
 from itertools import chain, starmap, filterfalse
 from operator import itemgetter, truth
-from typing import Iterable, Iterator, List
+from typing import Iterable, Iterator, List, Union
 import fitz
 import numpy as np
@@ -18,7 +18,7 @@ from image_prediction.formatter.formatters.enum import EnumFormatter
 from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
 from image_prediction.info import Info
 from image_prediction.stitching.stitching import stitch_pairs
-from image_prediction.stitching.utils import validate_box_coords, validate_box_size, validate_box
+from image_prediction.stitching.utils import validate_box
 from image_prediction.transformer.transformers.response import compute_geometric_quotient
 from image_prediction.utils import get_logger
 from image_prediction.utils.generic import lift
@@ -95,7 +95,7 @@ def get_images_on_page(doc, metadata):
 def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page):
    return compose(
        list,
-        partial(add_alpha_channel_info, doc, page),
+        partial(add_alpha_channel_info, doc),
        filter_valid_metadata,
        get_metadata_for_images_on_page,
    )(page)
@@ -115,18 +115,18 @@ def filter_valid_metadata(metadata):
        #  and giving the user the ability to reclassify false positives with a separate call.
        # filter_out_page_sized_images,
        filter_out_tiny_images,
-        filter_invalid_metadata,
+        filter_out_invalid_metadata,
    )(metadata)
-def filter_invalid_metadata(metadata):
+def filter_out_invalid_metadata(metadata):
-    def invalid_box_filter(box):
+    def __validate_box(box):
        try:
            return validate_box(box)
-        except InvalidBox as e:
+        except InvalidBox as err:
-            logger.debug(f"Dropping invalid metadatum, reason: {e}")
+            logger.debug(f"Dropping invalid metadatum, reason: {err}")
-    yield from keep(invalid_box_filter, metadata)
+    yield from keep(__validate_box, metadata)
 def filter_out_page_sized_images(metadata):
@@ -143,18 +143,17 @@ def get_image_infos(page: fitz.Page) -> List[dict]:
@lru_cache(maxsize=None)
-def xref_to_image(doc, xref) -> Image:
+def xref_to_image(doc, xref) -> Union[Image.Image, None]:
    # NOTE: image extraction is done via pixmap to array, as this method is twice as fast as extraction via bytestream
    try:
        pixmap = fitz.Pixmap(doc, xref)
+        array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
+        # TODO: Find a better solution: PIL.Image.fromarray doesn't take grayscale images of the shape (h, w, 1) but (h, w)
+        array = array[:, :, 0] if array.shape[2] == 1 else array
+        return Image.fromarray(array)
    except ValueError:
        # FIXME: Invalid xrefs occur here, this shouldn't be the case.
        logger.debug(f"Xref {xref} is invalid, skipping extraction ...")
        return
-    array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
-    # TODO: Find a better solution: PIL.Image.fromarray doesn't take grayscale images of the shape (h, w, 1) but (h, w)
-    array = array[:, :, 0] if array.shape[2] == 1 else array
-    return Image.fromarray(array)
 def get_image_metadata(image_info):
@@ -176,29 +175,19 @@ def get_image_metadata(image_info):
    }
-def validate_coords_and_passthrough(metadata):
-    yield from map(validate_box_coords, metadata)
-def validate_size_and_passthrough(metadata):
-    yield from map(validate_box_size, metadata)
 def add_page_metadata(page, metadata):
    yield from map(partial(merge, get_page_metadata(page)), metadata)
-def add_alpha_channel_info(doc, page, metadata):
+def add_alpha_channel_info(doc, metadata):
+    def add_alpha_value_to_metadatum(metadatum):
+        alpha = metadatum_to_alpha_value(metadatum)
+        return {**metadatum, Info.ALPHA: alpha}
-    page_to_xrefs = compose(curry(pluck)("xref"), get_image_infos)
    xref_to_alpha = partial(has_alpha_channel, doc)
-    page_to_alpha_value_per_image = compose(lift(xref_to_alpha), page_to_xrefs)
+    metadatum_to_alpha_value = compose(xref_to_alpha, itemgetter(Info.XREF))
-    alpha_to_dict = compose(dict, lambda a: [(Info.ALPHA, a)])
-    page_to_alpha_mapping_per_image = compose(lift(alpha_to_dict), page_to_alpha_value_per_image)
-    metadata = starmap(merge, zip(metadata, page_to_alpha_mapping_per_image(page)))
+    yield from map(add_alpha_value_to_metadatum, metadata)
-    yield from metadata
@lru_cache(maxsize=None)