From e63f66a126478d439f2efb7a20d9735248d7e0a6 Mon Sep 17 00:00:00 2001 From: Matthias Bisping Date: Mon, 6 Feb 2023 10:46:54 +0100 Subject: [PATCH] Refactoring - Rename metadata -> metadatum in some more places to make it clear that it is the metadata of a single image in that context - Re-order function definitions according to caller hierarchy --- .../image_extractor/extractors/parsable.py | 192 ++++++++---------- 1 file changed, 86 insertions(+), 106 deletions(-) diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index 3f3f21e..97b908e 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -4,22 +4,20 @@ import traceback from _operator import itemgetter from functools import partial, lru_cache from itertools import chain, starmap, filterfalse -from operator import itemgetter, truth +from operator import itemgetter from typing import Iterable, Iterator, List, Union import fitz import numpy as np from PIL import Image -from funcy import merge, pluck, compose, rcompose, remove, keep +from funcy import merge, compose, rcompose, keep -from image_prediction.config import CONFIG from image_prediction.exceptions import InvalidBox from image_prediction.formatter.formatters.enum import EnumFormatter from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair from image_prediction.info import Info from image_prediction.stitching.stitching import stitch_pairs from image_prediction.stitching.utils import validate_box -from image_prediction.transformer.transformers.response import compute_geometric_quotient from image_prediction.utils import get_logger logger = get_logger() @@ -34,7 +32,7 @@ class ParsablePDFImageExtractor(ImageExtractor): tolerance: The tolerance in pixels for the distance between images, beyond which they will not be stitched together """ - self.doc: fitz.fitz.Document = None + self.doc: Union[fitz.fitz.Document, None] = None self.verbose = verbose self.tolerance = tolerance @@ -49,11 +47,9 @@ class ParsablePDFImageExtractor(ImageExtractor): def __process_images_on_page(self, page: fitz.fitz.Page): metadata = extract_valid_metadata(self.doc, page) - images = get_images_on_page(self.doc, metadata) - + image_metadata_pairs = map(partial(metadatum_to_image_metadata_pair, self.doc), metadata) clear_caches() - image_metadata_pairs = starmap(ImageMetadataPair, filter(all, zip(images, metadata))) # TODO: In the future, consider to introduce an image validator as a pipeline component rather than doing the # validation here. Invalid images can then be split into a different stream and joined with the intact images # again for the formatting step. @@ -64,17 +60,17 @@ class ParsablePDFImageExtractor(ImageExtractor): @staticmethod def __filter_valid_images(image_metadata_pairs: Iterable[ImageMetadataPair]) -> Iterator[ImageMetadataPair]: - def validate(image: Image.Image, metadata: dict): + def validate(image: Image.Image, metadatum: dict): try: # TODO: stand-in heuristic for testing if image is valid => find cleaner solution (RED-5148) image.resize((100, 100)).convert("RGB") - return ImageMetadataPair(image, metadata) - except (OSError, Exception) as err: - metadata = json.dumps(EnumFormatter()(metadata), indent=2) - logger.warning(f"Invalid image encountered. Image metadata:\n{metadata}\n\n{traceback.format_exc()}") + return ImageMetadataPair(image, metadatum) + except (OSError, Exception): + metadatum = json.dumps(EnumFormatter()(metadatum), indent=2) + logger.warning(f"Invalid image encountered. Image metadata:\n{metadatum}\n\n{traceback.format_exc()}") return None - return filter(truth, starmap(validate, image_metadata_pairs)) + return keep(starmap(validate, image_metadata_pairs)) def extract_pages(doc, page_range): @@ -84,13 +80,6 @@ def extract_pages(doc, page_range): yield from pages -def get_images_on_page(doc, metadata): - xrefs = pluck(Info.XREF, metadata) - images = map(partial(xref_to_image, doc), xrefs) - - yield from images - - def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page): return compose( list, @@ -100,6 +89,26 @@ def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page): )(page) +def metadatum_to_image_metadata_pair(doc, metadatum: dict): + maybe_image = xref_to_maybe_image(doc, metadatum[Info.XREF]) + return make_maybe_image_metadata_pair(maybe_image, metadatum) + + +def add_alpha_channel_info(doc, metadata): + def add_alpha_value_to_metadatum(metadatum): + alpha = metadatum_to_alpha_value(metadatum) + return {**metadatum, Info.ALPHA: alpha} + + xref_to_alpha = partial(has_alpha_channel, doc) + metadatum_to_alpha_value = compose(xref_to_alpha, itemgetter(Info.XREF)) + + yield from map(add_alpha_value_to_metadatum, metadata) + + +def filter_valid_metadata(metadata): + yield from compose(filter_out_tiny_images, filter_out_invalid_metadata)(metadata) + + def get_metadata_for_images_on_page(page: fitz.Page): metadata = map(get_image_metadata, get_image_infos(page)) metadata = add_page_metadata(page, metadata) @@ -107,15 +116,42 @@ def get_metadata_for_images_on_page(page: fitz.Page): yield from metadata -def filter_valid_metadata(metadata): - yield from compose( - # TODO: Disabled for now, since atm since the backend needs atm the metadata and the hash of every image, even - # scanned pages. In the future, this should be resolved differently, e.g. by filtering all page-sized images - # and giving the user the ability to reclassify false positives with a separate call. - # filter_out_page_sized_images, - filter_out_tiny_images, - filter_out_invalid_metadata, - )(metadata) +@lru_cache(maxsize=None) +def xref_to_maybe_image(doc, xref) -> Union[Image.Image, None]: + def extrac_image(xref): + pixmap = fitz.Pixmap(doc, xref) + array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n) + array = normalize_channels(array) + return Image.fromarray(array) + + try: + return extrac_image(xref) + except ValueError: + logger.debug(f"Xref {xref} is invalid, skipping extraction ...") + return + + +def make_maybe_image_metadata_pair(image, metadata): + return ImageMetadataPair(image, metadata) if image and metadata else None + + +def has_alpha_channel(doc, xref): + + maybe_image = load_image_handle_from_xref(doc, xref) + maybe_smask = maybe_image["smask"] if maybe_image else None + + if maybe_smask: + return any([doc.extract_image(maybe_smask) is not None, bool(fitz.Pixmap(doc, maybe_smask).alpha)]) + else: + try: + return bool(fitz.Pixmap(doc, xref).alpha) + except ValueError: + logger.debug(f"Encountered invalid xref `{xref}` in {doc.metadata.get('title', '')}.") + return False + + +def filter_out_tiny_images(metadata): + yield from filterfalse(tiny, metadata) def filter_out_invalid_metadata(metadata): @@ -128,47 +164,6 @@ def filter_out_invalid_metadata(metadata): yield from keep(__validate_box, metadata) -def filter_out_page_sized_images(metadata): - yield from remove(breaches_image_to_page_quotient, metadata) - - -def filter_out_tiny_images(metadata): - yield from filterfalse(tiny, metadata) - - -@lru_cache(maxsize=None) -def get_image_infos(page: fitz.Page) -> List[dict]: - return page.get_image_info(xrefs=True) - - -@lru_cache(maxsize=None) -def xref_to_image(doc, xref) -> Union[Image.Image, None]: - # NOTE: image extraction is done via pixmap to array, as this method is twice as fast as extraction via bytestream - try: - pixmap = fitz.Pixmap(doc, xref) - array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n) - array = normalize_channels(array) - return Image.fromarray(array) - except ValueError: - logger.debug(f"Xref {xref} is invalid, skipping extraction ...") - return - - -def normalize_channels(array: np.ndarray): - if not array.ndim == 3: - array = np.expand_dims(array, axis=-1) - - if array.shape[-1] == 4: - array = array[..., :3] - elif array.shape[-1] == 1: - array = np.concatenate([array, array, array], axis=-1) - elif array.shape[-1] != 3: - logger.warning(f"Unexpected image format: {array.shape}.") - raise ValueError(f"Unexpected image format: {array.shape}.") - - return array - - def get_image_metadata(image_info): xref, coords = itemgetter("xref", "bbox")(image_info) @@ -188,19 +183,28 @@ def get_image_metadata(image_info): } +@lru_cache(maxsize=None) +def get_image_infos(page: fitz.Page) -> List[dict]: + return page.get_image_info(xrefs=True) + + def add_page_metadata(page, metadata): yield from map(partial(merge, get_page_metadata(page)), metadata) -def add_alpha_channel_info(doc, metadata): - def add_alpha_value_to_metadatum(metadatum): - alpha = metadatum_to_alpha_value(metadatum) - return {**metadatum, Info.ALPHA: alpha} +def normalize_channels(array: np.ndarray): + if not array.ndim == 3: + array = np.expand_dims(array, axis=-1) - xref_to_alpha = partial(has_alpha_channel, doc) - metadatum_to_alpha_value = compose(xref_to_alpha, itemgetter(Info.XREF)) + if array.shape[-1] == 4: + array = array[..., :3] + elif array.shape[-1] == 1: + array = np.concatenate([array, array, array], axis=-1) + elif array.shape[-1] != 3: + logger.warning(f"Unexpected image format: {array.shape}.") + raise ValueError(f"Unexpected image format: {array.shape}.") - yield from map(add_alpha_value_to_metadatum, metadata) + return array @lru_cache(maxsize=None) @@ -208,9 +212,6 @@ def load_image_handle_from_xref(doc, xref): return doc.extract_image(xref) -rounder = rcompose(round, int) - - def get_page_metadata(page): page_width, page_height = map(rounder, page.mediabox_size) @@ -221,38 +222,17 @@ def get_page_metadata(page): } -def has_alpha_channel(doc, xref): - - maybe_image = load_image_handle_from_xref(doc, xref) - maybe_smask = maybe_image["smask"] if maybe_image else None - - if maybe_smask: - return any([doc.extract_image(maybe_smask) is not None, bool(fitz.Pixmap(doc, maybe_smask).alpha)]) - else: - try: - return bool(fitz.Pixmap(doc, xref).alpha) - except ValueError: - logger.debug(f"Encountered invalid xref `{xref}` in {doc.metadata.get('title', '')}.") - return False +rounder = rcompose(round, int) -def tiny(metadata): - return metadata[Info.WIDTH] * metadata[Info.HEIGHT] <= 4 +def tiny(metadatum): + return metadatum[Info.WIDTH] * metadatum[Info.HEIGHT] <= 4 def clear_caches(): get_image_infos.cache_clear() load_image_handle_from_xref.cache_clear() - xref_to_image.cache_clear() + xref_to_maybe_image.cache_clear() atexit.register(clear_caches) - - -def breaches_image_to_page_quotient(metadatum): - page_width, page_height, x1, x2, y1, y2, width, height = itemgetter( - Info.PAGE_WIDTH, Info.PAGE_HEIGHT, Info.X1, Info.X2, Info.Y1, Info.Y2, Info.WIDTH, Info.HEIGHT - )(metadatum) - geometric_quotient = compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1) - quotient_breached = bool(geometric_quotient > CONFIG.filters.image_to_page_quotient.max) - return quotient_breached