diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index 0199d49..970bc03 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -5,15 +5,14 @@ import traceback from functools import partial, lru_cache from itertools import chain, starmap, filterfalse from operator import itemgetter, truth -from typing import Iterable, Iterator +from typing import Iterable, Iterator, List import fitz from PIL import Image -from funcy import merge, pluck, curry, compose +from funcy import merge, pluck, curry, compose, rcompose from image_prediction.formatter.formatters.enum import EnumFormatter from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair -from image_prediction.image_extractor.extractors.utils import get_image_infos, get_image_metadata, add_page_metadata from image_prediction.image_extractor.filters import filter_scanned_pages from image_prediction.info import Info from image_prediction.stitching.stitching import stitch_pairs @@ -47,7 +46,8 @@ class ParsablePDFImageExtractor(ImageExtractor): yield from image_metadata_pairs # FIXME: Heuristic filtering shouldn't take place here, - # consider introducing a preprocessing step before extracting images. + # consider introducing a preprocessing step before extracting images, + # e.g. together with an image validation step for broken images. 
@filter_scanned_pages def __process_images_on_page(self, page: fitz.fitz.Page): images = get_images_on_page(self.doc, page) @@ -109,12 +109,34 @@ def get_metadata_for_images_on_page(doc, page: fitz.Page): yield from metadata +@lru_cache(maxsize=None) +def get_image_infos(page: fitz.Page) -> List[dict]: + return page.get_image_info(xrefs=True) + + @lru_cache(maxsize=None) def xref_to_image(doc, xref) -> Image: maybe_image = load_image_handle_from_xref(doc, xref) return Image.open(io.BytesIO(maybe_image["image"])) if maybe_image else None +def get_image_metadata(image_info): + + x1, y1, x2, y2 = map(rounder, image_info["bbox"]) + + width = abs(x2 - x1) + height = abs(y2 - y1) + + return { + Info.WIDTH: width, + Info.HEIGHT: height, + Info.X1: x1, + Info.X2: x2, + Info.Y1: y1, + Info.Y2: y2, + } + + def validate_coords_and_passthrough(metadata): yield from map(validate_box_coords, metadata) @@ -127,6 +149,10 @@ def validate_size_and_passthrough(metadata): yield from map(validate_box_size, metadata) +def add_page_metadata(page, metadata): + yield from map(partial(merge, get_page_metadata(page)), metadata) + + def add_alpha_channel_info(doc, page, metadata): page_to_xrefs = compose(curry(pluck)("xref"), get_image_infos) @@ -145,6 +171,19 @@ def load_image_handle_from_xref(doc, xref): return doc.extract_image(xref) +rounder = rcompose(round, int) + + +def get_page_metadata(page): + page_width, page_height = map(rounder, page.mediabox_size) + + return { + Info.PAGE_WIDTH: page_width, + Info.PAGE_HEIGHT: page_height, + Info.PAGE_IDX: page.number, + } + + def has_alpha_channel(doc, xref): maybe_image = load_image_handle_from_xref(doc, xref) diff --git a/image_prediction/image_extractor/filters.py b/image_prediction/image_extractor/filters.py new file mode 100644 index 0000000..d3f59b1 --- /dev/null +++ b/image_prediction/image_extractor/filters.py @@ -0,0 +1,74 @@ +from _operator import itemgetter +from typing import Callable, List + +import fitz +from funcy import first, 
compose, lmap + +from image_prediction.config import CONFIG +from image_prediction.image_extractor.extractor import ImageExtractor +from image_prediction.transformer.transformers.response import compute_geometric_quotient +from image_prediction.utils import get_logger + +logger = get_logger() + + +def filter_scanned_pages(page_processor: Callable): + """Decorator for the __process_images_on_page method of the ParsablePDFImageExtractor. + This makes it so that scanned pages won't be processed (and are thus ultimately removed from the pipeline). + A scanned page is defined by + - having only one image on a page + - that image having an image_to_page ratio greater than the allowed max value + found in the CONFIG.filters.image_to_page_quotient.max""" + + def inner(self: ImageExtractor, page: fitz.fitz.Page): + metadata = get_metadata(page) + if is_a_scanned_page(metadata): + logger.debug(f"Page {page.number} won't be processed since it is a scanned page.") + yield from [] + else: + yield from page_processor(self, page) + + logger.info(f"Extracting pages with filtering scanned pages...") + return inner + + +def is_a_scanned_page(metadata: List[dict]): + return _contains_only_one_image(metadata) and _breaches_image_to_page_quotient(first(metadata)) + + +def _breaches_image_to_page_quotient(metadata): + page_width, page_height, x1, x2, y1, y2, width, height = itemgetter( + "page_width", "page_height", "x1", "x2", "y1", "y2", "width", "height" + )(metadata) + geometric_quotient = round(compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1), 4) + quotient_breached = bool(geometric_quotient > CONFIG.filters.image_to_page_quotient.max) + return quotient_breached + + +def _contains_only_one_image(metadata): + return True if len(metadata) == 1 else False + + +def get_metadata(page: fitz.fitz.Page): + def get_image_metadata(image_info): + x1, y1, x2, y2 = map(compose(round, int), image_info["bbox"]) + width = abs(x2 - x1) + height = abs(y2 - y1) + return { 
"page_width": page_width, + "page_height": page_height, + "page_number": page_number, + "width": width, + "height": height, + "x1": x1, + "x2": x2, + "y1": y1, + "y2": y2, + } + + image_infos = page.get_image_info(xrefs=True) + page_number = page.number + page_width, page_height = page.mediabox_size + + metadata = lmap(get_image_metadata, image_infos) + return metadata diff --git a/test/unit_tests/image_extractor_test.py b/test/unit_tests/image_extractor_test.py index c7e1fc2..8e6916c 100644 --- a/test/unit_tests/image_extractor_test.py +++ b/test/unit_tests/image_extractor_test.py @@ -9,8 +9,7 @@ from funcy import first, rest from image_prediction.extraction import extract_images_from_pdf from image_prediction.image_extractor.extractor import ImageMetadataPair -from image_prediction.image_extractor.extractors.parsable import extract_pages, has_alpha_channel -from image_prediction.image_extractor.extractors.utils import get_image_infos +from image_prediction.image_extractor.extractors.parsable import extract_pages, has_alpha_channel, get_image_infos from image_prediction.info import Info from test.utils.comparison import metadata_equal, image_sets_equal from test.utils.generation.pdf import add_image, pdf_stream