diff --git a/image_prediction/image_extractor/filters.py b/image_prediction/image_extractor/filters.py index d6cb2f6..720ead0 100644 --- a/image_prediction/image_extractor/filters.py +++ b/image_prediction/image_extractor/filters.py @@ -1,11 +1,12 @@ from _operator import itemgetter -from typing import Callable, List +from typing import Callable import fitz -from funcy import first, compose, lmap +from funcy import first, lmap, second from image_prediction.config import CONFIG from image_prediction.image_extractor.extractor import ImageExtractor +from image_prediction.info import Info from image_prediction.transformer.transformers.response import compute_geometric_quotient from image_prediction.utils import get_logger @@ -21,10 +22,8 @@ def filter_scanned_pages(page_processor: Callable): defined in CONFIG.filters.image_to_page_quotient.max""" def inner(extractor: ImageExtractor, page: fitz.fitz.Page): - metadata = get_metadata(page) - if is_a_scanned_page(metadata): - logger.debug(f"Page {page.number} won't be processed since it is a scanned page.") - yield from [] + if is_a_scanned_page(page): + logger.info(f"Page {page.number} is a scanned page; skipping image extraction.") else: yield from page_processor(extractor, page) @@ -32,43 +31,29 @@ def filter_scanned_pages(page_processor: Callable): return inner -def is_a_scanned_page(metadata: List[dict]): - return _contains_only_one_image(metadata) and _breaches_image_to_page_quotient(first(metadata)) +def is_a_scanned_page(page: fitz.fitz.Page): + metadata = get_metadata_for_images_on_page(page) + return first(map(__breaches_image_to_page_quotient, metadata)) and not second(metadata) -def _breaches_image_to_page_quotient(metadata): +def __breaches_image_to_page_quotient(metadatum): page_width, page_height, x1, x2, y1, y2, width, height = itemgetter( - "page_width", "page_height", "x1", "x2", "y1", "y2", "width", "height" - )(metadata) - geometric_quotient = round(compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1), 4) + Info.PAGE_WIDTH, Info.PAGE_HEIGHT, Info.X1, Info.X2, Info.Y1, Info.Y2, Info.WIDTH, Info.HEIGHT + )(metadatum) + geometric_quotient = compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1) quotient_breached = bool(geometric_quotient > CONFIG.filters.image_to_page_quotient.max) return quotient_breached -def _contains_only_one_image(metadata): - return True if len(metadata) == 1 else False - - -def get_metadata(page: fitz.fitz.Page): - def get_image_metadata(image_info): - x1, y1, x2, y2 = map(compose(round, int), image_info["bbox"]) - width = abs(x2 - x1) - height = abs(y2 - y1) - return { - "page_width": page_width, - "page_height": page_height, - "page_number": page_number, - "width": width, - "height": height, - "x1": x1, - "x2": x2, - "y1": y1, - "y2": y2, - } +def get_metadata_for_images_on_page(page: fitz.fitz.Page): + """Effectively the same as image_prediction.image_extractor.extractors.parsable.get_metadata_for_images_on_page, + however without the validation steps since not required here and take a significant amount of time. + """ + # temporary solution to avoid circular imports without changing the original code + from image_prediction.image_extractor.extractors.parsable import get_image_metadata, add_page_metadata image_infos = page.get_image_info(xrefs=True) - page_number = page.number - page_width, page_height = page.mediabox_size - metadata = lmap(get_image_metadata, image_infos) + metadata = add_page_metadata(page, metadata) + return metadata