diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py
index 0af1f10..8cf6ff1 100644
--- a/image_prediction/image_extractor/extractors/parsable.py
+++ b/image_prediction/image_extractor/extractors/parsable.py
@@ -15,9 +15,6 @@ from funcy import merge, pluck, curry, compose, rcompose, remove
 from image_prediction.config import CONFIG
 from image_prediction.formatter.formatters.enum import EnumFormatter
 from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
-from image_prediction.image_extractor.filters import (
-    filter_metadata_for_scanned_pages,
-)
 from image_prediction.info import Info
 from image_prediction.stitching.stitching import stitch_pairs
 from image_prediction.stitching.utils import validate_box_coords, validate_box_size
@@ -51,12 +48,10 @@ class ParsablePDFImageExtractor(ImageExtractor):
         yield from image_metadata_pairs
 
     def __process_images_on_page(self, page: fitz.fitz.Page):
-        metadata = list(get_metadata_for_images_on_page(page))
-        metadata = list(filter_out_page_sized_images(metadata))
-        metadata = list(filter_out_tiny_images(metadata))
-        metadata = list(filter_invalid_metadata(metadata))
+        metadata = get_metadata_for_images_on_page(page)
+        metadata = the_great_filter(metadata)
 
-        metadata = add_alpha_channel_info(self.doc, page, metadata)
+        metadata = list(add_alpha_channel_info(self.doc, page, metadata))
 
         images = get_images_on_page(self.doc, metadata)
 
@@ -109,12 +104,27 @@ def get_metadata_for_images_on_page(page: fitz.Page):
     yield from metadata
 
 
+def the_great_filter(metadata):
+    return compose(
+        list,
+        filter_out_page_sized_images,
+        list,
+        filter_out_tiny_images,
+        list,
+        filter_invalid_metadata,
+    )(metadata)
+
+
 def filter_invalid_metadata(metadata):
     return compose(validate_size_and_passthrough, validate_coords_and_passthrough)(metadata)
 
 
 def filter_out_page_sized_images(metadata):
-    yield from remove(__breaches_image_to_page_quotient, metadata)
+    yield from remove(breaches_image_to_page_quotient, metadata)
+
+
+def filter_out_tiny_images(metadata):
+    yield from filterfalse(tiny, metadata)
 
 
 @lru_cache(maxsize=None)
@@ -151,10 +161,6 @@ def validate_coords_and_passthrough(metadata):
     yield from map(validate_box_coords, metadata)
 
 
-def filter_out_tiny_images(metadata):
-    yield from filterfalse(tiny, metadata)
-
-
 def validate_size_and_passthrough(metadata):
     yield from map(validate_box_size, metadata)
 
@@ -222,7 +228,7 @@ def clear_caches():
 atexit.register(clear_caches)
 
 
-def __breaches_image_to_page_quotient(metadatum):
+def breaches_image_to_page_quotient(metadatum):
     page_width, page_height, x1, x2, y1, y2, width, height = itemgetter(
         Info.PAGE_WIDTH, Info.PAGE_HEIGHT, Info.X1, Info.X2, Info.Y1, Info.Y2, Info.WIDTH, Info.HEIGHT
     )(metadatum)
diff --git a/image_prediction/image_extractor/filters.py b/image_prediction/image_extractor/filters.py
deleted file mode 100644
index 13053de..0000000
--- a/image_prediction/image_extractor/filters.py
+++ /dev/null
@@ -1,23 +0,0 @@
-from typing import List
-
-from funcy import first, second
-
-from image_prediction.image_extractor.extractors.parsable import __breaches_image_to_page_quotient
-from image_prediction.info import Info
-from image_prediction.utils import get_logger
-
-logger = get_logger()
-
-
-def filter_metadata_for_scanned_pages(metadata: List[dict]):
-    assert isinstance(metadata, list)
-    if is_metadata_of_a_scanned_page(metadata):
-        logger.info(f"Page {first(metadata)[Info.PAGE_IDX]} is a scanned page; skipping image extraction.")
-        return []
-    return metadata
-
-
-def is_metadata_of_a_scanned_page(metadata):
-    return first(map(__breaches_image_to_page_quotient, metadata)) and not second(metadata)
-
-