diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index f6bbc82..4f80723 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -9,11 +9,14 @@ from typing import Iterable, Iterator, List import fitz from PIL import Image -from funcy import merge, pluck, curry, compose, rcompose +from funcy import merge, pluck, curry, compose, rcompose, remove from image_prediction.formatter.formatters.enum import EnumFormatter from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair -from image_prediction.image_extractor.filters import filter_metadata_for_scanned_pages +from image_prediction.image_extractor.filters import ( + filter_metadata_for_scanned_pages, + __breaches_image_to_page_quotient, +) from image_prediction.info import Info from image_prediction.stitching.stitching import stitch_pairs from image_prediction.stitching.utils import validate_box_coords, validate_box_size @@ -47,7 +50,7 @@ class ParsablePDFImageExtractor(ImageExtractor): def __process_images_on_page(self, page: fitz.fitz.Page): metadata = list(get_metadata_for_images_on_page(page)) - metadata = filter_metadata_for_scanned_pages(metadata) + metadata = list(filter_out_page_sized_images(metadata)) metadata = list(filter_out_tiny_images(metadata)) metadata = list(filter_invalid_metadata(metadata)) @@ -108,18 +111,8 @@ def filter_invalid_metadata(metadata): return compose(validate_size_and_passthrough, validate_coords_and_passthrough)(metadata) -# def get_metadata_for_images_on_page_2(page: fitz.fitz.Page): -# """Effectively the same as image_prediction.image_extractor.extractors.parsable.get_metadata_for_images_on_page, -# however without the validation steps since not required here and take a significant amount of time. -# """ -# # temporary solution to avoid circular imports without changing the original code -# from image_prediction.image_extractor.extractors.parsable import get_image_metadata, add_page_metadata -# -# image_infos = page.get_image_info(xrefs=True) -# metadata = lmap(get_image_metadata, image_infos) -# metadata = add_page_metadata(page, metadata) -# -# return metadata +def filter_out_page_sized_images(metadata): + yield from remove(__breaches_image_to_page_quotient, metadata) @lru_cache(maxsize=None)