From 9ec6cc19bab45a0d473cb4a8c892a6e55d79b3eb Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Wed, 1 Feb 2023 14:53:26 +0100 Subject: [PATCH] refactor scanned page filtering WIP --- .../image_extractor/extractors/parsable.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index cd5a505..5010c25 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -9,7 +9,7 @@ from typing import Iterable, Iterator, List import fitz from PIL import Image -from funcy import merge, pluck, curry, compose, rcompose, lmap +from funcy import merge, pluck, curry, compose, rcompose from image_prediction.formatter.formatters.enum import EnumFormatter from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair @@ -46,8 +46,11 @@ class ParsablePDFImageExtractor(ImageExtractor): yield from image_metadata_pairs def __process_images_on_page(self, page: fitz.fitz.Page): - metadata = list(get_metadata_for_images_on_page(self.doc, page)) + metadata = list(get_metadata_for_images_on_page(page)) metadata = filter_metadata_for_scanned_pages(metadata) + metadata = list(filter_out_tiny_images(metadata)) + metadata = list(filter_invalid_metadata(metadata)) + metadata = add_alpha_channel_info(self.doc, page, metadata) images = get_images_on_page(self.doc, metadata) clear_caches() @@ -89,21 +92,19 @@ def get_images_on_page(doc, metadata): yield from images -def get_metadata_for_images_on_page(doc, page: fitz.Page): +def get_metadata_for_images_on_page(page: fitz.Page): metadata = map(get_image_metadata, get_image_infos(page)) - metadata = filter_out_tiny_images(metadata) - metadata = add_page_metadata(page, metadata) - metadata = add_alpha_channel_info(doc, page, metadata) - - metadata = validate_coords_and_passthrough(metadata) - metadata = validate_size_and_passthrough(metadata) yield from metadata +def filter_invalid_metadata(metadata): + return compose(validate_size_and_passthrough, validate_coords_and_passthrough)(metadata) + + # def get_metadata_for_images_on_page_2(page: fitz.fitz.Page): # """Effectively the same as image_prediction.image_extractor.extractors.parsable.get_metadata_for_images_on_page, # however without the validation steps since not required here and take a significant amount of time.