refactor scanned page filtering WIP

This commit is contained in:
Julius Unverfehrt 2023-02-01 14:53:26 +01:00
parent 2d385b0a73
commit 9ec6cc19ba

View File

@ -9,7 +9,7 @@ from typing import Iterable, Iterator, List
import fitz
from PIL import Image
from funcy import merge, pluck, curry, compose, rcompose, lmap
from funcy import merge, pluck, curry, compose, rcompose
from image_prediction.formatter.formatters.enum import EnumFormatter
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
@ -46,8 +46,11 @@ class ParsablePDFImageExtractor(ImageExtractor):
yield from image_metadata_pairs
def __process_images_on_page(self, page: fitz.fitz.Page):
metadata = list(get_metadata_for_images_on_page(self.doc, page))
metadata = list(get_metadata_for_images_on_page(page))
metadata = filter_metadata_for_scanned_pages(metadata)
metadata = list(filter_out_tiny_images(metadata))
metadata = list(filter_invalid_metadata(metadata))
metadata = add_alpha_channel_info(self.doc, page, metadata)
images = get_images_on_page(self.doc, metadata)
clear_caches()
@ -89,21 +92,19 @@ def get_images_on_page(doc, metadata):
yield from images
def get_metadata_for_images_on_page(doc, page: fitz.Page):
def get_metadata_for_images_on_page(page: fitz.Page):
metadata = map(get_image_metadata, get_image_infos(page))
metadata = filter_out_tiny_images(metadata)
metadata = add_page_metadata(page, metadata)
metadata = add_alpha_channel_info(doc, page, metadata)
metadata = validate_coords_and_passthrough(metadata)
metadata = validate_size_and_passthrough(metadata)
yield from metadata
def filter_invalid_metadata(metadata):
    """Run image metadata through the coordinate check, then the size check.

    The metadata is first handed to ``validate_coords_and_passthrough`` and
    the result of that call is handed to ``validate_size_and_passthrough``;
    whatever the second validator returns is returned unchanged.

    NOTE(review): presumably each validator drops or flags entries that fail
    its check and passes the rest through -- confirm against the validator
    implementations.
    """
    coords_checked = validate_coords_and_passthrough(metadata)
    return validate_size_and_passthrough(coords_checked)
# def get_metadata_for_images_on_page_2(page: fitz.fitz.Page):
# """Effectively the same as image_prediction.image_extractor.extractors.parsable.get_metadata_for_images_on_page,
# however without the validation steps since not required here and take a significant amount of time.