refactor scanned page filtering WIP

This commit is contained in:
Julius Unverfehrt 2023-02-01 15:47:40 +01:00
parent 8c7349c2d1
commit b880e892ec

View File

@ -10,7 +10,7 @@ from typing import Iterable, Iterator, List
import fitz
from PIL import Image
from funcy import merge, pluck, curry, compose, rcompose, remove
from funcy import merge, pluck, curry, compose, rcompose, remove, rpartial
from image_prediction.config import CONFIG
from image_prediction.formatter.formatters.enum import EnumFormatter
@ -48,11 +48,7 @@ class ParsablePDFImageExtractor(ImageExtractor):
yield from image_metadata_pairs
def __process_images_on_page(self, page: fitz.fitz.Page):
metadata = get_metadata_for_images_on_page(page)
metadata = the_great_filter(metadata)
metadata = list(add_alpha_channel_info(self.doc, page, metadata))
metadata = extract_valid_metadata(self.doc, page)
images = get_images_on_page(self.doc, metadata)
clear_caches()
@ -95,6 +91,15 @@ def get_images_on_page(doc, metadata):
yield from images
def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page):
    """Collect the metadata of the images on ``page`` that pass filtering.

    The page's image metadata is gathered with
    ``get_metadata_for_images_on_page``, narrowed by ``filter_valid_metadata``
    and then handed to ``add_alpha_channel_info`` together with ``doc`` and
    ``page``. The outcome is materialised into a list before being returned.
    """
    page_metadata = get_metadata_for_images_on_page(page)
    valid_metadata = filter_valid_metadata(page_metadata)
    with_alpha_info = add_alpha_channel_info(doc, page, valid_metadata)
    return list(with_alpha_info)
def get_metadata_for_images_on_page(page: fitz.Page):
metadata = map(get_image_metadata, get_image_infos(page))
@ -104,13 +109,10 @@ def get_metadata_for_images_on_page(page: fitz.Page):
yield from metadata
def the_great_filter(metadata):
return compose(
list,
# Yields the items that remain after `metadata` has been run through the
# composed filter chain below. funcy's `compose` applies its arguments
# right-to-left, so `filter_invalid_metadata` runs first and
# `filter_out_page_sized_images` runs last.
# NOTE(review): this view looks like a diff whose +/- markers were stripped;
# the two `list,` entries may be removed-side lines rather than part of the
# current function — confirm against the real file.
def filter_valid_metadata(metadata):
yield from compose(
filter_out_page_sized_images,
list,
filter_out_tiny_images,
list,
filter_invalid_metadata,
)(metadata)