refactor scanned page filtering WIP
This commit is contained in:
parent
8c7349c2d1
commit
b880e892ec
@ -10,7 +10,7 @@ from typing import Iterable, Iterator, List
|
||||
|
||||
import fitz
|
||||
from PIL import Image
|
||||
from funcy import merge, pluck, curry, compose, rcompose, remove
|
||||
from funcy import merge, pluck, curry, compose, rcompose, remove, rpartial
|
||||
|
||||
from image_prediction.config import CONFIG
|
||||
from image_prediction.formatter.formatters.enum import EnumFormatter
|
||||
@ -48,11 +48,7 @@ class ParsablePDFImageExtractor(ImageExtractor):
|
||||
yield from image_metadata_pairs
|
||||
|
||||
def __process_images_on_page(self, page: fitz.fitz.Page):
|
||||
metadata = get_metadata_for_images_on_page(page)
|
||||
metadata = the_great_filter(metadata)
|
||||
|
||||
metadata = list(add_alpha_channel_info(self.doc, page, metadata))
|
||||
|
||||
metadata = extract_valid_metadata(self.doc, page)
|
||||
images = get_images_on_page(self.doc, metadata)
|
||||
|
||||
clear_caches()
|
||||
@ -95,6 +91,15 @@ def get_images_on_page(doc, metadata):
|
||||
yield from images
|
||||
|
||||
|
||||
def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page):
    """Return the list of valid image metadata for one page.

    The steps run in this order:

    1. ``get_metadata_for_images_on_page`` collects the metadata for the page.
    2. ``filter_valid_metadata`` drops the entries that fail its filters.
    3. ``add_alpha_channel_info`` is called with ``doc``, ``page`` and the
       surviving metadata.

    The result of the last step is materialized into a ``list`` before it is
    returned, so callers can iterate over it more than once.
    """
    page_metadata = get_metadata_for_images_on_page(page)
    valid_metadata = filter_valid_metadata(page_metadata)
    with_alpha_info = add_alpha_channel_info(doc, page, valid_metadata)
    return list(with_alpha_info)
|
||||
|
||||
|
||||
def get_metadata_for_images_on_page(page: fitz.Page):
|
||||
|
||||
metadata = map(get_image_metadata, get_image_infos(page))
|
||||
@ -104,13 +109,10 @@ def get_metadata_for_images_on_page(page: fitz.Page):
|
||||
yield from metadata
|
||||
|
||||
|
||||
def the_great_filter(metadata):
|
||||
return compose(
|
||||
list,
|
||||
def filter_valid_metadata(metadata):
|
||||
yield from compose(
|
||||
filter_out_page_sized_images,
|
||||
list,
|
||||
filter_out_tiny_images,
|
||||
list,
|
||||
filter_invalid_metadata,
|
||||
)(metadata)
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user