refactor scanned page filtering WIP
This commit is contained in:
parent
2d385b0a73
commit
9ec6cc19ba
@ -9,7 +9,7 @@ from typing import Iterable, Iterator, List
|
||||
|
||||
import fitz
|
||||
from PIL import Image
|
||||
from funcy import merge, pluck, curry, compose, rcompose, lmap
|
||||
from funcy import merge, pluck, curry, compose, rcompose
|
||||
|
||||
from image_prediction.formatter.formatters.enum import EnumFormatter
|
||||
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
|
||||
@ -46,8 +46,11 @@ class ParsablePDFImageExtractor(ImageExtractor):
|
||||
yield from image_metadata_pairs
|
||||
|
||||
def __process_images_on_page(self, page: fitz.fitz.Page):
|
||||
metadata = list(get_metadata_for_images_on_page(self.doc, page))
|
||||
metadata = list(get_metadata_for_images_on_page(page))
|
||||
metadata = filter_metadata_for_scanned_pages(metadata)
|
||||
metadata = list(filter_out_tiny_images(metadata))
|
||||
metadata = list(filter_invalid_metadata(metadata))
|
||||
metadata = add_alpha_channel_info(self.doc, page, metadata)
|
||||
images = get_images_on_page(self.doc, metadata)
|
||||
clear_caches()
|
||||
|
||||
@ -89,21 +92,19 @@ def get_images_on_page(doc, metadata):
|
||||
yield from images
|
||||
|
||||
|
||||
def get_metadata_for_images_on_page(doc, page: fitz.Page):
|
||||
def get_metadata_for_images_on_page(page: fitz.Page):
|
||||
|
||||
metadata = map(get_image_metadata, get_image_infos(page))
|
||||
|
||||
metadata = filter_out_tiny_images(metadata)
|
||||
|
||||
metadata = add_page_metadata(page, metadata)
|
||||
|
||||
metadata = add_alpha_channel_info(doc, page, metadata)
|
||||
|
||||
metadata = validate_coords_and_passthrough(metadata)
|
||||
metadata = validate_size_and_passthrough(metadata)
|
||||
yield from metadata
|
||||
|
||||
|
||||
def filter_invalid_metadata(metadata):
    """Pass image metadata through the coordinate check, then the size check.

    The validators are applied in sequence: the output of
    ``validate_coords_and_passthrough`` is fed into
    ``validate_size_and_passthrough`` and that result is returned.
    """
    coords_checked = validate_coords_and_passthrough(metadata)
    return validate_size_and_passthrough(coords_checked)
|
||||
|
||||
|
||||
# def get_metadata_for_images_on_page_2(page: fitz.fitz.Page):
|
||||
# """Effectively the same as image_prediction.image_extractor.extractors.parsable.get_metadata_for_images_on_page,
|
||||
# but without the validation steps, since they are not required here and take a significant amount of time.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user