refactor scanned page filtering WIP
This commit is contained in:
parent
c55777e339
commit
8c7349c2d1
@ -15,9 +15,6 @@ from funcy import merge, pluck, curry, compose, rcompose, remove
|
||||
from image_prediction.config import CONFIG
|
||||
from image_prediction.formatter.formatters.enum import EnumFormatter
|
||||
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
|
||||
from image_prediction.image_extractor.filters import (
|
||||
filter_metadata_for_scanned_pages,
|
||||
)
|
||||
from image_prediction.info import Info
|
||||
from image_prediction.stitching.stitching import stitch_pairs
|
||||
from image_prediction.stitching.utils import validate_box_coords, validate_box_size
|
||||
@ -51,12 +48,10 @@ class ParsablePDFImageExtractor(ImageExtractor):
|
||||
yield from image_metadata_pairs
|
||||
|
||||
def __process_images_on_page(self, page: fitz.fitz.Page):
|
||||
metadata = list(get_metadata_for_images_on_page(page))
|
||||
metadata = list(filter_out_page_sized_images(metadata))
|
||||
metadata = list(filter_out_tiny_images(metadata))
|
||||
metadata = list(filter_invalid_metadata(metadata))
|
||||
metadata = get_metadata_for_images_on_page(page)
|
||||
metadata = the_great_filter(metadata)
|
||||
|
||||
metadata = add_alpha_channel_info(self.doc, page, metadata)
|
||||
metadata = list(add_alpha_channel_info(self.doc, page, metadata))
|
||||
|
||||
images = get_images_on_page(self.doc, metadata)
|
||||
|
||||
@ -109,12 +104,27 @@ def get_metadata_for_images_on_page(page: fitz.Page):
|
||||
yield from metadata
|
||||
|
||||
|
||||
def the_great_filter(metadata):
|
||||
return compose(
|
||||
list,
|
||||
filter_out_page_sized_images,
|
||||
list,
|
||||
filter_out_tiny_images,
|
||||
list,
|
||||
filter_invalid_metadata,
|
||||
)(metadata)
|
||||
|
||||
|
||||
def filter_invalid_metadata(metadata):
|
||||
return compose(validate_size_and_passthrough, validate_coords_and_passthrough)(metadata)
|
||||
|
||||
|
||||
def filter_out_page_sized_images(metadata):
|
||||
yield from remove(__breaches_image_to_page_quotient, metadata)
|
||||
yield from remove(breaches_image_to_page_quotient, metadata)
|
||||
|
||||
|
||||
def filter_out_tiny_images(metadata):
|
||||
yield from filterfalse(tiny, metadata)
|
||||
|
||||
|
||||
@lru_cache(maxsize=None)
|
||||
@ -151,10 +161,6 @@ def validate_coords_and_passthrough(metadata):
|
||||
yield from map(validate_box_coords, metadata)
|
||||
|
||||
|
||||
def filter_out_tiny_images(metadata):
|
||||
yield from filterfalse(tiny, metadata)
|
||||
|
||||
|
||||
def validate_size_and_passthrough(metadata):
|
||||
yield from map(validate_box_size, metadata)
|
||||
|
||||
@ -222,7 +228,7 @@ def clear_caches():
|
||||
atexit.register(clear_caches)
|
||||
|
||||
|
||||
def __breaches_image_to_page_quotient(metadatum):
|
||||
def breaches_image_to_page_quotient(metadatum):
|
||||
page_width, page_height, x1, x2, y1, y2, width, height = itemgetter(
|
||||
Info.PAGE_WIDTH, Info.PAGE_HEIGHT, Info.X1, Info.X2, Info.Y1, Info.Y2, Info.WIDTH, Info.HEIGHT
|
||||
)(metadatum)
|
||||
|
||||
@ -1,23 +0,0 @@
|
||||
from typing import List
|
||||
|
||||
from funcy import first, second
|
||||
|
||||
from image_prediction.image_extractor.extractors.parsable import __breaches_image_to_page_quotient
|
||||
from image_prediction.info import Info
|
||||
from image_prediction.utils import get_logger
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
|
||||
def filter_metadata_for_scanned_pages(metadata: List[dict]):
|
||||
assert isinstance(metadata, list)
|
||||
if is_metadata_of_a_scanned_page(metadata):
|
||||
logger.info(f"Page {first(metadata)[Info.PAGE_IDX]} is a scanned page; skipping image extraction.")
|
||||
return []
|
||||
return metadata
|
||||
|
||||
|
||||
def is_metadata_of_a_scanned_page(metadata):
|
||||
return first(map(__breaches_image_to_page_quotient, metadata)) and not second(metadata)
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user