refactor scanned page filtering WIP
This commit is contained in:
parent
436a32ad2b
commit
0f440bdb09
@ -9,11 +9,14 @@ from typing import Iterable, Iterator, List
|
|||||||
|
|
||||||
import fitz
|
import fitz
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from funcy import merge, pluck, curry, compose, rcompose
|
from funcy import merge, pluck, curry, compose, rcompose, remove
|
||||||
|
|
||||||
from image_prediction.formatter.formatters.enum import EnumFormatter
|
from image_prediction.formatter.formatters.enum import EnumFormatter
|
||||||
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
|
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
|
||||||
from image_prediction.image_extractor.filters import filter_metadata_for_scanned_pages
|
from image_prediction.image_extractor.filters import (
|
||||||
|
filter_metadata_for_scanned_pages,
|
||||||
|
__breaches_image_to_page_quotient,
|
||||||
|
)
|
||||||
from image_prediction.info import Info
|
from image_prediction.info import Info
|
||||||
from image_prediction.stitching.stitching import stitch_pairs
|
from image_prediction.stitching.stitching import stitch_pairs
|
||||||
from image_prediction.stitching.utils import validate_box_coords, validate_box_size
|
from image_prediction.stitching.utils import validate_box_coords, validate_box_size
|
||||||
@ -47,7 +50,7 @@ class ParsablePDFImageExtractor(ImageExtractor):
|
|||||||
|
|
||||||
def __process_images_on_page(self, page: fitz.fitz.Page):
|
def __process_images_on_page(self, page: fitz.fitz.Page):
|
||||||
metadata = list(get_metadata_for_images_on_page(page))
|
metadata = list(get_metadata_for_images_on_page(page))
|
||||||
metadata = filter_metadata_for_scanned_pages(metadata)
|
metadata = list(filter_out_page_sized_images(metadata))
|
||||||
metadata = list(filter_out_tiny_images(metadata))
|
metadata = list(filter_out_tiny_images(metadata))
|
||||||
metadata = list(filter_invalid_metadata(metadata))
|
metadata = list(filter_invalid_metadata(metadata))
|
||||||
|
|
||||||
@ -108,18 +111,8 @@ def filter_invalid_metadata(metadata):
|
|||||||
return compose(validate_size_and_passthrough, validate_coords_and_passthrough)(metadata)
|
return compose(validate_size_and_passthrough, validate_coords_and_passthrough)(metadata)
|
||||||
|
|
||||||
|
|
||||||
# def get_metadata_for_images_on_page_2(page: fitz.fitz.Page):
|
def filter_out_page_sized_images(metadata):
|
||||||
# """Effectively the same as image_prediction.image_extractor.extractors.parsable.get_metadata_for_images_on_page,
|
yield from remove(__breaches_image_to_page_quotient, metadata)
|
||||||
# however without the validation steps, since they are not required here and take a significant amount of time.
|
|
||||||
# """
|
|
||||||
# # temporary solution to avoid circular imports without changing the original code
|
|
||||||
# from image_prediction.image_extractor.extractors.parsable import get_image_metadata, add_page_metadata
|
|
||||||
#
|
|
||||||
# image_infos = page.get_image_info(xrefs=True)
|
|
||||||
# metadata = lmap(get_image_metadata, image_infos)
|
|
||||||
# metadata = add_page_metadata(page, metadata)
|
|
||||||
#
|
|
||||||
# return metadata
|
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=None)
|
@lru_cache(maxsize=None)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user