refactor scanned page filtering WIP

parent 5bd5e0cf2b
commit 2d385b0a73
@@ -9,11 +9,11 @@ from typing import Iterable, Iterator, List
 import fitz
 from PIL import Image
-from funcy import merge, pluck, curry, compose, rcompose
+from funcy import merge, pluck, curry, compose, rcompose, lmap
 
 from image_prediction.formatter.formatters.enum import EnumFormatter
 from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
-from image_prediction.image_extractor.filters import filter_scanned_pages
+from image_prediction.image_extractor.filters import filter_metadata_for_scanned_pages
 from image_prediction.info import Info
 from image_prediction.stitching.stitching import stitch_pairs
 from image_prediction.stitching.utils import validate_box_coords, validate_box_size
@@ -45,13 +45,10 @@ class ParsablePDFImageExtractor(ImageExtractor):
 
         yield from image_metadata_pairs
 
-    # FIXME: Heuristic filtering shouldn't take place here,
-    # consider introducing a preprocessing step before extracting images,
-    # e.g. together with an image validation step for broken images.
-    @filter_scanned_pages
     def __process_images_on_page(self, page: fitz.fitz.Page):
-        images = get_images_on_page(self.doc, page)
-        metadata = get_metadata_for_images_on_page(self.doc, page)
+        metadata = list(get_metadata_for_images_on_page(self.doc, page))
+        metadata = filter_metadata_for_scanned_pages(metadata)
+        images = get_images_on_page(self.doc, metadata)
         clear_caches()
 
         image_metadata_pairs = starmap(ImageMetadataPair, filter(all, zip(images, metadata)))
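
Note: the refactor replaces the page-level decorator with an explicit metadata-level filter. A minimal sketch of the new flow, assuming a fitz document `doc` and a page `page` (variable names here are illustrative only):

    metadata = list(get_metadata_for_images_on_page(doc, page))
    metadata = filter_metadata_for_scanned_pages(metadata)  # [] when the page looks scanned
    images = get_images_on_page(doc, metadata)              # resolves images via Info.XREF

Because a scanned page now yields an empty metadata list, zip(images, metadata) produces no pairs and the page drops out of the pipeline without any decorator involved.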
@@ -85,10 +82,8 @@ def extract_pages(doc, page_range):
     yield from pages
 
 
-@lru_cache(maxsize=None)
-def get_images_on_page(doc, page: fitz.Page):
-    image_infos = get_image_infos(page)
-    xrefs = map(itemgetter("xref"), image_infos)
+def get_images_on_page(doc, metadata):
+    xrefs = pluck(Info.XREF, metadata)
     images = map(partial(xref_to_image, doc), xrefs)
 
     yield from images
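
Note: dropping @lru_cache is consistent with the new signature — the metadata argument is a list of dicts, which is unhashable and could not serve as a cache key. funcy's pluck simply extracts one key from each mapping:

    from funcy import pluck

    list(pluck("xref", [{"xref": 7}, {"xref": 9}]))  # -> [7, 9]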
@@ -97,18 +92,32 @@ def get_images_on_page(doc, page: fitz.Page):
 def get_metadata_for_images_on_page(doc, page: fitz.Page):
 
     metadata = map(get_image_metadata, get_image_infos(page))
-    metadata = validate_coords_and_passthrough(metadata)
 
     metadata = filter_out_tiny_images(metadata)
-    metadata = validate_size_and_passthrough(metadata)
 
     metadata = add_page_metadata(page, metadata)
 
     metadata = add_alpha_channel_info(doc, page, metadata)
 
+    metadata = validate_coords_and_passthrough(metadata)
+    metadata = validate_size_and_passthrough(metadata)
     yield from metadata
 
 
+# def get_metadata_for_images_on_page_2(page: fitz.fitz.Page):
+#     """Effectively the same as image_prediction.image_extractor.extractors.parsable.get_metadata_for_images_on_page,
+#     but without the validation steps, since they are not required here and take a significant amount of time.
+#     """
+#     # temporary solution to avoid circular imports without changing the original code
+#     from image_prediction.image_extractor.extractors.parsable import get_image_metadata, add_page_metadata
+#
+#     image_infos = page.get_image_info(xrefs=True)
+#     metadata = lmap(get_image_metadata, image_infos)
+#     metadata = add_page_metadata(page, metadata)
+#
+#     return metadata
 
 
 @lru_cache(maxsize=None)
 def get_image_infos(page: fitz.Page) -> List[dict]:
     return page.get_image_info(xrefs=True)
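
Note: the commented-out get_metadata_for_images_on_page_2 is, apart from the name, the same helper this commit deletes from filters.py further down — presumably parked here (still as a comment, hence the WIP in the commit title) so that filters.py no longer needs the circular-import workaround.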
@@ -122,7 +131,8 @@ def xref_to_image(doc, xref) -> Image:
 
 def get_image_metadata(image_info):
 
-    x1, y1, x2, y2 = map(rounder, image_info["bbox"])
+    xref, coords = itemgetter("xref", "bbox")(image_info)
+    x1, y1, x2, y2 = map(rounder, coords)
 
     width = abs(x2 - x1)
     height = abs(y2 - y1)
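
Note: itemgetter with several keys returns a tuple, which fetches the xref and the bounding box in one lookup. A toy example with an illustrative dict:

    from operator import itemgetter

    xref, coords = itemgetter("xref", "bbox")({"xref": 7, "bbox": (0, 0, 10, 10)})
    # xref == 7, coords == (0, 0, 10, 10)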
@@ -134,6 +144,7 @@ def get_image_metadata(image_info):
         Info.X2: x2,
         Info.Y1: y1,
         Info.Y2: y2,
+        Info.XREF: xref,
     }
 
 
@@ -206,7 +217,6 @@ def tiny(metadata):
 def clear_caches():
     get_image_infos.cache_clear()
     load_image_handle_from_xref.cache_clear()
-    get_images_on_page.cache_clear()
     xref_to_image.cache_clear()
 
 
@@ -1,11 +1,9 @@
 from _operator import itemgetter
-from typing import Callable
+from typing import List
 
-import fitz
-from funcy import first, lmap, second
+from funcy import first, second
 
 from image_prediction.config import CONFIG
-from image_prediction.image_extractor.extractor import ImageExtractor
 from image_prediction.info import Info
 from image_prediction.transformer.transformers.response import compute_geometric_quotient
 from image_prediction.utils import get_logger
@@ -13,26 +11,15 @@ from image_prediction.utils import get_logger
 logger = get_logger()
 
 
-def filter_scanned_pages(page_processor: Callable):
-    """Decorator for the __process_images_on_page method of the ParsablePDFImageExtractor.
-    This makes it so that scanned pages won't be processed (and are thus ultimately removed from the pipeline).
-    A scanned page is defined by
-    - having only one image on a page and
-    - that image having an image_to_page ratio greater than the allowed max value
-    defined in CONFIG.filters.image_to_page_quotient.max"""
-
-    def inner(extractor: ImageExtractor, page: fitz.fitz.Page):
-        if is_a_scanned_page(page):
-            logger.info(f"Page {page.number} is a scanned page; skipping image extraction.")
-        else:
-            yield from page_processor(extractor, page)
-
-    logger.info(f"Extracting pages with filtering scanned pages...")
-    return inner
+def filter_metadata_for_scanned_pages(metadata: List[dict]):
+    assert isinstance(metadata, list)
+    if is_metadata_of_a_scanned_page(metadata):
+        logger.info(f"Page {first(metadata)[Info.PAGE_IDX]} is a scanned page; skipping image extraction.")
+        return []
+    return metadata
 
 
-def is_a_scanned_page(page: fitz.fitz.Page):
-    metadata = get_metadata_for_images_on_page(page)
+def is_metadata_of_a_scanned_page(metadata):
     return first(map(__breaches_image_to_page_quotient, metadata)) and not second(metadata)
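
Note: the scanned-page test reads as "the first image on the page breaches the image-to-page quotient, and there is no second image". A toy illustration with funcy, using a hypothetical breaches() as a stand-in for the private helper:

    from funcy import first, second

    metadata = [{"page_idx": 0}]        # exactly one (page-sized) image
    def breaches(m): return True        # stand-in: pretend the quotient is exceeded
    first(map(breaches, metadata))      # True
    not second(metadata)                # True -> the page counts as scanned

Since funcy's first() returns None on an empty sequence, an empty metadata list is never classified as scanned.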
@@ -43,17 +30,3 @@ def __breaches_image_to_page_quotient(metadatum):
     geometric_quotient = compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1)
     quotient_breached = bool(geometric_quotient > CONFIG.filters.image_to_page_quotient.max)
     return quotient_breached
-
-
-def get_metadata_for_images_on_page(page: fitz.fitz.Page):
-    """Effectively the same as image_prediction.image_extractor.extractors.parsable.get_metadata_for_images_on_page,
-    but without the validation steps, since they are not required here and take a significant amount of time.
-    """
-    # temporary solution to avoid circular imports without changing the original code
-    from image_prediction.image_extractor.extractors.parsable import get_image_metadata, add_page_metadata
-
-    image_infos = page.get_image_info(xrefs=True)
-    metadata = lmap(get_image_metadata, image_infos)
-    metadata = add_page_metadata(page, metadata)
-
-    return metadata
@@ -12,3 +12,4 @@ class Info(Enum):
     Y1 = "y1"
     Y2 = "y2"
     ALPHA = "alpha"
+    XREF = "xref"
@@ -1,12 +1,15 @@
+from functools import partial
 from itertools import starmap, product, repeat
 from typing import Iterable
 
 import numpy as np
 from PIL.Image import Image
 from frozendict import frozendict
-from funcy import ilen
+from funcy import ilen, compose, omit
 
 from image_prediction.estimator.preprocessor.utils import image_to_normalized_tensor
+from image_prediction.info import Info
+from image_prediction.utils.generic import lift
 
 
 def transform_equal(a, b):
@@ -18,7 +21,8 @@ def images_equal(im1: Image, im2: Image, **kwargs):
 
 
 def metadata_equal(mdat1: Iterable[dict], mdat2: Iterable[dict]):
-    return set(map(frozendict, mdat1)) == set(map(frozendict, mdat2))
+    f = compose(set, lift(compose(frozendict, partial(omit, keys=[Info.XREF]))))
+    return f(mdat1) == f(mdat2)
 
 
 def image_sets_equal(ims1: Iterable[Image], ims2: Iterable[Image]):
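
Note: metadata_equal now ignores the xref when comparing, which matters once get_image_metadata embeds Info.XREF in every metadatum — two extractions of the same page should compare equal even if their xrefs differ. A sketch of the comparison, assuming lift() maps a function over an iterable (it lives in image_prediction.utils.generic and is not shown in this diff):

    strip_xref = compose(frozendict, partial(omit, keys=[Info.XREF]))
    f = compose(set, lift(strip_xref))
    f([{Info.XREF: 1, Info.X1: 0}]) == f([{Info.XREF: 2, Info.X1: 0}])  # True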