refactor scanned page filtering WIP

Julius Unverfehrt 2023-02-01 14:38:55 +01:00
parent 5bd5e0cf2b
commit 2d385b0a73
4 changed files with 42 additions and 54 deletions

View File

@@ -9,11 +9,11 @@ from typing import Iterable, Iterator, List
 import fitz
 from PIL import Image
-from funcy import merge, pluck, curry, compose, rcompose
+from funcy import merge, pluck, curry, compose, rcompose, lmap
 from image_prediction.formatter.formatters.enum import EnumFormatter
 from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
-from image_prediction.image_extractor.filters import filter_scanned_pages
+from image_prediction.image_extractor.filters import filter_metadata_for_scanned_pages
 from image_prediction.info import Info
 from image_prediction.stitching.stitching import stitch_pairs
 from image_prediction.stitching.utils import validate_box_coords, validate_box_size
@@ -45,13 +45,10 @@ class ParsablePDFImageExtractor(ImageExtractor):
         yield from image_metadata_pairs

-    # FIXME: Heuristic filtering shouldn't take place here,
-    # consider introducing a preprocessing step before extracting images,
-    # e.g. together with an image validation step for broken images.
-    @filter_scanned_pages
     def __process_images_on_page(self, page: fitz.fitz.Page):
-        images = get_images_on_page(self.doc, page)
-        metadata = get_metadata_for_images_on_page(self.doc, page)
+        metadata = list(get_metadata_for_images_on_page(self.doc, page))
+        metadata = filter_metadata_for_scanned_pages(metadata)
+        images = get_images_on_page(self.doc, metadata)
         clear_caches()
         image_metadata_pairs = starmap(ImageMetadataPair, filter(all, zip(images, metadata)))
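For reference, the filter(all, zip(images, metadata)) line kept in the new body pairs each image with its metadata and silently drops any pair with a falsy member (e.g. an image that failed to load). A minimal standalone sketch, with stand-in names rather than the project's classes:

```python
from collections import namedtuple
from itertools import starmap

ImageMetadataPair = namedtuple("ImageMetadataPair", ["image", "metadata"])  # stand-in

images = ["img_a", None, "img_c"]          # None: an image that failed to load
metadata = [{"xref": 1}, {"xref": 2}, {}]  # {}: falsy metadata gets dropped too

# filter(all, ...) keeps only the tuples whose members are all truthy
pairs = list(starmap(ImageMetadataPair, filter(all, zip(images, metadata))))
print(pairs)  # [ImageMetadataPair(image='img_a', metadata={'xref': 1})]
```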
@@ -85,10 +82,8 @@ def extract_pages(doc, page_range):
     yield from pages

-@lru_cache(maxsize=None)
-def get_images_on_page(doc, page: fitz.Page):
-    image_infos = get_image_infos(page)
-    xrefs = map(itemgetter("xref"), image_infos)
+def get_images_on_page(doc, metadata):
+    xrefs = pluck(Info.XREF, metadata)
     images = map(partial(xref_to_image, doc), xrefs)
     yield from images
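The rewritten get_images_on_page no longer parses the page itself; it pulls the xrefs out of the already-extracted metadata with funcy's pluck, which lazily extracts one key from each mapping:

```python
from funcy import pluck

# In the real code the key is the Info.XREF enum member, not a plain string.
metadata = [{"xref": 7, "width": 100}, {"xref": 9, "width": 40}]
print(list(pluck("xref", metadata)))  # [7, 9]
```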
@@ -97,18 +92,32 @@ def get_images_on_page(doc, page: fitz.Page):
 def get_metadata_for_images_on_page(doc, page: fitz.Page):
     metadata = map(get_image_metadata, get_image_infos(page))
-    metadata = validate_coords_and_passthrough(metadata)
     metadata = filter_out_tiny_images(metadata)
-    metadata = validate_size_and_passthrough(metadata)
     metadata = add_page_metadata(page, metadata)
     metadata = add_alpha_channel_info(doc, page, metadata)
+    metadata = validate_coords_and_passthrough(metadata)
+    metadata = validate_size_and_passthrough(metadata)
     yield from metadata


+# def get_metadata_for_images_on_page_2(page: fitz.fitz.Page):
+#     """Effectively the same as image_prediction.image_extractor.extractors.parsable.get_metadata_for_images_on_page,
+#     however without the validation steps, since they are not required here and take a significant amount of time.
+#     """
+#     # temporary solution to avoid circular imports without changing the original code
+#     from image_prediction.image_extractor.extractors.parsable import get_image_metadata, add_page_metadata
+#
+#     image_infos = page.get_image_info(xrefs=True)
+#     metadata = lmap(get_image_metadata, image_infos)
+#     metadata = add_page_metadata(page, metadata)
+#
+#     return metadata
+
+
 @lru_cache(maxsize=None)
 def get_image_infos(page: fitz.Page) -> List[dict]:
     return page.get_image_info(xrefs=True)
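Worth noting: get_metadata_for_images_on_page is a lazy pipeline, so each rebinding of metadata merely wraps the previous iterator and nothing runs until the result is consumed. That is presumably why the refactored caller materializes it with list(...) first, since the metadata is inspected for the scanned-page check and then zipped with the images, and a bare generator would be exhausted after one pass:

```python
def numbers():
    print("producing")
    yield from [1, 2, 3]

it = map(str, numbers())  # nothing printed yet: the pipeline is lazy
items = list(it)          # "producing" appears here, on consumption
print(items)              # ['1', '2', '3']
print(list(it))           # [] -- the generator is exhausted after one pass
```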
@@ -122,7 +131,8 @@ def xref_to_image(doc, xref) -> Image:
 def get_image_metadata(image_info):
-    x1, y1, x2, y2 = map(rounder, image_info["bbox"])
+    xref, coords = itemgetter("xref", "bbox")(image_info)
+    x1, y1, x2, y2 = map(rounder, coords)
     width = abs(x2 - x1)
     height = abs(y2 - y1)
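operator.itemgetter called with several keys returns a tuple of the corresponding values, which is what the new first line of get_image_metadata unpacks:

```python
from operator import itemgetter

image_info = {"xref": 7, "bbox": (10.2, 20.7, 110.4, 220.1), "width": 100}
xref, coords = itemgetter("xref", "bbox")(image_info)
print(xref, coords)  # 7 (10.2, 20.7, 110.4, 220.1)
```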
@@ -134,6 +144,7 @@ def get_image_metadata(image_info):
         Info.X2: x2,
         Info.Y1: y1,
         Info.Y2: y2,
+        Info.XREF: xref,
     }
@@ -206,7 +217,6 @@ def tiny(metadata):
 def clear_caches():
     get_image_infos.cache_clear()
     load_image_handle_from_xref.cache_clear()
-    get_images_on_page.cache_clear()
     xref_to_image.cache_clear()
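Dropping @lru_cache from get_images_on_page (and the matching cache_clear call above) is forced by the new signature in any case: lru_cache keys on the arguments, and the metadata list is not hashable:

```python
from functools import lru_cache

@lru_cache(maxsize=None)
def cached(arg):
    return arg

cached((1, 2))      # fine: tuples are hashable
try:
    cached([1, 2])  # lists are not, so caching on metadata would break
except TypeError as e:
    print(e)        # unhashable type: 'list'
```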

View File

@@ -1,11 +1,9 @@
 from _operator import itemgetter
-from typing import Callable
+from typing import List

-import fitz
-from funcy import first, lmap, second
+from funcy import first, second

 from image_prediction.config import CONFIG
-from image_prediction.image_extractor.extractor import ImageExtractor
 from image_prediction.info import Info
 from image_prediction.transformer.transformers.response import compute_geometric_quotient
 from image_prediction.utils import get_logger
@@ -13,26 +11,15 @@ from image_prediction.utils import get_logger
 logger = get_logger()


-def filter_scanned_pages(page_processor: Callable):
-    """Decorator for the __process_images_on_page method of the ParsablePDFImageExtractor.
-    This makes it so that scanned pages won't be processed (and are thus ultimately removed from the pipeline).
-    A scanned page is defined by
-    - having only one image on a page and
-    - that image having an image_to_page ratio greater than the allowed max value
-    defined in CONFIG.filters.image_to_page_quotient.max"""
-
-    def inner(extractor: ImageExtractor, page: fitz.fitz.Page):
-        if is_a_scanned_page(page):
-            logger.info(f"Page {page.number} is a scanned page; skipping image extraction.")
-        else:
-            yield from page_processor(extractor, page)
-
-    logger.info(f"Extracting pages with filtering scanned pages...")
-    return inner
+def filter_metadata_for_scanned_pages(metadata: List[dict]):
+    assert isinstance(metadata, list)
+    if is_metadata_of_a_scanned_page(metadata):
+        logger.info(f"Page {first(metadata)[Info.PAGE_IDX]} is a scanned page; skipping image extraction.")
+        return []
+    return metadata


-def is_a_scanned_page(page: fitz.fitz.Page):
-    metadata = get_metadata_for_images_on_page(page)
+def is_metadata_of_a_scanned_page(metadata):
     return first(map(__breaches_image_to_page_quotient, metadata)) and not second(metadata)
@@ -43,17 +30,3 @@ def __breaches_image_to_page_quotient(metadatum):
     geometric_quotient = compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1)
     quotient_breached = bool(geometric_quotient > CONFIG.filters.image_to_page_quotient.max)
     return quotient_breached
-
-
-def get_metadata_for_images_on_page(page: fitz.fitz.Page):
-    """Effectively the same as image_prediction.image_extractor.extractors.parsable.get_metadata_for_images_on_page,
-    however without the validation steps since not required here and take a significant amount of time.
-    """
-    # temporary solution to avoid circular imports without changing the original code
-    from image_prediction.image_extractor.extractors.parsable import get_image_metadata, add_page_metadata
-
-    image_infos = page.get_image_info(xrefs=True)
-    metadata = lmap(get_image_metadata, image_infos)
-    metadata = add_page_metadata(page, metadata)
-
-    return metadata
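Worked example of the new metadata-based check: a page counts as scanned when its first (and only) image breaches the image-to-page quotient and no second image exists. funcy's first and second return None on too-short sequences, so the one-image case falls out naturally. The quotient field and threshold below are made-up stand-ins for __breaches_image_to_page_quotient and CONFIG.filters.image_to_page_quotient.max:

```python
from funcy import first, second

MAX_QUOTIENT = 0.8  # stand-in for CONFIG.filters.image_to_page_quotient.max

def breaches_quotient(metadatum):  # stand-in for __breaches_image_to_page_quotient
    return metadatum["image_to_page_quotient"] > MAX_QUOTIENT

def is_metadata_of_a_scanned_page(metadata):
    return first(map(breaches_quotient, metadata)) and not second(metadata)

full_page_scan = [{"image_to_page_quotient": 0.95}]
two_figures = [{"image_to_page_quotient": 0.95}, {"image_to_page_quotient": 0.10}]

print(bool(is_metadata_of_a_scanned_page(full_page_scan)))  # True
print(bool(is_metadata_of_a_scanned_page(two_figures)))     # False: two images
```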

View File

@@ -12,3 +12,4 @@ class Info(Enum):
     Y1 = "y1"
     Y2 = "y2"
     ALPHA = "alpha"
+    XREF = "xref"

View File

@@ -1,12 +1,15 @@
+from functools import partial
 from itertools import starmap, product, repeat
 from typing import Iterable

 import numpy as np
 from PIL.Image import Image
 from frozendict import frozendict
-from funcy import ilen
+from funcy import ilen, compose, omit

 from image_prediction.estimator.preprocessor.utils import image_to_normalized_tensor
 from image_prediction.info import Info
+from image_prediction.utils.generic import lift


 def transform_equal(a, b):
@@ -18,7 +21,8 @@ def images_equal(im1: Image, im2: Image, **kwargs):
 def metadata_equal(mdat1: Iterable[dict], mdat2: Iterable[dict]):
-    return set(map(frozendict, mdat1)) == set(map(frozendict, mdat2))
+    f = compose(set, lift(compose(frozendict, partial(omit, keys=[Info.XREF]))))
+    return f(mdat1) == f(mdat2)


 def image_sets_equal(ims1: Iterable[Image], ims2: Iterable[Image]):
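Sketch of what the new metadata_equal computes, assuming lift from image_prediction.utils.generic maps a function over an iterable (an assumption; only its call site is visible here). The effect of the change: now that every metadatum carries Info.XREF, two metadata sets compare equal even when their xrefs differ, since an xref identifies an object inside one particular PDF rather than the image content:

```python
from functools import partial
from frozendict import frozendict
from funcy import compose, omit

def lift(g):  # assumed behaviour of image_prediction.utils.generic.lift
    return partial(map, g)

# Strip the xref from each metadatum, freeze it so it is hashable, collect a set.
f = compose(set, lift(compose(frozendict, partial(omit, keys=["xref"]))))

mdat1 = [{"xref": 7, "width": 100}]
mdat2 = [{"xref": 9, "width": 100}]  # same image data, different xref
print(f(mdat1) == f(mdat2))  # True
```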