clean-up filter logic

- Logic structured so that it can easily be
removed from the extraction logic again if needed
This commit is contained in:
Julius Unverfehrt 2023-02-01 08:49:24 +01:00
parent 1490d27308
commit 368c54a8be
3 changed files with 118 additions and 6 deletions

View File

@ -5,15 +5,14 @@ import traceback
from functools import partial, lru_cache
from itertools import chain, starmap, filterfalse
from operator import itemgetter, truth
from typing import Iterable, Iterator
from typing import Iterable, Iterator, List
import fitz
from PIL import Image
from funcy import merge, pluck, curry, compose
from funcy import merge, pluck, curry, compose, rcompose
from image_prediction.formatter.formatters.enum import EnumFormatter
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
from image_prediction.image_extractor.extractors.utils import get_image_infos, get_image_metadata, add_page_metadata
from image_prediction.image_extractor.filters import filter_scanned_pages
from image_prediction.info import Info
from image_prediction.stitching.stitching import stitch_pairs
@ -47,7 +46,8 @@ class ParsablePDFImageExtractor(ImageExtractor):
yield from image_metadata_pairs
# FIXME: Heuristic filtering shouldn't take place here,
# consider introducing a preprocessing step before extracting images.
# consider introducing a preprocessing step before extracting images,
# e.g. together with a image validation step for broken images.
@filter_scanned_pages
def __process_images_on_page(self, page: fitz.fitz.Page):
images = get_images_on_page(self.doc, page)
@ -109,12 +109,34 @@ def get_metadata_for_images_on_page(doc, page: fitz.Page):
yield from metadata
@lru_cache(maxsize=None)
def get_image_infos(page: "fitz.Page") -> List[dict]:
    """Return PyMuPDF's image-info records (with xrefs) for *page*.

    NOTE(review): the cache is unbounded and keyed on the page object, so it
    keeps every queried page alive for the process lifetime — confirm this is
    acceptable for long-running workers.
    """
    infos = page.get_image_info(xrefs=True)
    return infos
@lru_cache(maxsize=None)
def xref_to_image(doc, xref) -> "Image.Image | None":
    """Decode the embedded image referenced by *xref* in *doc*.

    Returns a PIL image, or None when no image handle could be loaded for the
    xref. NOTE(review): the unbounded cache keeps every decoded image (and the
    doc used as part of the key) alive for the process lifetime — confirm this
    is acceptable for large documents.
    """
    maybe_image = load_image_handle_from_xref(doc, xref)
    return Image.open(io.BytesIO(maybe_image["image"])) if maybe_image else None
def get_image_metadata(image_info):
    """Build the per-image geometry record from a PyMuPDF image-info dict.

    Coordinates come from ``image_info["bbox"]`` and are rounded via the
    module-level ``rounder``; width/height are the absolute bbox extents.
    """
    x1, y1, x2, y2 = (rounder(coord) for coord in image_info["bbox"])
    return {
        Info.WIDTH: abs(x2 - x1),
        Info.HEIGHT: abs(y2 - y1),
        Info.X1: x1,
        Info.X2: x2,
        Info.Y1: y1,
        Info.Y2: y2,
    }
def validate_coords_and_passthrough(metadata):
    """Lazily yield each metadata record after box-coordinate validation."""
    for record in metadata:
        yield validate_box_coords(record)
@ -127,6 +149,10 @@ def validate_size_and_passthrough(metadata):
yield from map(validate_box_size, metadata)
def add_page_metadata(page, metadata):
    """Yield each metadata record merged with *page*'s page-level metadata.

    Record keys win over page keys on collision (funcy ``merge`` semantics).
    """
    page_meta = get_page_metadata(page)
    for record in metadata:
        yield merge(page_meta, record)
def add_alpha_channel_info(doc, page, metadata):
page_to_xrefs = compose(curry(pluck)("xref"), get_image_infos)
@ -145,6 +171,19 @@ def load_image_handle_from_xref(doc, xref):
return doc.extract_image(xref)
rounder = rcompose(round, int)
def get_page_metadata(page):
    """Return page-level metadata: rounded mediabox dimensions and page index."""
    width, height = page.mediabox_size
    return {
        Info.PAGE_WIDTH: rounder(width),
        Info.PAGE_HEIGHT: rounder(height),
        Info.PAGE_IDX: page.number,
    }
def has_alpha_channel(doc, xref):
maybe_image = load_image_handle_from_xref(doc, xref)

View File

@ -0,0 +1,74 @@
from _operator import itemgetter
from typing import Callable, List
import fitz
from funcy import first, compose, lmap
from image_prediction.config import CONFIG
from image_prediction.image_extractor.extractor import ImageExtractor
from image_prediction.transformer.transformers.response import compute_geometric_quotient
from image_prediction.utils import get_logger
logger = get_logger()
def filter_scanned_pages(page_processor: Callable):
    """Decorator for the __process_images_on_page method of the ParsablePDFImageExtractor.

    This makes it so that scanned pages won't be processed (and are thus
    ultimately removed from the pipeline). A scanned page is defined by
      - having only one image on the page, and
      - that image having an image_to_page ratio greater than the allowed max
        value found in CONFIG.filters.image_to_page_quotient.max
    """
    from functools import wraps  # local import: module has no functools import

    @wraps(page_processor)  # preserve the wrapped method's name/docstring
    def inner(self: ImageExtractor, page: fitz.fitz.Page):
        metadata = get_metadata(page)
        if is_a_scanned_page(metadata):
            logger.debug(f"Page {page.number} won't be processed since it is a scanned page.")
            yield from []
        else:
            yield from page_processor(self, page)

    # Logged once per decoration (i.e. at class-definition time), not per page.
    logger.info("Extracting pages with filtering scanned pages...")
    return inner
def is_a_scanned_page(metadata: List[dict]):
    """A page counts as scanned when its only image covers too much of it."""
    if not _contains_only_one_image(metadata):
        return False
    return _breaches_image_to_page_quotient(first(metadata))
def _breaches_image_to_page_quotient(metadata):
    """True when the image's geometric coverage of its page exceeds the
    configured maximum (CONFIG.filters.image_to_page_quotient.max).

    Only the page dimensions and bbox corners feed the quotient; the record's
    width/height entries are not needed here (previously unpacked unused).
    """
    page_width, page_height, x1, x2, y1, y2 = itemgetter(
        "page_width", "page_height", "x1", "x2", "y1", "y2"
    )(metadata)
    geometric_quotient = round(compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1), 4)
    # `>` already yields a bool — no bool(...) wrapper needed.
    return geometric_quotient > CONFIG.filters.image_to_page_quotient.max
def _contains_only_one_image(metadata):
return True if len(metadata) == 1 else False
def get_metadata(page: "fitz.fitz.Page") -> List[dict]:
    """Collect per-image metadata for every image on *page*.

    Each record carries the page dimensions and number plus the image's
    rounded bbox corners and extents, keyed by plain strings so this filter
    module stays decoupled from the extraction pipeline's Info enum.

    Bug fix: the previous ``compose(round, int)`` applies right-to-left, i.e.
    ``round(int(x))`` — plain truncation — diverging from the extractor's
    ``rounder = rcompose(round, int)`` (= ``int(round(x))``). Coordinates are
    now rounded to the nearest integer before casting.
    """
    page_number = page.number
    page_width, page_height = page.mediabox_size

    def _image_metadata(image_info: dict) -> dict:
        x1, y1, x2, y2 = (int(round(coord)) for coord in image_info["bbox"])
        return {
            "page_width": page_width,
            "page_height": page_height,
            "page_number": page_number,
            "width": abs(x2 - x1),
            "height": abs(y2 - y1),
            "x1": x1,
            "x2": x2,
            "y1": y1,
            "y2": y2,
        }

    return [_image_metadata(info) for info in page.get_image_info(xrefs=True)]

View File

@ -9,8 +9,7 @@ from funcy import first, rest
from image_prediction.extraction import extract_images_from_pdf
from image_prediction.image_extractor.extractor import ImageMetadataPair
from image_prediction.image_extractor.extractors.parsable import extract_pages, has_alpha_channel
from image_prediction.image_extractor.extractors.utils import get_image_infos
from image_prediction.image_extractor.extractors.parsable import extract_pages, has_alpha_channel, get_image_infos
from image_prediction.info import Info
from test.utils.comparison import metadata_equal, image_sets_equal
from test.utils.generation.pdf import add_image, pdf_stream