Introduce ad hoc filter for scanned pages

This commit is contained in:
Julius Unverfehrt 2023-01-31 17:18:28 +01:00
parent 4eb7f3c40a
commit 1490d27308
3 changed files with 16 additions and 48 deletions

View File

@ -5,14 +5,16 @@ import traceback
from functools import partial, lru_cache
from itertools import chain, starmap, filterfalse
from operator import itemgetter, truth
from typing import List, Iterable, Iterator
from typing import Iterable, Iterator
import fitz
from PIL import Image
from funcy import rcompose, merge, pluck, curry, compose
from funcy import merge, pluck, curry, compose
from image_prediction.formatter.formatters.enum import EnumFormatter
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
from image_prediction.image_extractor.extractors.utils import get_image_infos, get_image_metadata, add_page_metadata
from image_prediction.image_extractor.filters import filter_scanned_pages
from image_prediction.info import Info
from image_prediction.stitching.stitching import stitch_pairs
from image_prediction.stitching.utils import validate_box_coords, validate_box_size
@ -44,6 +46,9 @@ class ParsablePDFImageExtractor(ImageExtractor):
yield from image_metadata_pairs
# FIXME: Heuristic filtering shouldn't take place here,
# consider introducing a preprocessing step before extracting images.
@filter_scanned_pages
def __process_images_on_page(self, page: fitz.fitz.Page):
images = get_images_on_page(self.doc, page)
metadata = get_metadata_for_images_on_page(self.doc, page)
@ -104,34 +109,12 @@ def get_metadata_for_images_on_page(doc, page: fitz.Page):
yield from metadata
@lru_cache(maxsize=None)
def get_image_infos(page: fitz.Page) -> List[dict]:
    """Return the image-info records of ``page``, requested with ``xrefs=True``.

    The result is memoized per ``page`` argument in an unbounded cache, so
    repeated calls with the same page reuse the first result.
    """
    image_infos = page.get_image_info(xrefs=True)
    return image_infos
@lru_cache(maxsize=None)
def xref_to_image(doc, xref) -> Image:
    """Open the image stored under ``xref`` in ``doc``.

    Returns ``None`` when ``load_image_handle_from_xref`` yields a falsy
    handle; otherwise the handle's ``"image"`` bytes are opened as an image.
    Results are memoized per ``(doc, xref)`` pair in an unbounded cache.
    """
    image_handle = load_image_handle_from_xref(doc, xref)
    if not image_handle:
        return None
    raw_bytes = io.BytesIO(image_handle["image"])
    return Image.open(raw_bytes)
def get_image_metadata(image_info):
    """Build the box metadata dict for one image-info record.

    The four values of ``image_info["bbox"]`` are passed through ``rounder``
    and reported as X1/Y1/X2/Y2, together with the absolute width and height
    of the resulting box.
    """
    left, top, right, bottom = (rounder(coord) for coord in image_info["bbox"])
    box_metadata = {
        Info.WIDTH: abs(right - left),
        Info.HEIGHT: abs(bottom - top),
        Info.X1: left,
        Info.X2: right,
        Info.Y1: top,
        Info.Y2: bottom,
    }
    return box_metadata
def validate_coords_and_passthrough(metadata):
    """Lazily yield ``validate_box_coords(entry)`` for every entry in ``metadata``."""
    for entry in metadata:
        yield validate_box_coords(entry)
@ -144,10 +127,6 @@ def validate_size_and_passthrough(metadata):
yield from map(validate_box_size, metadata)
def add_page_metadata(page, metadata):
    """Yield every entry of ``metadata`` merged with the page-level metadata of ``page``.

    The page metadata is computed once (when iteration starts) and passed as
    the first argument to ``merge`` for each entry.
    """
    page_metadata = get_page_metadata(page)
    for entry in metadata:
        yield merge(page_metadata, entry)
def add_alpha_channel_info(doc, page, metadata):
page_to_xrefs = compose(curry(pluck)("xref"), get_image_infos)
@ -166,19 +145,6 @@ def load_image_handle_from_xref(doc, xref):
return doc.extract_image(xref)
# Helper used to turn coordinates and page sizes into integers: ``rcompose``
# chains its functions left to right, i.e. ``round`` is applied first and the
# result is then passed to ``int``.
rounder = rcompose(round, int)
def get_page_metadata(page):
    """Return the page-level metadata dict for ``page``.

    Width and height come from ``page.mediabox_size`` (each passed through
    ``rounder``); the page index is ``page.number``.
    """
    width, height = (rounder(dimension) for dimension in page.mediabox_size)
    page_metadata = {
        Info.PAGE_WIDTH: width,
        Info.PAGE_HEIGHT: height,
        Info.PAGE_IDX: page.number,
    }
    return page_metadata
def has_alpha_channel(doc, xref):
maybe_image = load_image_handle_from_xref(doc, xref)

View File

@ -21,11 +21,6 @@ class ResponseTransformer(Transformer):
def build_image_info(data: dict) -> dict:
def compute_geometric_quotient():
page_area_sqrt = math.sqrt(abs(page_width * page_height))
image_area_sqrt = math.sqrt(abs(x2 - x1) * abs(y2 - y1))
return image_area_sqrt / page_area_sqrt
page_width, page_height, x1, x2, y1, y2, width, height, alpha = itemgetter(
"page_width", "page_height", "x1", "x2", "y1", "y2", "width", "height", "alpha"
)(data)
@ -34,7 +29,7 @@ def build_image_info(data: dict) -> dict:
label = classification["label"]
representation = data["representation"]
geometric_quotient = round(compute_geometric_quotient(), 4)
geometric_quotient = round(compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1), 4)
min_image_to_page_quotient_breached = bool(
geometric_quotient < get_class_specific_min_image_to_page_quotient(label)
@ -89,6 +84,12 @@ def build_image_info(data: dict) -> dict:
return image_info
def compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1):
    """Return the ratio of the image box's side scale to the page's side scale.

    Both scales are the square root of the respective absolute area: the box
    spanned by ``(x1, y1)``-``(x2, y2)`` and the ``page_width`` x
    ``page_height`` page.

    Raises:
        ZeroDivisionError: if the page area is zero.
    """
    box_width = abs(x2 - x1)
    box_height = abs(y2 - y1)
    page_area = abs(page_width * page_height)
    return math.sqrt(box_width * box_height) / math.sqrt(page_area)
def get_class_specific_min_image_to_page_quotient(label, table=None):
return get_class_specific_value(
"REL_IMAGE_SIZE", label, "min", CONFIG.filters.image_to_page_quotient.min, table=table

View File

@ -9,7 +9,8 @@ from funcy import first, rest
from image_prediction.extraction import extract_images_from_pdf
from image_prediction.image_extractor.extractor import ImageMetadataPair
from image_prediction.image_extractor.extractors.parsable import extract_pages, get_image_infos, has_alpha_channel
from image_prediction.image_extractor.extractors.parsable import extract_pages, has_alpha_channel
from image_prediction.image_extractor.extractors.utils import get_image_infos
from image_prediction.info import Info
from test.utils.comparison import metadata_equal, image_sets_equal
from test.utils.generation.pdf import add_image, pdf_stream