From 1490d273080900374cf2894d01b66192fb2e1189 Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Tue, 31 Jan 2023 17:18:28 +0100 Subject: [PATCH] introduce adhoc filter for scanned pages --- .../image_extractor/extractors/parsable.py | 48 +++---------------- .../transformer/transformers/response.py | 13 ++--- test/unit_tests/image_extractor_test.py | 3 +- 3 files changed, 16 insertions(+), 48 deletions(-) diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index eac09e1..0199d49 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -5,14 +5,16 @@ import traceback from functools import partial, lru_cache from itertools import chain, starmap, filterfalse from operator import itemgetter, truth -from typing import List, Iterable, Iterator +from typing import Iterable, Iterator import fitz from PIL import Image -from funcy import rcompose, merge, pluck, curry, compose +from funcy import merge, pluck, curry, compose from image_prediction.formatter.formatters.enum import EnumFormatter from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair +from image_prediction.image_extractor.extractors.utils import get_image_infos, get_image_metadata, add_page_metadata +from image_prediction.image_extractor.filters import filter_scanned_pages from image_prediction.info import Info from image_prediction.stitching.stitching import stitch_pairs from image_prediction.stitching.utils import validate_box_coords, validate_box_size @@ -44,6 +46,9 @@ class ParsablePDFImageExtractor(ImageExtractor): yield from image_metadata_pairs + # FIXME: Heuristic filtering shouldn't take place here, + # consider introducing a preprocessing step before extracting images. + @filter_scanned_pages def __process_images_on_page(self, page: fitz.fitz.Page): images = get_images_on_page(self.doc, page) metadata = get_metadata_for_images_on_page(self.doc, page) @@ -104,34 +109,12 @@ def get_metadata_for_images_on_page(doc, page: fitz.Page): yield from metadata -@lru_cache(maxsize=None) -def get_image_infos(page: fitz.Page) -> List[dict]: - return page.get_image_info(xrefs=True) - - @lru_cache(maxsize=None) def xref_to_image(doc, xref) -> Image: maybe_image = load_image_handle_from_xref(doc, xref) return Image.open(io.BytesIO(maybe_image["image"])) if maybe_image else None -def get_image_metadata(image_info): - - x1, y1, x2, y2 = map(rounder, image_info["bbox"]) - - width = abs(x2 - x1) - height = abs(y2 - y1) - - return { - Info.WIDTH: width, - Info.HEIGHT: height, - Info.X1: x1, - Info.X2: x2, - Info.Y1: y1, - Info.Y2: y2, - } - - def validate_coords_and_passthrough(metadata): yield from map(validate_box_coords, metadata) @@ -144,10 +127,6 @@ def validate_size_and_passthrough(metadata): yield from map(validate_box_size, metadata) -def add_page_metadata(page, metadata): - yield from map(partial(merge, get_page_metadata(page)), metadata) - - def add_alpha_channel_info(doc, page, metadata): page_to_xrefs = compose(curry(pluck)("xref"), get_image_infos) @@ -166,19 +145,6 @@ def load_image_handle_from_xref(doc, xref): return doc.extract_image(xref) -rounder = rcompose(round, int) - - -def get_page_metadata(page): - page_width, page_height = map(rounder, page.mediabox_size) - - return { - Info.PAGE_WIDTH: page_width, - Info.PAGE_HEIGHT: page_height, - Info.PAGE_IDX: page.number, - } - - def has_alpha_channel(doc, xref): maybe_image = load_image_handle_from_xref(doc, xref) diff --git a/image_prediction/transformer/transformers/response.py b/image_prediction/transformer/transformers/response.py index 378fe7b..288c510 100644 --- a/image_prediction/transformer/transformers/response.py +++ b/image_prediction/transformer/transformers/response.py @@ -21,11 +21,6 @@ class ResponseTransformer(Transformer): def build_image_info(data: dict) -> dict: - def compute_geometric_quotient(): - page_area_sqrt = math.sqrt(abs(page_width * page_height)) - image_area_sqrt = math.sqrt(abs(x2 - x1) * abs(y2 - y1)) - return image_area_sqrt / page_area_sqrt - page_width, page_height, x1, x2, y1, y2, width, height, alpha = itemgetter( "page_width", "page_height", "x1", "x2", "y1", "y2", "width", "height", "alpha" )(data) @@ -34,7 +29,7 @@ def build_image_info(data: dict) -> dict: label = classification["label"] representation = data["representation"] - geometric_quotient = round(compute_geometric_quotient(), 4) + geometric_quotient = round(compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1), 4) min_image_to_page_quotient_breached = bool( geometric_quotient < get_class_specific_min_image_to_page_quotient(label) @@ -89,6 +84,12 @@ def build_image_info(data: dict) -> dict: return image_info +def compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1): + page_area_sqrt = math.sqrt(abs(page_width * page_height)) + image_area_sqrt = math.sqrt(abs(x2 - x1) * abs(y2 - y1)) + return image_area_sqrt / page_area_sqrt + + def get_class_specific_min_image_to_page_quotient(label, table=None): return get_class_specific_value( "REL_IMAGE_SIZE", label, "min", CONFIG.filters.image_to_page_quotient.min, table=table diff --git a/test/unit_tests/image_extractor_test.py b/test/unit_tests/image_extractor_test.py index e52b2b5..c7e1fc2 100644 --- a/test/unit_tests/image_extractor_test.py +++ b/test/unit_tests/image_extractor_test.py @@ -9,7 +9,8 @@ from funcy import first, rest from image_prediction.extraction import extract_images_from_pdf from image_prediction.image_extractor.extractor import ImageMetadataPair -from image_prediction.image_extractor.extractors.parsable import extract_pages, get_image_infos, has_alpha_channel +from image_prediction.image_extractor.extractors.parsable import extract_pages, has_alpha_channel +from image_prediction.image_extractor.extractors.utils import get_image_infos from image_prediction.info import Info from test.utils.comparison import metadata_equal, image_sets_equal from test.utils.generation.pdf import add_image, pdf_stream