fix: add small image filter logic

Introduces a preprocessing step that scans each page for page-sized images.
If one is encountered, all images on that page whose size ratio with
respect to the page size falls below a configured threshold are dropped.

This step has to occur before the image stitching logic, but it may drop
image parts that would otherwise constitute a stitched image. This is not
solvable, however: the small images must be dropped before further
processing, because otherwise faulty character images get stitched together
into what passes as a valid image but in reality isn't one.
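
In essence, the new step behaves like the following sketch (simplified and illustrative: the name filter_page_images and the plain (width, height) tuples are stand-ins for the extractor's image/metadata pairs; the defaults mirror the config values changed below):

    from math import sqrt
    from typing import List, Tuple

    Box = Tuple[float, float]  # (width, height)

    def filter_page_images(images: List[Box], page: Box,
                           tolerance: float = 0.0, min_ratio: float = 0.05) -> List[Box]:
        page_w, page_h = page
        # A page counts as "scanned" if any image on it is (near) page-sized.
        scanned = any(
            w / page_w >= 1 - tolerance and h / page_h >= 1 - tolerance
            for w, h in images
        )
        if not scanned:
            return images
        # On scanned pages, keep only images whose size ratio to the page
        # (geometric mean of the sides vs. geometric mean of the page sides)
        # clears the configured minimum.
        return [
            (w, h) for w, h in images
            if sqrt(w * h) / sqrt(page_w * page_h) >= min_ratio
        ]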
Julius Unverfehrt 2024-08-06 16:52:02 +02:00
parent 7f49642ba0
commit 4102a564a3
3 changed files with 52 additions and 9 deletions

View File

@@ -20,6 +20,12 @@ min = 0.5
 min = 0.05
 max = 0.75
+
+[filters.is_scanned_page]
+# Minimum permissible image to page ratio tolerance for a page to be considered scanned.
+# This is only used for filtering small images on scanned pages and is applied before processing the image, therefore
+# superseding the image_to_page_quotient filter that only applies a tag to the image after processing.
+tolerance = 0
 
 # Image width to height ratio
 [filters.image_width_to_height_quotient]
 min = 0.1
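
A worked example of how the two settings interact (page dimensions assumed A4-ish, in points): with tolerance = 0, the page is treated as scanned only if some image covers it completely, i.e. both side ratios reach 1 - 0 = 1. On such a page,

    from scipy.stats import gmean

    page = [595, 842]                     # assumed A4-ish page size in points
    gmean([60, 80]) / gmean(page)         # ~0.098 -> >= 0.05, kept
    gmean([10, 10]) / gmean(page)         # ~0.014 -> <  0.05, dropped

so stray character-glyph images fall below the min = 0.05 quotient while ordinary figures survive.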

View File

@@ -3,7 +3,7 @@ import json
 import traceback
 from _operator import itemgetter
 from functools import partial, lru_cache
-from itertools import chain, starmap, filterfalse
+from itertools import chain, starmap, filterfalse, tee
 from operator import itemgetter, truth
 from typing import Iterable, Iterator, List, Union
@@ -11,9 +11,10 @@ import fitz
 import numpy as np
 from PIL import Image
 from funcy import merge, pluck, compose, rcompose, remove, keep
+from scipy.stats import gmean
 
 from image_prediction.config import CONFIG
-from image_prediction.exceptions import InvalidBox, BadXref
+from image_prediction.exceptions import InvalidBox
 from image_prediction.formatter.formatters.enum import EnumFormatter
 from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
 from image_prediction.info import Info
@@ -64,9 +65,13 @@ class ParsablePDFImageExtractor(ImageExtractor):
     @staticmethod
     def __filter_valid_images(image_metadata_pairs: Iterable[ImageMetadataPair]) -> Iterator[ImageMetadataPair]:
-        def validate(image: Image.Image, metadata: dict):
+        def validate_image_is_not_corrupt(image: Image.Image, metadata: dict):
+            """See RED-5148: Some images are corrupt and cannot be processed by the image classifier. This function
+            filters out such images by trying to resize and convert them to RGB. If this fails, the image is considered
+            corrupt and is dropped.
+            TODO: find cleaner solution
+            """
             try:
-                # TODO: stand-in heuristic for testing if image is valid => find cleaner solution (RED-5148)
                 image.resize((100, 100)).convert("RGB")
                 return ImageMetadataPair(image, metadata)
             except (OSError, Exception) as err:
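
Read in isolation, the resize-and-convert heuristic above amounts to roughly this standalone sketch (illustrative only; is_processable is not part of the module's API):

    from PIL import Image

    def is_processable(image: Image.Image) -> bool:
        # Corrupt or truncated image data usually raises during
        # resize/convert, well before it would reach the classifier.
        try:
            image.resize((100, 100)).convert("RGB")
            return True
        except Exception:
            return False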
@@ -74,7 +79,41 @@ class ParsablePDFImageExtractor(ImageExtractor):
                 logger.warning(f"Invalid image encountered. Image metadata:\n{metadata}\n\n{traceback.format_exc()}")
                 return None
 
-        return filter(truth, starmap(validate, image_metadata_pairs))
+        def filter_small_images_on_scanned_pages(image_metadata_pairs) -> Iterable[ImageMetadataPair]:
+            """See RED-9746: Small images on scanned pages should be dropped, so they are not classified. This is a
+            heuristic to filter out images that are too small in relation to the page size if they are on a scanned page.
+            The ratio is computed as the geometric mean of the width and height of the image divided by the geometric mean
+            of the width and height of the page. If the ratio is below the threshold, the image is dropped.
+            """
+
+            def image_is_a_scanned_page(image_metadata_pair: ImageMetadataPair) -> bool:
+                tolerance = CONFIG.filters.is_scanned_page.tolerance
+                width_ratio = image_metadata_pair.metadata[Info.WIDTH] / image_metadata_pair.metadata[Info.PAGE_WIDTH]
+                height_ratio = (
+                    image_metadata_pair.metadata[Info.HEIGHT] / image_metadata_pair.metadata[Info.PAGE_HEIGHT]
+                )
+                return width_ratio >= 1 - tolerance and height_ratio >= 1 - tolerance
+
+            def image_fits_geometric_mean_ratio(image_metadata_pair: ImageMetadataPair) -> bool:
+                min_ratio = CONFIG.filters.image_to_page_quotient.min
+                metadatum = image_metadata_pair.metadata
+                image_gmean = gmean([metadatum[Info.WIDTH], metadatum[Info.HEIGHT]])
+                page_gmean = gmean([metadatum[Info.PAGE_WIDTH], metadatum[Info.PAGE_HEIGHT]])
+                ratio = image_gmean / page_gmean
+                return ratio >= min_ratio
+
+            pairs, pairs_copy = tee(image_metadata_pairs)
+            if any(map(image_is_a_scanned_page, pairs_copy)):
+                logger.debug("Scanned page detected, filtering out small images ...")
+                return filter(image_fits_geometric_mean_ratio, pairs)
+            else:
+                return pairs
+
+        image_metadata_pairs = filter_small_images_on_scanned_pages(image_metadata_pairs)
+        return filter(truth, starmap(validate_image_is_not_corrupt, image_metadata_pairs))
 
 
 def extract_pages(doc, page_range):
@@ -99,7 +138,6 @@ def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page):
     return list(metadata)
 
 
-
 def get_metadata_for_images_on_page(page: fitz.Page):
     metadata = map(get_image_metadata, get_image_infos(page))
     metadata = add_page_metadata(page, metadata)
@@ -172,7 +210,6 @@ def _normalize_channels(array: np.ndarray):
 
 
-
 def get_image_metadata(image_info):
     xref, coords = itemgetter("xref", "bbox")(image_info)
     x1, y1, x2, y2 = map(rounder, coords)
@@ -228,7 +265,6 @@ def get_page_metadata(page):
 
 
-
 def has_alpha_channel(doc, xref):
     maybe_image = load_image_handle_from_xref(doc, xref)
     maybe_smask = maybe_image["smask"] if maybe_image else None
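
One subtlety in filter_small_images_on_scanned_pages above: image_metadata_pairs is an iterator and cannot be consumed twice, so itertools.tee duplicates it before the any() probe. A minimal demonstration of the same peek-then-decide pattern (values are illustrative):

    from itertools import tee

    stream = iter([3, -1, 4])
    main, probe = tee(stream)
    # any() consumes `probe` until it hits a match; tee buffers those
    # items, so `main` still yields the complete sequence afterwards.
    if any(v < 0 for v in probe):
        print(list(main))  # [3, -1, 4]

The trade-off: if no image on the page is a scanned page, any() exhausts the probe and tee buffers the entire stream in memory before handing it on unchanged.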

View File

@@ -56,7 +56,8 @@ def annotate_image(doc, image_info):
 
 
 def init():
     PDFNet.Initialize(
-        "Knecon AG(en.knecon.swiss):OEM:DDA-R::WL+:AMS(20211029):BECC974307DAB4F34B513BC9B2531B24496F6FCB83CD8AC574358A959730B622FABEF5C7"
+        # "Knecon AG(en.knecon.swiss):OEM:DDA-R::WL+:AMS(20211029):BECC974307DAB4F34B513BC9B2531B24496F6FCB83CD8AC574358A959730B622FABEF5C7"
+        "Knecon AG:OEM:DDA-R::WL+:AMS(20270129):EA5FDFB23C7F36B9C2AE606F4F0D9197DE1FB649119F9730B622FABEF5C7"
     )