fix: add small image filter logic

Introduces a preprocessing step that scans each page for page-sized images.
If one is encountered, all images on that page whose size ratio with
respect to the page size falls below a configured threshold are dropped.

This step has to occur before the image stitching logic, but it may drop
image parts that would otherwise constitute a stitched image. This is not
solvable, however: the small images must be dropped before further
processing, because otherwise faulty character images get stitched together
into what passes as a valid image but in reality isn't one.
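
In essence, the new step behaves like the following sketch (simplified and illustrative: the name filter_page_images and the plain (width, height) tuples are stand-ins for the extractor's image/metadata pairs; the defaults mirror the config values changed below):

    from math import sqrt
    from typing import List, Tuple

    Box = Tuple[float, float]  # (width, height)

    def filter_page_images(images: List[Box], page: Box,
                           tolerance: float = 0.0, min_ratio: float = 0.05) -> List[Box]:
        page_w, page_h = page
        # A page counts as "scanned" if any image on it is (near) page-sized.
        scanned = any(
            w / page_w >= 1 - tolerance and h / page_h >= 1 - tolerance
            for w, h in images
        )
        if not scanned:
            return images
        # On scanned pages, keep only images whose size ratio to the page
        # (geometric mean of the sides vs. geometric mean of the page sides)
        # clears the configured minimum.
        return [
            (w, h) for w, h in images
            if sqrt(w * h) / sqrt(page_w * page_h) >= min_ratio
        ]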
Julius Unverfehrt 2024-08-06 16:52:02 +02:00
parent 7f49642ba0
commit 4102a564a3
3 changed files with 52 additions and 9 deletions

View File

@@ -20,6 +20,12 @@ min = 0.5
 min = 0.05
 max = 0.75
+
+[filters.is_scanned_page]
+# Minimum permissible image to page ratio tolerance for a page to be considered scanned.
+# This is only used for filtering small images on scanned pages and is applied before processing the image, therefore
+# superseding the image_to_page_quotient filter that only applies a tag to the image after processing.
+tolerance = 0
 
 # Image width to height ratio
 [filters.image_width_to_height_quotient]
 min = 0.1
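
A worked example of how the two settings interact (page dimensions assumed A4-ish, in points): with tolerance = 0, the page is treated as scanned only if some image covers it completely, i.e. both side ratios reach 1 - 0 = 1. On such a page,

    from scipy.stats import gmean

    page = [595, 842]                     # assumed A4-ish page size in points
    gmean([60, 80]) / gmean(page)         # ~0.098 -> >= 0.05, kept
    gmean([10, 10]) / gmean(page)         # ~0.014 -> <  0.05, dropped

so stray character-glyph images fall below the min = 0.05 quotient while ordinary figures survive.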

View File

@@ -3,7 +3,7 @@ import json
 import traceback
 from _operator import itemgetter
 from functools import partial, lru_cache
-from itertools import chain, starmap, filterfalse
+from itertools import chain, starmap, filterfalse, tee
 from operator import itemgetter, truth
 from typing import Iterable, Iterator, List, Union
@@ -11,9 +11,10 @@ import fitz
 import numpy as np
 from PIL import Image
 from funcy import merge, pluck, compose, rcompose, remove, keep
+from scipy.stats import gmean
 
 from image_prediction.config import CONFIG
-from image_prediction.exceptions import InvalidBox, BadXref
+from image_prediction.exceptions import InvalidBox
 from image_prediction.formatter.formatters.enum import EnumFormatter
 from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
 from image_prediction.info import Info
@@ -64,9 +65,13 @@ class ParsablePDFImageExtractor(ImageExtractor):
     @staticmethod
     def __filter_valid_images(image_metadata_pairs: Iterable[ImageMetadataPair]) -> Iterator[ImageMetadataPair]:
-        def validate(image: Image.Image, metadata: dict):
+        def validate_image_is_not_corrupt(image: Image.Image, metadata: dict):
+            """See RED-5148: Some images are corrupt and cannot be processed by the image classifier. This function
+            filters out such images by trying to resize and convert them to RGB. If this fails, the image is considered
+            corrupt and is dropped.
+            TODO: find cleaner solution
+            """
             try:
-                # TODO: stand-in heuristic for testing if image is valid => find cleaner solution (RED-5148)
                 image.resize((100, 100)).convert("RGB")
                 return ImageMetadataPair(image, metadata)
             except (OSError, Exception) as err:
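
Read in isolation, the resize-and-convert heuristic above amounts to roughly this standalone sketch (illustrative only; is_processable is not part of the module's API):

    from PIL import Image

    def is_processable(image: Image.Image) -> bool:
        # Corrupt or truncated image data usually raises during
        # resize/convert, well before it would reach the classifier.
        try:
            image.resize((100, 100)).convert("RGB")
            return True
        except Exception:
            return False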
@@ -74,7 +79,41 @@ class ParsablePDFImageExtractor(ImageExtractor):
                 logger.warning(f"Invalid image encountered. Image metadata:\n{metadata}\n\n{traceback.format_exc()}")
                 return None
 
-        return filter(truth, starmap(validate, image_metadata_pairs))
+        def filter_small_images_on_scanned_pages(image_metadata_pairs) -> Iterable[ImageMetadataPair]:
+            """See RED-9746: Small images on scanned pages should be dropped, so they are not classified. This is a
+            heuristic to filter out images that are too small in relation to the page size if they are on a scanned page.
+            The ratio is computed as the geometric mean of the width and height of the image divided by the geometric mean
+            of the width and height of the page. If the ratio is below the threshold, the image is dropped.
+            """
+
+            def image_is_a_scanned_page(image_metadata_pair: ImageMetadataPair) -> bool:
+                tolerance = CONFIG.filters.is_scanned_page.tolerance
+                width_ratio = image_metadata_pair.metadata[Info.WIDTH] / image_metadata_pair.metadata[Info.PAGE_WIDTH]
+                height_ratio = (
+                    image_metadata_pair.metadata[Info.HEIGHT] / image_metadata_pair.metadata[Info.PAGE_HEIGHT]
+                )
+                return width_ratio >= 1 - tolerance and height_ratio >= 1 - tolerance
+
+            def image_fits_geometric_mean_ratio(image_metadata_pair: ImageMetadataPair) -> bool:
+                min_ratio = CONFIG.filters.image_to_page_quotient.min
+                metadatum = image_metadata_pair.metadata
+                image_gmean = gmean([metadatum[Info.WIDTH], metadatum[Info.HEIGHT]])
+                page_gmean = gmean([metadatum[Info.PAGE_WIDTH], metadatum[Info.PAGE_HEIGHT]])
+                ratio = image_gmean / page_gmean
+                return ratio >= min_ratio
+
+            pairs, pairs_copy = tee(image_metadata_pairs)
+            if any(map(image_is_a_scanned_page, pairs_copy)):
+                logger.debug("Scanned page detected, filtering out small images ...")
+                return filter(image_fits_geometric_mean_ratio, pairs)
+            else:
+                return pairs
+
+        image_metadata_pairs = filter_small_images_on_scanned_pages(image_metadata_pairs)
+        return filter(truth, starmap(validate_image_is_not_corrupt, image_metadata_pairs))
 
 
 def extract_pages(doc, page_range):
@@ -99,7 +138,6 @@ def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page):
     return list(metadata)
 
 
-
 def get_metadata_for_images_on_page(page: fitz.Page):
     metadata = map(get_image_metadata, get_image_infos(page))
     metadata = add_page_metadata(page, metadata)
@@ -172,7 +210,6 @@ def _normalize_channels(array: np.ndarray):
 
 
-
 def get_image_metadata(image_info):
     xref, coords = itemgetter("xref", "bbox")(image_info)
     x1, y1, x2, y2 = map(rounder, coords)
@@ -228,7 +265,6 @@ def get_page_metadata(page):
 
 
-
 def has_alpha_channel(doc, xref):
     maybe_image = load_image_handle_from_xref(doc, xref)
     maybe_smask = maybe_image["smask"] if maybe_image else None
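
One subtlety in filter_small_images_on_scanned_pages above: image_metadata_pairs is an iterator and cannot be consumed twice, so itertools.tee duplicates it before the any() probe. A minimal demonstration of the same peek-then-decide pattern (values are illustrative):

    from itertools import tee

    stream = iter([3, -1, 4])
    main, probe = tee(stream)
    # any() consumes `probe` until it hits a match; tee buffers those
    # items, so `main` still yields the complete sequence afterwards.
    if any(v < 0 for v in probe):
        print(list(main))  # [3, -1, 4]

The trade-off: if no image on the page is a scanned page, any() exhausts the probe and tee buffers the entire stream in memory before handing it on unchanged.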

View File

@@ -56,7 +56,8 @@ def annotate_image(doc, image_info):
 
 
 def init():
     PDFNet.Initialize(
-        "Knecon AG(en.knecon.swiss):OEM:DDA-R::WL+:AMS(20211029):BECC974307DAB4F34B513BC9B2531B24496F6FCB83CD8AC574358A959730B622FABEF5C7"
+        # "Knecon AG(en.knecon.swiss):OEM:DDA-R::WL+:AMS(20211029):BECC974307DAB4F34B513BC9B2531B24496F6FCB83CD8AC574358A959730B622FABEF5C7"
+        "Knecon AG:OEM:DDA-R::WL+:AMS(20270129):EA5FDFB23C7F36B9C2AE606F4F0D9197DE1FB649119F9730B622FABEF5C7"
     )