fix: add small image filter logic
Introduces a preprocessing step that scans each page for page-sized images. If one is encountered, all images whose size falls below a configured ratio with respect to the page size are dropped. This step has to occur before the image-stitching logic, but it MIGHT drop image parts that would otherwise have constituted a valid image. This is, however, not solvable: the small images must be dropped before further processing, because the faulty character images would otherwise be stitched into a "valid" image that in reality isn't an image.
parent 7f49642ba0
commit 4102a564a3
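In outline, the rule described above might look like the following sketch. The Img record with width/height attributes and the function name are hypothetical, chosen for illustration; the actual implementation is filter_small_images_on_scanned_pages in the diff below.

def drop_small_images(images, page_w, page_h, min_ratio=0.05, tolerance=0.0):
    def covers_page(img):
        # a page-sized image in both dimensions marks the page as scanned
        return img.width / page_w >= 1 - tolerance and img.height / page_h >= 1 - tolerance

    def big_enough(img):
        # geometric mean of image sides relative to geometric mean of page sides
        return (img.width * img.height) ** 0.5 / (page_w * page_h) ** 0.5 >= min_ratio

    if any(covers_page(img) for img in images):
        return [img for img in images if big_enough(img)]
    return images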
@@ -20,6 +20,12 @@ min = 0.5
 min = 0.05
 max = 0.75
 
+[filters.is_scanned_page]
+# Minimum permissible image to page ratio tolerance for a page to be considered scanned.
+# This is only used for filtering small images on scanned pages and is applied before processing the image, therefore
+# superseding the image_to_page_quotient filter that only applies a tag to the image after processing.
+tolerance = 0
+
 # Image width to height ratio
 [filters.image_width_to_height_quotient]
 min = 0.1
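With tolerance = 0, the scanned-page check only fires when an image spans the full page in both dimensions. A quick illustration with made-up numbers (not from the repository):

tolerance = 0
width_ratio = 595 / 595    # image width / page width  -> 1.0
height_ratio = 838 / 842   # image height / page height -> ~0.995
is_scanned = width_ratio >= 1 - tolerance and height_ratio >= 1 - tolerance
print(is_scanned)  # False: even a 4 pt shortfall fails when tolerance = 0

A small positive tolerance would relax the check so that near-page-sized scans still count as scanned pages.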
@@ -3,7 +3,7 @@ import json
 import traceback
 from _operator import itemgetter
 from functools import partial, lru_cache
-from itertools import chain, starmap, filterfalse
+from itertools import chain, starmap, filterfalse, tee
 from operator import itemgetter, truth
 from typing import Iterable, Iterator, List, Union
 
@@ -11,9 +11,10 @@ import fitz
 import numpy as np
 from PIL import Image
 from funcy import merge, pluck, compose, rcompose, remove, keep
+from scipy.stats import gmean
 
 from image_prediction.config import CONFIG
-from image_prediction.exceptions import InvalidBox, BadXref
+from image_prediction.exceptions import InvalidBox
 from image_prediction.formatter.formatters.enum import EnumFormatter
 from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
 from image_prediction.info import Info
@@ -64,9 +65,13 @@ class ParsablePDFImageExtractor(ImageExtractor):
 
     @staticmethod
     def __filter_valid_images(image_metadata_pairs: Iterable[ImageMetadataPair]) -> Iterator[ImageMetadataPair]:
-        def validate(image: Image.Image, metadata: dict):
+        def validate_image_is_not_corrupt(image: Image.Image, metadata: dict):
+            """See RED-5148: Some images are corrupt and cannot be processed by the image classifier. This function
+            filters out such images by trying to resize and convert them to RGB. If this fails, the image is considered
+            corrupt and is dropped.
+            TODO: find cleaner solution
+            """
             try:
-                # TODO: stand-in heuristic for testing if image is valid => find cleaner solution (RED-5148)
                 image.resize((100, 100)).convert("RGB")
                 return ImageMetadataPair(image, metadata)
             except (OSError, Exception) as err:
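The resize-and-convert probe works because PIL opens images lazily: the header may parse fine while the pixel data is broken, and only a full decode surfaces the error. A self-contained illustration with a synthetic truncated JPEG (not repository code):

from io import BytesIO
from PIL import Image

buf = BytesIO()
Image.new("RGB", (50, 50)).save(buf, format="JPEG")
truncated = BytesIO(buf.getvalue()[:-20])  # cut bytes off the end of the stream

image = Image.open(truncated)  # succeeds: only the header is read here
try:
    image.resize((100, 100)).convert("RGB")  # forces the full decode
except OSError as err:
    print(f"corrupt image detected: {err}")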
@@ -74,7 +79,41 @@ class ParsablePDFImageExtractor(ImageExtractor):
                 logger.warning(f"Invalid image encountered. Image metadata:\n{metadata}\n\n{traceback.format_exc()}")
                 return None
 
-        return filter(truth, starmap(validate, image_metadata_pairs))
+        def filter_small_images_on_scanned_pages(image_metadata_pairs) -> Iterable[ImageMetadataPair]:
+            """See RED-9746: Small images on scanned pages should be dropped, so they are not classified. This is a
+            heuristic to filter out images that are too small in relation to the page size if they are on a scanned page.
+
+            The ratio is computed as the geometric mean of the width and height of the image divided by the geometric mean
+            of the width and height of the page. If the ratio is below the threshold, the image is dropped.
+            """
+
+            def image_is_a_scanned_page(image_metadata_pair: ImageMetadataPair) -> bool:
+                tolerance = CONFIG.filters.is_scanned_page.tolerance
+                width_ratio = image_metadata_pair.metadata[Info.WIDTH] / image_metadata_pair.metadata[Info.PAGE_WIDTH]
+                height_ratio = (
+                    image_metadata_pair.metadata[Info.HEIGHT] / image_metadata_pair.metadata[Info.PAGE_HEIGHT]
+                )
+                return width_ratio >= 1 - tolerance and height_ratio >= 1 - tolerance
+
+            def image_fits_geometric_mean_ratio(image_metadata_pair: ImageMetadataPair) -> bool:
+                min_ratio = CONFIG.filters.image_to_page_quotient.min
+                metadatum = image_metadata_pair.metadata
+                image_gmean = gmean([metadatum[Info.WIDTH], metadatum[Info.HEIGHT]])
+                page_gmean = gmean([metadatum[Info.PAGE_WIDTH], metadatum[Info.PAGE_HEIGHT]])
+                ratio = image_gmean / page_gmean
+                return ratio >= min_ratio
+
+            pairs, pairs_copy = tee(image_metadata_pairs)
+
+            if any(map(image_is_a_scanned_page, pairs_copy)):
+                logger.debug("Scanned page detected, filtering out small images ...")
+                return filter(image_fits_geometric_mean_ratio, pairs)
+            else:
+                return pairs
+
+        image_metadata_pairs = filter_small_images_on_scanned_pages(image_metadata_pairs)
+
+        return filter(truth, starmap(validate_image_is_not_corrupt, image_metadata_pairs))
 
 
 def extract_pages(doc, page_range):
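For intuition, the geometric-mean ratio from the docstring, computed with illustrative numbers against the min = 0.05 configured above:

from scipy.stats import gmean

image_gmean = gmean([100, 50])    # ~70.7  (100 x 50 px image)
page_gmean = gmean([600, 800])    # ~692.8 (600 x 800 pt page)
ratio = image_gmean / page_gmean  # ~0.102
print(ratio >= 0.05)              # True -> this image would be kept

The itertools.tee call is needed because image_metadata_pairs is a one-shot iterator: the any(...) scan for a scanned page would otherwise consume the elements that the subsequent filter still has to see.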
@@ -99,7 +138,6 @@ def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page):
     return list(metadata)
 
 
-
 def get_metadata_for_images_on_page(page: fitz.Page):
     metadata = map(get_image_metadata, get_image_infos(page))
     metadata = add_page_metadata(page, metadata)
@@ -172,7 +210,6 @@ def _normalize_channels(array: np.ndarray):
 
 
 def get_image_metadata(image_info):
-
     xref, coords = itemgetter("xref", "bbox")(image_info)
     x1, y1, x2, y2 = map(rounder, coords)
 
@@ -228,7 +265,6 @@ def get_page_metadata(page):
 
 
 def has_alpha_channel(doc, xref):
-
     maybe_image = load_image_handle_from_xref(doc, xref)
     maybe_smask = maybe_image["smask"] if maybe_image else None
 
|
|||||||
@ -56,7 +56,8 @@ def annotate_image(doc, image_info):
|
|||||||
|
|
||||||
def init():
|
def init():
|
||||||
PDFNet.Initialize(
|
PDFNet.Initialize(
|
||||||
"Knecon AG(en.knecon.swiss):OEM:DDA-R::WL+:AMS(20211029):BECC974307DAB4F34B513BC9B2531B24496F6FCB83CD8AC574358A959730B622FABEF5C7"
|
# "Knecon AG(en.knecon.swiss):OEM:DDA-R::WL+:AMS(20211029):BECC974307DAB4F34B513BC9B2531B24496F6FCB83CD8AC574358A959730B622FABEF5C7"
|
||||||
|
"Knecon AG:OEM:DDA-R::WL+:AMS(20270129):EA5FDFB23C7F36B9C2AE606F4F0D9197DE1FB649119F9730B622FABEF5C7"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||