introduce ad-hoc filter for scanned pages
This commit is contained in:
parent
4eb7f3c40a
commit
1490d27308
@ -5,14 +5,16 @@ import traceback
|
||||
from functools import partial, lru_cache
|
||||
from itertools import chain, starmap, filterfalse
|
||||
from operator import itemgetter, truth
|
||||
from typing import List, Iterable, Iterator
|
||||
from typing import Iterable, Iterator
|
||||
|
||||
import fitz
|
||||
from PIL import Image
|
||||
from funcy import rcompose, merge, pluck, curry, compose
|
||||
from funcy import merge, pluck, curry, compose
|
||||
|
||||
from image_prediction.formatter.formatters.enum import EnumFormatter
|
||||
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
|
||||
from image_prediction.image_extractor.extractors.utils import get_image_infos, get_image_metadata, add_page_metadata
|
||||
from image_prediction.image_extractor.filters import filter_scanned_pages
|
||||
from image_prediction.info import Info
|
||||
from image_prediction.stitching.stitching import stitch_pairs
|
||||
from image_prediction.stitching.utils import validate_box_coords, validate_box_size
|
||||
@ -44,6 +46,9 @@ class ParsablePDFImageExtractor(ImageExtractor):
|
||||
|
||||
yield from image_metadata_pairs
|
||||
|
||||
# FIXME: Heuristic filtering shouldn't take place here,
|
||||
# consider introducing a preprocessing step before extracting images.
|
||||
@filter_scanned_pages
|
||||
def __process_images_on_page(self, page: fitz.fitz.Page):
|
||||
images = get_images_on_page(self.doc, page)
|
||||
metadata = get_metadata_for_images_on_page(self.doc, page)
|
||||
@ -104,34 +109,12 @@ def get_metadata_for_images_on_page(doc, page: fitz.Page):
|
||||
yield from metadata
|
||||
|
||||
|
||||
@lru_cache(maxsize=None)
def get_image_infos(page: fitz.Page) -> List[dict]:
    """Return PyMuPDF's image-info dicts for *page*, including xrefs.

    Cached so repeated lookups for the same page object are free.
    """
    infos = page.get_image_info(xrefs=True)
    return infos
|
||||
|
||||
|
||||
@lru_cache(maxsize=None)
def xref_to_image(doc, xref) -> "Image.Image | None":
    """Resolve an image *xref* in *doc* to a PIL image.

    Returns ``None`` when the xref cannot be extracted (falsy handle).
    Cached because the same xref may be referenced from several pages.

    NOTE(review): the original annotation was ``-> Image`` — that names the
    PIL *module*, not a type, and hides the ``None`` case; fixed here.
    """
    maybe_image = load_image_handle_from_xref(doc, xref)
    if not maybe_image:
        return None
    # "image" holds the raw encoded bytes of the extracted image.
    return Image.open(io.BytesIO(maybe_image["image"]))
|
||||
|
||||
|
||||
def get_image_metadata(image_info):
    """Build a box-geometry metadata dict from a PyMuPDF image-info entry.

    Coordinates come from ``image_info["bbox"]`` and are rounded to ints;
    width/height are derived as absolute coordinate differences.
    """
    x1, y1, x2, y2 = (rounder(coord) for coord in image_info["bbox"])

    return {
        Info.WIDTH: abs(x2 - x1),
        Info.HEIGHT: abs(y2 - y1),
        Info.X1: x1,
        Info.X2: x2,
        Info.Y1: y1,
        Info.Y2: y2,
    }
|
||||
|
||||
|
||||
def validate_coords_and_passthrough(metadata):
    """Run box-coordinate validation on each item, yielding items through."""
    for item in metadata:
        yield validate_box_coords(item)
|
||||
|
||||
@ -144,10 +127,6 @@ def validate_size_and_passthrough(metadata):
|
||||
yield from map(validate_box_size, metadata)
|
||||
|
||||
|
||||
def add_page_metadata(page, metadata):
    """Merge page-level metadata (size, index) into every metadata dict."""
    page_info = get_page_metadata(page)
    for item in metadata:
        yield merge(page_info, item)
|
||||
|
||||
|
||||
def add_alpha_channel_info(doc, page, metadata):
|
||||
|
||||
page_to_xrefs = compose(curry(pluck)("xref"), get_image_infos)
|
||||
@ -166,19 +145,6 @@ def load_image_handle_from_xref(doc, xref):
|
||||
return doc.extract_image(xref)
|
||||
|
||||
|
||||
# Round to the nearest whole number and coerce to int:
# rcompose chains left-to-right, i.e. rounder(x) == int(round(x)).
rounder = rcompose(round, int)
|
||||
|
||||
|
||||
def get_page_metadata(page):
    """Return page-level metadata: rounded mediabox dimensions and index."""
    width, height = (rounder(dim) for dim in page.mediabox_size)

    return {
        Info.PAGE_WIDTH: width,
        Info.PAGE_HEIGHT: height,
        Info.PAGE_IDX: page.number,
    }
|
||||
|
||||
|
||||
def has_alpha_channel(doc, xref):
|
||||
|
||||
maybe_image = load_image_handle_from_xref(doc, xref)
|
||||
|
||||
@ -21,11 +21,6 @@ class ResponseTransformer(Transformer):
|
||||
|
||||
|
||||
def build_image_info(data: dict) -> dict:
|
||||
def compute_geometric_quotient():
|
||||
page_area_sqrt = math.sqrt(abs(page_width * page_height))
|
||||
image_area_sqrt = math.sqrt(abs(x2 - x1) * abs(y2 - y1))
|
||||
return image_area_sqrt / page_area_sqrt
|
||||
|
||||
page_width, page_height, x1, x2, y1, y2, width, height, alpha = itemgetter(
|
||||
"page_width", "page_height", "x1", "x2", "y1", "y2", "width", "height", "alpha"
|
||||
)(data)
|
||||
@ -34,7 +29,7 @@ def build_image_info(data: dict) -> dict:
|
||||
label = classification["label"]
|
||||
representation = data["representation"]
|
||||
|
||||
geometric_quotient = round(compute_geometric_quotient(), 4)
|
||||
geometric_quotient = round(compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1), 4)
|
||||
|
||||
min_image_to_page_quotient_breached = bool(
|
||||
geometric_quotient < get_class_specific_min_image_to_page_quotient(label)
|
||||
@ -89,6 +84,12 @@ def build_image_info(data: dict) -> dict:
|
||||
return image_info
|
||||
|
||||
|
||||
def compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1):
    """Ratio of the image's edge length to the page's edge length.

    Computed as sqrt(image area) / sqrt(page area); abs() guards against
    negative products from unordered or negative coordinates.
    """
    image_width = abs(x2 - x1)
    image_height = abs(y2 - y1)
    page_edge = math.sqrt(abs(page_width * page_height))
    image_edge = math.sqrt(image_width * image_height)
    return image_edge / page_edge
|
||||
|
||||
|
||||
def get_class_specific_min_image_to_page_quotient(label, table=None):
|
||||
return get_class_specific_value(
|
||||
"REL_IMAGE_SIZE", label, "min", CONFIG.filters.image_to_page_quotient.min, table=table
|
||||
|
||||
@ -9,7 +9,8 @@ from funcy import first, rest
|
||||
|
||||
from image_prediction.extraction import extract_images_from_pdf
|
||||
from image_prediction.image_extractor.extractor import ImageMetadataPair
|
||||
from image_prediction.image_extractor.extractors.parsable import extract_pages, get_image_infos, has_alpha_channel
|
||||
from image_prediction.image_extractor.extractors.parsable import extract_pages, has_alpha_channel
|
||||
from image_prediction.image_extractor.extractors.utils import get_image_infos
|
||||
from image_prediction.info import Info
|
||||
from test.utils.comparison import metadata_equal, image_sets_equal
|
||||
from test.utils.generation.pdf import add_image, pdf_stream
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user