Clean up filter logic
- Logic adapted so that it can easily be removed from the extraction logic again if needed
This commit is contained in:
parent
1490d27308
commit
368c54a8be
@ -5,15 +5,14 @@ import traceback
|
||||
from functools import partial, lru_cache
|
||||
from itertools import chain, starmap, filterfalse
|
||||
from operator import itemgetter, truth
|
||||
from typing import Iterable, Iterator
|
||||
from typing import Iterable, Iterator, List
|
||||
|
||||
import fitz
|
||||
from PIL import Image
|
||||
from funcy import merge, pluck, curry, compose
|
||||
from funcy import merge, pluck, curry, compose, rcompose
|
||||
|
||||
from image_prediction.formatter.formatters.enum import EnumFormatter
|
||||
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
|
||||
from image_prediction.image_extractor.extractors.utils import get_image_infos, get_image_metadata, add_page_metadata
|
||||
from image_prediction.image_extractor.filters import filter_scanned_pages
|
||||
from image_prediction.info import Info
|
||||
from image_prediction.stitching.stitching import stitch_pairs
|
||||
@ -47,7 +46,8 @@ class ParsablePDFImageExtractor(ImageExtractor):
|
||||
yield from image_metadata_pairs
|
||||
|
||||
# FIXME: Heuristic filtering shouldn't take place here,
|
||||
# consider introducing a preprocessing step before extracting images.
|
||||
# consider introducing a preprocessing step before extracting images,
|
||||
# e.g. together with an image validation step for broken images.
|
||||
@filter_scanned_pages
|
||||
def __process_images_on_page(self, page: fitz.fitz.Page):
|
||||
images = get_images_on_page(self.doc, page)
|
||||
@ -109,12 +109,34 @@ def get_metadata_for_images_on_page(doc, page: fitz.Page):
|
||||
yield from metadata
|
||||
|
||||
|
||||
# NOTE(review): maxsize=None grows without bound and keys on the fitz.Page
# object itself, keeping every cached page alive for the process lifetime --
# confirm documents are small/short-lived enough for this to be safe.
@lru_cache(maxsize=None)
def get_image_infos(page: fitz.Page) -> List[dict]:
    """Return PyMuPDF's image-info dicts for *page*, including image xrefs."""
    return page.get_image_info(xrefs=True)
|
||||
|
||||
|
||||
# Cached per (doc, xref) pair; with maxsize=None the decoded PIL images stay
# in memory for the whole process. Keys on the fitz document object itself.
@lru_cache(maxsize=None)
def xref_to_image(doc, xref) -> Image:
    """Decode the image stored under *xref* in *doc* into a PIL image.

    Returns None when no image handle could be loaded for the xref
    (despite the bare ``Image`` return annotation).
    """
    maybe_image = load_image_handle_from_xref(doc, xref)
    return Image.open(io.BytesIO(maybe_image["image"])) if maybe_image else None
|
||||
|
||||
|
||||
def get_image_metadata(image_info):
    """Build the Info-keyed metadata dict for one image: rounded bbox corners
    plus the resulting box width and height."""
    left, top, right, bottom = (rounder(coord) for coord in image_info["bbox"])

    return {
        Info.WIDTH: abs(right - left),
        Info.HEIGHT: abs(bottom - top),
        Info.X1: left,
        Info.X2: right,
        Info.Y1: top,
        Info.Y2: bottom,
    }
|
||||
|
||||
|
||||
def validate_coords_and_passthrough(metadata):
    """Yield the result of validate_box_coords for every metadata item."""
    for item in metadata:
        yield validate_box_coords(item)
|
||||
|
||||
@ -127,6 +149,10 @@ def validate_size_and_passthrough(metadata):
|
||||
yield from map(validate_box_size, metadata)
|
||||
|
||||
|
||||
def add_page_metadata(page, metadata):
    """Merge the page-level metadata into each per-image metadata dict."""
    page_meta = get_page_metadata(page)
    for item in metadata:
        yield merge(page_meta, item)
|
||||
|
||||
|
||||
def add_alpha_channel_info(doc, page, metadata):
|
||||
|
||||
page_to_xrefs = compose(curry(pluck)("xref"), get_image_infos)
|
||||
@ -145,6 +171,19 @@ def load_image_handle_from_xref(doc, xref):
|
||||
return doc.extract_image(xref)
|
||||
|
||||
|
||||
# funcy.rcompose chains left-to-right: rounder(x) == int(round(x)),
# i.e. round to the nearest whole number first, then cast to a plain int.
rounder = rcompose(round, int)
|
||||
|
||||
|
||||
def get_page_metadata(page):
    """Return page-level metadata: rounded media-box dimensions and the
    zero-based page index."""
    width, height = (rounder(dim) for dim in page.mediabox_size)

    return {
        Info.PAGE_WIDTH: width,
        Info.PAGE_HEIGHT: height,
        Info.PAGE_IDX: page.number,
    }
|
||||
|
||||
|
||||
def has_alpha_channel(doc, xref):
|
||||
|
||||
maybe_image = load_image_handle_from_xref(doc, xref)
|
||||
|
||||
74
image_prediction/image_extractor/filters.py
Normal file
74
image_prediction/image_extractor/filters.py
Normal file
@ -0,0 +1,74 @@
|
||||
from _operator import itemgetter
|
||||
from typing import Callable, List
|
||||
|
||||
import fitz
|
||||
from funcy import first, compose, lmap
|
||||
|
||||
from image_prediction.config import CONFIG
|
||||
from image_prediction.image_extractor.extractor import ImageExtractor
|
||||
from image_prediction.transformer.transformers.response import compute_geometric_quotient
|
||||
from image_prediction.utils import get_logger
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
|
||||
def filter_scanned_pages(page_processor: Callable):
    """Decorator for the __process_images_on_page method of the ParsablePDFImageExtractor.

    This makes it so that scanned pages won't be processed (and are thus
    ultimately removed from the pipeline).

    A scanned page is defined by
    - having only one image on a page
    - that image having an image_to_page ratio greater than the allowed max
      value found in CONFIG.filters.image_to_page_quotient.max
    """

    def inner(self: ImageExtractor, page: fitz.fitz.Page):
        metadata = get_metadata(page)
        if is_a_scanned_page(metadata):
            logger.debug(f"Page {page.number} won't be processed since it is a scanned page.")
            # Yield nothing, but keep inner a generator like the wrapped method.
            yield from []
        else:
            yield from page_processor(self, page)

    # Logged once per decorated method, at decoration (import) time.
    # (Plain string -- the previous f-string had no placeholders.)
    logger.info("Extracting pages with filtering scanned pages...")
    return inner
|
||||
|
||||
|
||||
def is_a_scanned_page(metadata: List[dict]):
    """A page counts as scanned when it holds exactly one image and that
    image breaches the configured image-to-page quotient."""
    if not _contains_only_one_image(metadata):
        return False
    return _breaches_image_to_page_quotient(first(metadata))
|
||||
|
||||
|
||||
def _breaches_image_to_page_quotient(metadata):
    """Return True when the image covers more of the page than allowed.

    Compares the geometric image-to-page quotient (rounded to 4 decimals)
    against CONFIG.filters.image_to_page_quotient.max.
    """
    # "width" and "height" were previously unpacked here as well but never used.
    page_width, page_height, x1, x2, y1, y2 = itemgetter(
        "page_width", "page_height", "x1", "x2", "y1", "y2"
    )(metadata)
    geometric_quotient = round(compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1), 4)
    # A comparison already yields a bool; the previous bool() cast was redundant.
    return geometric_quotient > CONFIG.filters.image_to_page_quotient.max
|
||||
|
||||
|
||||
def _contains_only_one_image(metadata):
|
||||
return True if len(metadata) == 1 else False
|
||||
|
||||
|
||||
def get_metadata(page: "fitz.fitz.Page"):
    """Collect lightweight per-image metadata for every image on *page*.

    Returns a list of plain dicts (page size/number plus the rounded bbox
    corners and the resulting box width/height). Kept as plain string keys,
    independent from the extractor's Info-keyed metadata, so the filter
    logic stays easy to remove again.
    """
    page_number = page.number
    # NOTE: page dimensions are passed through unrounded, unlike the bbox.
    page_width, page_height = page.mediabox_size

    def to_entry(image_info):
        # int(round(v)): round to nearest, then cast to int. funcy's
        # compose(round, int) applied int() FIRST, silently truncating the
        # coordinates; this now matches `rounder = rcompose(round, int)`
        # used by the extractor utils.
        x1, y1, x2, y2 = (int(round(coord)) for coord in image_info["bbox"])
        return {
            "page_width": page_width,
            "page_height": page_height,
            "page_number": page_number,
            "width": abs(x2 - x1),
            "height": abs(y2 - y1),
            "x1": x1,
            "x2": x2,
            "y1": y1,
            "y2": y2,
        }

    return [to_entry(info) for info in page.get_image_info(xrefs=True)]
|
||||
@ -9,8 +9,7 @@ from funcy import first, rest
|
||||
|
||||
from image_prediction.extraction import extract_images_from_pdf
|
||||
from image_prediction.image_extractor.extractor import ImageMetadataPair
|
||||
from image_prediction.image_extractor.extractors.parsable import extract_pages, has_alpha_channel
|
||||
from image_prediction.image_extractor.extractors.utils import get_image_infos
|
||||
from image_prediction.image_extractor.extractors.parsable import extract_pages, has_alpha_channel, get_image_infos
|
||||
from image_prediction.info import Info
|
||||
from test.utils.comparison import metadata_equal, image_sets_equal
|
||||
from test.utils.generation.pdf import add_image, pdf_stream
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user