- reduce code duplication by adapting functions of the module
- use the module's enums for image metadata
- improve readability of the scanned page detection heuristic
This commit is contained in:
Julius Unverfehrt 2023-02-01 12:43:49 +01:00
parent 876260f403
commit 5bd5e0cf2b

View File

@ -1,11 +1,12 @@
from _operator import itemgetter
from typing import Callable, List
from typing import Callable
import fitz
from funcy import first, compose, lmap
from funcy import first, lmap, second
from image_prediction.config import CONFIG
from image_prediction.image_extractor.extractor import ImageExtractor
from image_prediction.info import Info
from image_prediction.transformer.transformers.response import compute_geometric_quotient
from image_prediction.utils import get_logger
@ -21,10 +22,8 @@ def filter_scanned_pages(page_processor: Callable):
defined in CONFIG.filters.image_to_page_quotient.max"""
def inner(extractor: ImageExtractor, page: fitz.fitz.Page):
metadata = get_metadata(page)
if is_a_scanned_page(metadata):
logger.debug(f"Page {page.number} won't be processed since it is a scanned page.")
yield from []
if is_a_scanned_page(page):
logger.info(f"Page {page.number} is a scanned page; skipping image extraction.")
else:
yield from page_processor(extractor, page)
@ -32,43 +31,29 @@ def filter_scanned_pages(page_processor: Callable):
return inner
def is_a_scanned_page(metadata: List[dict]):
    """Decide whether a page's image metadata describes a scanned page.

    The page counts as scanned when `metadata` holds exactly one entry and
    that entry breaches the image-to-page quotient check.
    """
    # Guard clause: anything other than a single image is never a scan.
    if not _contains_only_one_image(metadata):
        return False
    sole_image = first(metadata)
    return _breaches_image_to_page_quotient(sole_image)
def is_a_scanned_page(page: fitz.fitz.Page):
    """Return True if `page` looks like a scanned page, otherwise False.

    A page is treated as scanned when it contains exactly one image and that
    image breaches the image-to-page quotient check.

    :param page: the page whose images are inspected.
    :return: bool
    """
    # Materialise once: if the helper hands back a one-shot iterator, reading
    # the first element and then asking for the "second" would silently skip
    # an element and misjudge pages that carry two images.
    metadata = list(get_metadata_for_images_on_page(page))
    if len(metadata) != 1:
        return False
    return __breaches_image_to_page_quotient(metadata[0])
def __breaches_image_to_page_quotient(metadatum):
    """Return True if the image described by `metadatum` exceeds the maximum
    image-to-page quotient configured in CONFIG.filters.image_to_page_quotient.max.

    :param metadatum: mapping keyed by `Info` members; the page size and the
        image bounding-box coordinates are read from it.
    :return: bool
    """
    page_width, page_height, x1, x2, y1, y2 = itemgetter(
        Info.PAGE_WIDTH, Info.PAGE_HEIGHT, Info.X1, Info.X2, Info.Y1, Info.Y2
    )(metadatum)
    # Argument order (x2 before x1, y2 before y1) follows the existing call to
    # compute_geometric_quotient and is kept as-is.
    geometric_quotient = compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1)
    return bool(geometric_quotient > CONFIG.filters.image_to_page_quotient.max)
def _contains_only_one_image(metadata):
    """Return True if `metadata` has exactly one entry, otherwise False.

    :param metadata: a sized collection of image metadata entries.
    :return: bool
    """
    return len(metadata) == 1
def get_metadata(page: fitz.fitz.Page):
def get_image_metadata(image_info):
x1, y1, x2, y2 = map(compose(round, int), image_info["bbox"])
width = abs(x2 - x1)
height = abs(y2 - y1)
return {
"page_width": page_width,
"page_height": page_height,
"page_number": page_number,
"width": width,
"height": height,
"x1": x1,
"x2": x2,
"y1": y1,
"y2": y2,
}
def get_metadata_for_images_on_page(page: fitz.fitz.Page):
    """Collect metadata for the images on `page`, skipping validation.

    Effectively the same as
    image_prediction.image_extractor.extractors.parsable.get_metadata_for_images_on_page,
    but without the validation steps: they are not required here and take a
    significant amount of time.

    :param page: the page to read image info from.
    :return: the per-image metadata with page metadata added, as produced by
        `add_page_metadata`.
    """
    # Temporary solution: import locally to avoid circular imports without
    # changing the original code.
    from image_prediction.image_extractor.extractors.parsable import get_image_metadata, add_page_metadata

    image_infos = page.get_image_info(xrefs=True)
    metadata = lmap(get_image_metadata, image_infos)
    metadata = add_page_metadata(page, metadata)
    return metadata