refactor
- reduce code duplication by adapting functions of the module - use the module's enums for image metadata - improve readability of the scanned page detection heuristic
This commit is contained in:
parent
876260f403
commit
5bd5e0cf2b
@ -1,11 +1,12 @@
|
||||
from _operator import itemgetter
|
||||
from typing import Callable, List
|
||||
from typing import Callable
|
||||
|
||||
import fitz
|
||||
from funcy import first, compose, lmap
|
||||
from funcy import first, lmap, second
|
||||
|
||||
from image_prediction.config import CONFIG
|
||||
from image_prediction.image_extractor.extractor import ImageExtractor
|
||||
from image_prediction.info import Info
|
||||
from image_prediction.transformer.transformers.response import compute_geometric_quotient
|
||||
from image_prediction.utils import get_logger
|
||||
|
||||
@ -21,10 +22,8 @@ def filter_scanned_pages(page_processor: Callable):
|
||||
defined in CONFIG.filters.image_to_page_quotient.max"""
|
||||
|
||||
def inner(extractor: ImageExtractor, page: fitz.fitz.Page):
|
||||
metadata = get_metadata(page)
|
||||
if is_a_scanned_page(metadata):
|
||||
logger.debug(f"Page {page.number} won't be processed since it is a scanned page.")
|
||||
yield from []
|
||||
if is_a_scanned_page(page):
|
||||
logger.info(f"Page {page.number} is a scanned page; skipping image extraction.")
|
||||
else:
|
||||
yield from page_processor(extractor, page)
|
||||
|
||||
@ -32,43 +31,29 @@ def filter_scanned_pages(page_processor: Callable):
|
||||
return inner
|
||||
|
||||
|
||||
def is_a_scanned_page(metadata: List[dict]):
|
||||
return _contains_only_one_image(metadata) and _breaches_image_to_page_quotient(first(metadata))
|
||||
def is_a_scanned_page(page: fitz.fitz.Page):
|
||||
metadata = get_metadata_for_images_on_page(page)
|
||||
return first(map(__breaches_image_to_page_quotient, metadata)) and not second(metadata)
|
||||
|
||||
|
||||
def _breaches_image_to_page_quotient(metadata):
|
||||
def __breaches_image_to_page_quotient(metadatum):
|
||||
page_width, page_height, x1, x2, y1, y2, width, height = itemgetter(
|
||||
"page_width", "page_height", "x1", "x2", "y1", "y2", "width", "height"
|
||||
)(metadata)
|
||||
geometric_quotient = round(compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1), 4)
|
||||
Info.PAGE_WIDTH, Info.PAGE_HEIGHT, Info.X1, Info.X2, Info.Y1, Info.Y2, Info.WIDTH, Info.HEIGHT
|
||||
)(metadatum)
|
||||
geometric_quotient = compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1)
|
||||
quotient_breached = bool(geometric_quotient > CONFIG.filters.image_to_page_quotient.max)
|
||||
return quotient_breached
|
||||
|
||||
|
||||
def _contains_only_one_image(metadata):
|
||||
return True if len(metadata) == 1 else False
|
||||
|
||||
|
||||
def get_metadata(page: fitz.fitz.Page):
|
||||
def get_image_metadata(image_info):
|
||||
x1, y1, x2, y2 = map(compose(round, int), image_info["bbox"])
|
||||
width = abs(x2 - x1)
|
||||
height = abs(y2 - y1)
|
||||
return {
|
||||
"page_width": page_width,
|
||||
"page_height": page_height,
|
||||
"page_number": page_number,
|
||||
"width": width,
|
||||
"height": height,
|
||||
"x1": x1,
|
||||
"x2": x2,
|
||||
"y1": y1,
|
||||
"y2": y2,
|
||||
}
|
||||
def get_metadata_for_images_on_page(page: fitz.fitz.Page):
|
||||
"""Effectively the same as image_prediction.image_extractor.extractors.parsable.get_metadata_for_images_on_page,
|
||||
however without the validation steps since not required here and take a significant amount of time.
|
||||
"""
|
||||
# temporary solution to avoid circular imports without changing the original code
|
||||
from image_prediction.image_extractor.extractors.parsable import get_image_metadata, add_page_metadata
|
||||
|
||||
image_infos = page.get_image_info(xrefs=True)
|
||||
page_number = page.number
|
||||
page_width, page_height = page.mediabox_size
|
||||
|
||||
metadata = lmap(get_image_metadata, image_infos)
|
||||
metadata = add_page_metadata(page, metadata)
|
||||
|
||||
return metadata
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user