refactor

- reduce code duplication by adapting functions of the module - use the module's enums for image metadata - improve readability of the scanned page detection heuristic
2023-02-01 12:43:49 +01:00 · 2023-02-01 12:43:49 +01:00 · 5bd5e0cf2b
commit 5bd5e0cf2b
parent 876260f403
1 changed files with 20 additions and 35 deletions
--- a/image_prediction/image_extractor/filters.py
+++ b/image_prediction/image_extractor/filters.py
@ -1,11 +1,12 @@
 from _operator import itemgetter
-from typing import Callable, List
+from typing import Callable

 import fitz
-from funcy import first, compose, lmap
+from funcy import first, lmap, second

 from image_prediction.config import CONFIG
 from image_prediction.image_extractor.extractor import ImageExtractor
+from image_prediction.info import Info
 from image_prediction.transformer.transformers.response import compute_geometric_quotient
 from image_prediction.utils import get_logger

@ -21,10 +22,8 @@ def filter_scanned_pages(page_processor: Callable):
          defined in CONFIG.filters.image_to_page_quotient.max"""

    def inner(extractor: ImageExtractor, page: fitz.fitz.Page):
-        metadata = get_metadata(page)
-        if is_a_scanned_page(metadata):
-            logger.debug(f"Page {page.number} won't be processed since it is a scanned page.")
-            yield from []
+        if is_a_scanned_page(page):
+            logger.info(f"Page {page.number} is a scanned page; skipping image extraction.")
        else:
            yield from page_processor(extractor, page)

@ -32,43 +31,29 @@ def filter_scanned_pages(page_processor: Callable):
    return inner


-def is_a_scanned_page(metadata: List[dict]):
-    return _contains_only_one_image(metadata) and _breaches_image_to_page_quotient(first(metadata))
+def is_a_scanned_page(page: fitz.fitz.Page):
+    metadata = get_metadata_for_images_on_page(page)
+    return first(map(__breaches_image_to_page_quotient, metadata)) and not second(metadata)


-def _breaches_image_to_page_quotient(metadata):
+def __breaches_image_to_page_quotient(metadatum):
    page_width, page_height, x1, x2, y1, y2, width, height = itemgetter(
-        "page_width", "page_height", "x1", "x2", "y1", "y2", "width", "height"
-    )(metadata)
-    geometric_quotient = round(compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1), 4)
+        Info.PAGE_WIDTH, Info.PAGE_HEIGHT, Info.X1, Info.X2, Info.Y1, Info.Y2, Info.WIDTH, Info.HEIGHT
+    )(metadatum)
+    geometric_quotient = compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1)
    quotient_breached = bool(geometric_quotient > CONFIG.filters.image_to_page_quotient.max)
    return quotient_breached


-def _contains_only_one_image(metadata):
-    return True if len(metadata) == 1 else False
-
-
-def get_metadata(page: fitz.fitz.Page):
-    def get_image_metadata(image_info):
-        x1, y1, x2, y2 = map(compose(round, int), image_info["bbox"])
-        width = abs(x2 - x1)
-        height = abs(y2 - y1)
-        return {
-            "page_width": page_width,
-            "page_height": page_height,
-            "page_number": page_number,
-            "width": width,
-            "height": height,
-            "x1": x1,
-            "x2": x2,
-            "y1": y1,
-            "y2": y2,
-        }
+def get_metadata_for_images_on_page(page: fitz.fitz.Page):
+    """Effectively the same as image_prediction.image_extractor.extractors.parsable.get_metadata_for_images_on_page,
+    however without the validation steps since not required here and take a significant amount of time.
+    """
+    # temporary solution to avoid circular imports without changing the original code
+    from image_prediction.image_extractor.extractors.parsable import get_image_metadata, add_page_metadata

    image_infos = page.get_image_info(xrefs=True)
-    page_number = page.number
-    page_width, page_height = page.mediabox_size
-
    metadata = lmap(get_image_metadata, image_infos)
+    metadata = add_page_metadata(page, metadata)
+
    return metadata