Add image extraction for scanned PDFs (WIP)

2022-08-03 13:15:27 +02:00 · 2022-08-03 13:15:27 +02:00 · 4692607834
commit 4692607834
parent 1b3b11b6f9
1 changed file with 15 additions and 0 deletions
--- a/image_prediction/pipeline.py
+++ b/image_prediction/pipeline.py
@@ -1,6 +1,7 @@
 import os
 from functools import partial
 from itertools import chain, tee
+from typing import Iterable

 from funcy import rcompose, first, compose, second, chunks, identity, rpartial
 from tqdm import tqdm
@@ -15,6 +16,7 @@ from image_prediction.default_objects import (
 )
 from image_prediction.locations import MLRUNS_DIR
 from image_prediction.utils.generic import lift, starlift
+from pdf2img.extraction import extract_images_per_page

 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

@@ -61,6 +63,12 @@ class Pipeline:
            join,  # ... the streams by zipping
            reformat,  # ... the items
        )
+        self.pipe_for_scanned_pdf = rcompose(
+            extract_from_scanned,
+            pairwise_apply(classify, represent, identity),  # ... apply functions to the streams pairwise
+            join,  # ... the streams by zipping
+            reformat,  # ... the items
+        )

    def __call__(self, pdf: bytes, page_range: range = None):
        yield from tqdm(
@@ -69,3 +77,10 @@ class Pipeline:
            unit=" images",
            disable=not self.verbose,
        )
+
+
+def extract_from_scanned(pdf: bytes, bbox_info_per_page: Iterable[dict]):
+    images = extract_images_per_page(pdf, bbox_info_per_page)
+    for page_images, page_info in zip(images, bbox_info_per_page):
+        metadata_per_image = page_info["bboxes"]
+        metadata_per_image =