diff --git a/image_prediction/pipeline.py b/image_prediction/pipeline.py index 6d29ac7..019cf73 100644 --- a/image_prediction/pipeline.py +++ b/image_prediction/pipeline.py @@ -1,6 +1,7 @@ import os from functools import partial from itertools import chain, tee +from typing import Iterable from funcy import rcompose, first, compose, second, chunks, identity, rpartial from tqdm import tqdm @@ -15,6 +16,7 @@ from image_prediction.default_objects import ( ) from image_prediction.locations import MLRUNS_DIR from image_prediction.utils.generic import lift, starlift +from pdf2img.extraction import extract_images_per_page os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" @@ -61,6 +63,12 @@ class Pipeline: join, # ... the streams by zipping reformat, # ... the items ) + self.pipe_for_scanned_pdf = rcompose( + extract_from_scanned, + pairwise_apply(classify, represent, identity), # ... apply functions to the streams pairwise + join, # ... the streams by zipping + reformat, # ... the items + ) def __call__(self, pdf: bytes, page_range: range = None): yield from tqdm( @@ -69,3 +77,10 @@ class Pipeline: unit=" images", disable=not self.verbose, ) + + +def extract_from_scanned(pdf: bytes, bbox_info_per_page: Iterable[dict]): + images = extract_images_per_page(pdf, bbox_info_per_page) + for page_images, page_info in zip(images, bbox_info_per_page): + metadata_per_image = page_info["bboxes"] + metadata_per_image = \ No newline at end of file