Add image extraction for scanned PDFs (WIP)

This commit is contained in:
Julius Unverfehrt 2022-08-03 13:15:27 +02:00
parent 1b3b11b6f9
commit 4692607834

View File

@ -1,6 +1,7 @@
import os
from functools import partial
from itertools import chain, tee
from typing import Iterable
from funcy import rcompose, first, compose, second, chunks, identity, rpartial
from tqdm import tqdm
@ -15,6 +16,7 @@ from image_prediction.default_objects import (
)
from image_prediction.locations import MLRUNS_DIR
from image_prediction.utils.generic import lift, starlift
from pdf2img.extraction import extract_images_per_page
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
@ -61,6 +63,12 @@ class Pipeline:
join, # ... the streams by zipping
reformat, # ... the items
)
# Pipeline variant for scanned PDFs. `rcompose` chains the stages in the order
# listed, so `extract_from_scanned` runs first and `reformat` runs last.
# NOTE(review): `extract_from_scanned` is declared with two parameters
# (pdf, bbox_info_per_page) -- confirm the caller of this pipe supplies both.
self.pipe_for_scanned_pdf = rcompose(
extract_from_scanned,
pairwise_apply(classify, represent, identity), # ... apply functions to the streams pairwise
join, # ... the streams by zipping
reformat, # ... the items
)
def __call__(self, pdf: bytes, page_range: range = None):
yield from tqdm(
@ -69,3 +77,10 @@ class Pipeline:
unit=" images",
disable=not self.verbose,
)
def extract_from_scanned(pdf: bytes, bbox_info_per_page: Iterable[dict]):
images = extract_images_per_page(pdf, bbox_info_per_page)
for page_images, page_info in zip(images, bbox_info_per_page):
metadata_per_image = page_info["bboxes"]
metadata_per_image =