Add image extraction for scanned PDFs (WIP)
This commit is contained in:
parent
1b3b11b6f9
commit
4692607834
@ -1,6 +1,7 @@
|
||||
import os
|
||||
from functools import partial
|
||||
from itertools import chain, tee
|
||||
from typing import Iterable
|
||||
|
||||
from funcy import rcompose, first, compose, second, chunks, identity, rpartial
|
||||
from tqdm import tqdm
|
||||
@ -15,6 +16,7 @@ from image_prediction.default_objects import (
|
||||
)
|
||||
from image_prediction.locations import MLRUNS_DIR
|
||||
from image_prediction.utils.generic import lift, starlift
|
||||
from pdf2img.extraction import extract_images_per_page
|
||||
|
||||
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
|
||||
|
||||
@ -61,6 +63,12 @@ class Pipeline:
|
||||
join, # ... the streams by zipping
|
||||
reformat, # ... the items
|
||||
)
|
||||
self.pipe_for_scanned_pdf = rcompose(
|
||||
extract_from_scanned,
|
||||
pairwise_apply(classify, represent, identity), # ... apply functions to the streams pairwise
|
||||
join, # ... the streams by zipping
|
||||
reformat, # ... the items
|
||||
)
|
||||
|
||||
def __call__(self, pdf: bytes, page_range: range = None):
|
||||
yield from tqdm(
|
||||
@ -69,3 +77,10 @@ class Pipeline:
|
||||
unit=" images",
|
||||
disable=not self.verbose,
|
||||
)
|
||||
|
||||
|
||||
def extract_from_scanned(pdf: bytes, bbox_info_per_page: Iterable[dict]):
|
||||
images = extract_images_per_page(pdf, bbox_info_per_page)
|
||||
for page_images, page_info in zip(images, bbox_info_per_page):
|
||||
metadata_per_image = page_info["bboxes"]
|
||||
metadata_per_image =
|
||||
Loading…
x
Reference in New Issue
Block a user