doc.extract_image(xref) can return None; filter out images that come back as None
This commit is contained in:
parent
f3e2b2335f
commit
8f61c4cba2
@ -1,10 +1,11 @@
|
|||||||
import io
|
import io
|
||||||
from itertools import chain, starmap
|
from itertools import chain, starmap
|
||||||
from operator import itemgetter
|
from operator import itemgetter, __and__, truth
|
||||||
|
|
||||||
import fitz
|
import fitz
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from funcy import rcompose
|
from funcy import rcompose, compose, curry
|
||||||
|
from iteration_utilities import starfilter
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
|
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
|
||||||
@ -18,7 +19,11 @@ class ParsablePDFImageExtractor(ImageExtractor):
|
|||||||
|
|
||||||
def __process_images_on_page(self, page: fitz.fitz.Page):
|
def __process_images_on_page(self, page: fitz.fitz.Page):
|
||||||
def load_image_from_xref(xref):
|
def load_image_from_xref(xref):
|
||||||
return Image.open(io.BytesIO(self.doc.extract_image(xref)["image"]))
|
maybe_image = self.doc.extract_image(xref)
|
||||||
|
if maybe_image:
|
||||||
|
return Image.open(io.BytesIO(maybe_image["image"]))
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
def format_metadata(image_info):
|
def format_metadata(image_info):
|
||||||
x1, y1, x2, y2 = map(rounder, image_info["bbox"])
|
x1, y1, x2, y2 = map(rounder, image_info["bbox"])
|
||||||
@ -44,7 +49,7 @@ class ParsablePDFImageExtractor(ImageExtractor):
|
|||||||
images = map(load_image_from_xref, xrefs)
|
images = map(load_image_from_xref, xrefs)
|
||||||
metadata = map(format_metadata, image_infos)
|
metadata = map(format_metadata, image_infos)
|
||||||
|
|
||||||
return starmap(ImageMetadataPair, zip(images, metadata))
|
return starmap(ImageMetadataPair, filter(compose(all, curry(map)(truth)), zip(images, metadata)))
|
||||||
|
|
||||||
def extract(self, pdf: bytes):
|
def extract(self, pdf: bytes):
|
||||||
self.doc = fitz.Document(stream=pdf)
|
self.doc = fitz.Document(stream=pdf)
|
||||||
|
|||||||
@ -1,35 +1,49 @@
|
|||||||
import argparse
|
import argparse
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
from glob import glob
|
||||||
|
|
||||||
from image_prediction.pipeline import load_pipeline
|
from image_prediction.pipeline import load_pipeline
|
||||||
from image_prediction.transformer.transformers.coordinate.pdfnet import PDFNetCoordinateTransformer
|
|
||||||
from image_prediction.utils.pdf_annotation import annotate_pdf
|
from image_prediction.utils.pdf_annotation import annotate_pdf
|
||||||
|
|
||||||
|
|
||||||
def parse_args():
|
def parse_args():
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument("pdf")
|
|
||||||
|
parser.add_argument("input", help="pdf file or directory")
|
||||||
|
|
||||||
|
parser.add_argument("-print", "-p", help="print output to terminal", action="store_true", default=False)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
return args
|
return args
|
||||||
|
|
||||||
|
|
||||||
def main(args):
|
def process_pdf(pipeline, pdf_path):
|
||||||
pipeline = load_pipeline(verbose=True)
|
|
||||||
|
|
||||||
pdf_path = args.pdf
|
|
||||||
|
|
||||||
with open(pdf_path, "rb") as f:
|
with open(pdf_path, "rb") as f:
|
||||||
predictions = list(pipeline(f.read()))
|
predictions = list(pipeline(f.read()))
|
||||||
|
|
||||||
print(json.dumps(predictions, indent=2))
|
|
||||||
|
|
||||||
annotate_pdf(
|
annotate_pdf(
|
||||||
pdf_path, predictions, os.path.join("/tmp", os.path.basename(pdf_path.replace(".pdf", "_annotated.pdf")))
|
pdf_path, predictions, os.path.join("/tmp", os.path.basename(pdf_path.replace(".pdf", "_annotated.pdf")))
|
||||||
)
|
)
|
||||||
|
|
||||||
|
return predictions
|
||||||
|
|
||||||
|
|
||||||
|
def main(args):
|
||||||
|
pipeline = load_pipeline(verbose=True)
|
||||||
|
|
||||||
|
if os.path.isfile(args.input):
|
||||||
|
pdf_paths = [args.input]
|
||||||
|
else:
|
||||||
|
pdf_paths = glob(os.path.join(args.input, "*.pdf"))
|
||||||
|
|
||||||
|
for pdf_path in pdf_paths:
|
||||||
|
predictions = process_pdf(pipeline, pdf_path)
|
||||||
|
if args.print:
|
||||||
|
print(pdf_path)
|
||||||
|
print(json.dumps(predictions, indent=2))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
args = parse_args()
|
args = parse_args()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user