From 643ab99bd3a5c515b922e068eaacfd9b225d0c73 Mon Sep 17 00:00:00 2001
From: Matthias Bisping
Date: Mon, 28 Mar 2022 11:27:05 +0200
Subject: [PATCH] added parsable pdf image extractor

---
 .../image_extractor/extractors/parsable.py    | 54 +++++++++++++++++++
 requirements.txt                              |  1 +
 2 files changed, 55 insertions(+)
 create mode 100644 image_prediction/image_extractor/extractors/parsable.py

diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py
new file mode 100644
index 0000000..e58a0b9
--- /dev/null
+++ b/image_prediction/image_extractor/extractors/parsable.py
@@ -0,0 +1,54 @@
+from itertools import chain
+from operator import itemgetter
+
+import fitz
+
+from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
+
+
+class ParsablePDFImageExtractor(ImageExtractor):
+    """Extract the images embedded in a parsable PDF together with per-image metadata."""
+
+    def __init__(self):
+        # Document and page most recently handled; both stay None until extract() runs.
+        self.doc: fitz.fitz.Document = None
+        self.page: fitz.fitz.Page = None
+
+    def __build_metadata(self, page: fitz.fitz.Page, image_info: dict) -> dict:
+        """Return image_info extended with the size and index of the page showing the image.
+
+        The page is passed in explicitly instead of being read from self.page so that the
+        result cannot pick up a page that was assigned to the instance later on.
+        """
+        page_width, page_height = page.mediabox_size
+        return {**image_info, "page_width": page_width, "page_height": page_height, "page_idx": page.number}
+
+    def __process_images_on_page(self, page: fitz.fitz.Page):
+        """Return one ImageMetadataPair per image listed by page.get_images()."""
+        self.page = page
+
+        # get_images() yields one tuple per image; the xref is the first item of each tuple.
+        xrefs = list(map(itemgetter(0), page.get_images(full=True)))
+        # get_image_info() describes every image shown on the page at once, so index its
+        # entries by xref. If an image is placed several times, the last placement is kept.
+        info_by_xref = {info["xref"]: info for info in page.get_image_info(xrefs=True)}
+
+        images = (self.doc.extract_image(xref)["image"] for xref in xrefs)
+        # Images that are referenced but never drawn have no info entry; fall back to {}.
+        metadata = (self.__build_metadata(page, info_by_xref.get(xref, {})) for xref in xrefs)
+        # NOTE(review): assumes ImageMetadataPair takes (image, metadata) as two arguments.
+        return list(map(ImageMetadataPair, images, metadata))
+
+    def extract(self, pdf: bytes):
+        """Lazily yield an ImageMetadataPair for every image in the given PDF.
+
+        Args:
+            pdf: Raw bytes of the PDF document.
+
+        Returns:
+            An iterator over the pairs of all pages in page order. Pages are processed as
+            the iterator is consumed, using the document stored on the instance.
+        """
+        self.doc = fitz.Document(stream=pdf, filetype="pdf")
+        # from_iterable avoids unpacking, which would process every page up front.
+        return chain.from_iterable(map(self.__process_images_on_page, self.doc))
diff --git a/requirements.txt b/requirements.txt
index a0a88e6..3f49c2d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,3 +21,4 @@ PyYAML~=5.4.1
 scikit_learn~=0.24.2
 pytest~=7.1.0
 funcy==1.17
+PyMuPDF==1.19.6