2022-03-30 00:53:34 +02:00

62 lines
1.9 KiB
Python

import io
from itertools import chain, starmap
from operator import itemgetter
import fitz
from PIL import Image
from funcy import rcompose
from tqdm import tqdm
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
from image_prediction.info import Info
from image_prediction.utils import get_logger
# Module-level logger, obtained from the project's logging helper.
logger = get_logger()
class ParsablePDFImageExtractor(ImageExtractor):
    """Extract embedded images and their placement metadata from a PDF.

    The PDF is opened with PyMuPDF (``fitz``); every image that can be
    resolved to a PDF object is decoded with PIL and paired with a metadata
    dictionary keyed by ``Info`` members.

    All work is lazy: nothing is decoded until the iterator returned by
    :meth:`extract` is consumed.
    """

    def __init__(self, verbose=False):
        """Create an extractor.

        Args:
            verbose: When true, a tqdm progress bar over the pages is shown
                during extraction; otherwise the bar is disabled.
        """
        # Set by extract(); stays None until the first document is opened.
        self.doc: "fitz.Document | None" = None
        self.verbose = verbose

    def __process_images_on_page(self, page: fitz.Page):
        """Return a lazy iterator of ``ImageMetadataPair`` for one page.

        Args:
            page: A page of the document currently stored in ``self.doc``.

        Returns:
            An iterator yielding one ``ImageMetadataPair(image, metadata)``
            per extractable image on the page. Images are loaded from
            ``self.doc`` only when the iterator is advanced.
        """
        rounder = rcompose(round, int)
        page_width, page_height = map(rounder, page.mediabox_size)

        def load_image_from_xref(xref):
            # Reads self.doc at call time, i.e. when the iterator is consumed.
            return Image.open(io.BytesIO(self.doc.extract_image(xref)["image"]))

        def format_metadata(image_info):
            x1, y1, x2, y2 = map(rounder, image_info["bbox"])
            width, height = itemgetter("width", "height")(image_info)
            return {
                Info.PAGE_WIDTH: page_width,
                Info.PAGE_HEIGHT: page_height,
                Info.PAGE_IDX: page.number,
                Info.WIDTH: width,
                Info.HEIGHT: height,
                Info.X1: x1,
                Info.X2: x2,
                Info.Y1: y1,
                Info.Y2: y2,
            }

        image_infos = page.get_image_info(xrefs=True)

        # PyMuPDF reports xref 0 for inline / undetectable images. Such an
        # entry cannot be passed to Document.extract_image (it raises
        # "bad xref"), which would abort the extraction of all remaining
        # images, so these entries are skipped instead.
        extractable_infos = [info for info in image_infos if info["xref"]]
        skipped = len(image_infos) - len(extractable_infos)
        if skipped:
            logger.debug(f"Skipping {skipped} image(s) without xref on page {page.number}")

        xrefs = map(itemgetter("xref"), extractable_infos)
        images = map(load_image_from_xref, xrefs)
        metadata = map(format_metadata, extractable_infos)
        return starmap(ImageMetadataPair, zip(images, metadata))

    def extract(self, pdf: bytes):
        """Open ``pdf`` and lazily yield its images with metadata.

        Args:
            pdf: The raw bytes of a PDF file.

        Returns:
            A lazy iterator of ``ImageMetadataPair`` objects covering all
            pages in document order.

        Note:
            Each call replaces ``self.doc``. Because images are loaded from
            ``self.doc`` only on iteration, consume the iterator of one call
            before calling :meth:`extract` again on the same instance.
        """
        logger.debug("Extracting")
        self.doc = fitz.Document(stream=pdf)
        pages = tqdm(self.doc, desc="Extracting", disable=not self.verbose)
        image_metadata_pairs = chain.from_iterable(map(self.__process_images_on_page, pages))
        return image_metadata_pairs