fixed misaligned metadata and images

This commit is contained in:
Matthias Bisping 2022-03-28 16:38:46 +02:00
parent 9461be29d5
commit b818ee4724
3 changed files with 25 additions and 9 deletions

View File

@ -0,0 +1,13 @@
from image_prediction.image_extractor.extractors.parsable import ParsablePDFImageExtractor
def extract_images_from_pdf(pdf, extractor=None):
    """Extract images and their metadata from a PDF as two parallel sequences.

    Args:
        pdf: The PDF input; passed unchanged to ``extractor``.
        extractor: Callable that takes ``pdf`` and returns an iterable of
            ``(image, metadata)`` pairs. When ``None``, a new
            ``ParsablePDFImageExtractor`` is created and used.

    Returns:
        ``(images, metadata)``: two equally long tuples when at least one
        pair was extracted, or two empty lists when nothing was extracted
        (or a ``ValueError`` occurred while extracting).
    """
    # Compare against None explicitly: a caller-supplied extractor may be
    # falsy (e.g. it defines __len__ or __bool__) and must still be used.
    if extractor is None:
        extractor = ParsablePDFImageExtractor()

    try:
        # Transpose the stream of pairs into an images sequence and a metadata
        # sequence. With zero pairs, zip(*...) yields nothing and the
        # two-target unpacking raises ValueError.
        images_extracted, metadata_extracted = zip(*extractor(pdf))
    except ValueError:
        # NOTE: the extractor runs inside this try, so a ValueError raised
        # while it produces its pairs is treated as "no images" as well.
        return [], []
    return images_extracted, metadata_extracted

View File

@ -1,7 +1,9 @@
import io
from itertools import chain, starmap from itertools import chain, starmap
from operator import itemgetter from operator import itemgetter
import fitz import fitz
from PIL import Image
from funcy import rcompose from funcy import rcompose
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
@ -13,7 +15,7 @@ class ParsablePDFImageExtractor(ImageExtractor):
def __process_images_on_page(self, page: fitz.fitz.Page): def __process_images_on_page(self, page: fitz.fitz.Page):
def load_image_from_xref(xref): def load_image_from_xref(xref):
return self.doc.extract_image(xref)["image"] return Image.open(io.BytesIO(self.doc.extract_image(xref)["image"]))
def format_metadata(image_info): def format_metadata(image_info):
x1, y1, x2, y2 = map(rounder, image_info["bbox"]) x1, y1, x2, y2 = map(rounder, image_info["bbox"])
@ -34,10 +36,9 @@ class ParsablePDFImageExtractor(ImageExtractor):
page_width, page_height = map(rounder, page.mediabox_size) page_width, page_height = map(rounder, page.mediabox_size)
image_handles = page.get_images(full=True) image_infos = page.get_image_info(xrefs=True)
xrefs = map(itemgetter(0), image_handles) xrefs = map(itemgetter("xref"), image_infos)
images = map(load_image_from_xref, xrefs) images = map(load_image_from_xref, xrefs)
image_infos = page.get_image_info()
metadata = map(format_metadata, image_infos) metadata = map(format_metadata, image_infos)
return starmap(ImageMetadataPair, zip(images, metadata)) return starmap(ImageMetadataPair, zip(images, metadata))

View File

@ -1,7 +1,9 @@
import time import numpy as np
import pytest import pytest
from image_prediction.estimator.preprocessor.utils import images_to_batch_tensor
from image_prediction.extraction import extract_images_from_pdf
@pytest.mark.parametrize("extractor_type", ["mock"]) @pytest.mark.parametrize("extractor_type", ["mock"])
@pytest.mark.parametrize("batch_size", [1, 2, 4]) @pytest.mark.parametrize("batch_size", [1, 2, 4])
@ -11,8 +13,8 @@ def test_image_extractor_mock(image_extractor, images):
@pytest.mark.parametrize("extractor_type", ["parsable_pdf"]) @pytest.mark.parametrize("extractor_type", ["parsable_pdf"])
@pytest.mark.parametrize("batch_size", [10]) @pytest.mark.parametrize("batch_size", [0, 1, 2, 4, 8])
def test_parsable_pdf_image_extractor(image_extractor, pdf, images, metadata): def test_parsable_pdf_image_extractor(image_extractor, pdf, images, metadata):
images_extracted, metadata_extracted = map(list, zip(*image_extractor(pdf))) images_extracted, metadata_extracted = map(list, extract_images_from_pdf(pdf, image_extractor))
# assert images_extracted == images assert np.allclose(images_to_batch_tensor(images_extracted), images_to_batch_tensor(images))
assert list(metadata_extracted) == metadata assert list(metadata_extracted) == metadata