From 9461be29d572ee4f8dd399831ab9f1a76a9c2ae2 Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Mon, 28 Mar 2022 15:42:54 +0200 Subject: [PATCH] add ParsablePDFImageExtractor test --- .../image_extractor/extractors/parsable.py | 42 ++++++++++++------ requirements.txt | 1 + test/unit_tests/conftest.py | 43 +++++++++++++++---- test/unit_tests/image_extractor_test.py | 10 +++-- 4 files changed, 71 insertions(+), 25 deletions(-) diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index e58a0b9..0853526 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -1,7 +1,8 @@ -from itertools import chain +from itertools import chain, starmap from operator import itemgetter import fitz +from funcy import rcompose from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair @@ -9,22 +10,37 @@ from image_prediction.image_extractor.extractor import ImageExtractor, ImageMeta class ParsablePDFImageExtractor(ImageExtractor): def __init__(self): self.doc: fitz.fitz.Document = None - self.page: fitz.fitz.Page = None - - def __build_metadata(self, xref): - metadata = self.page.get_image_info(xref) - page_width, page_height = self.page.mediabox_size - metadata = {**metadata, "page_width": page_width, "page_height": page_height, "page_idx": self.page.number} - return metadata def __process_images_on_page(self, page: fitz.fitz.Page): - self.page = page + def load_image_from_xref(xref): + return self.doc.extract_image(xref)["image"] + + def format_metadata(image_info): + x1, y1, x2, y2 = map(rounder, image_info["bbox"]) + width, height = itemgetter("width", "height")(image_info) + return { + "page_width": page_width, + "page_height": page_height, + "page_idx": page.number, + "width": width, + "height": height, + "x1": x1, + "x2": x2, + "y1": y1, + "y2": y2 + } + + rounder = rcompose(round, int) + + page_width, page_height = map(rounder, page.mediabox_size) image_handles = page.get_images(full=True) - xrefs = itemgetter(0)(image_handles) - images = map(lambda xref: self.doc.extract_image(xref)["image"], xrefs) - metadata = map(self.__build_metadata, xrefs) - return map(ImageMetadataPair, zip(images, metadata)) + xrefs = map(itemgetter(0), image_handles) + images = map(load_image_from_xref, xrefs) + image_infos = page.get_image_info() + metadata = map(format_metadata, image_infos) + + return starmap(ImageMetadataPair, zip(images, metadata)) def extract(self, pdf: bytes): self.doc = fitz.Document(stream=pdf) diff --git a/requirements.txt b/requirements.txt index 3f49c2d..9cf8815 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,3 +22,4 @@ scikit_learn~=0.24.2 pytest~=7.1.0 funcy==1.17 PyMuPDF==1.19.6 +fpdf==1.7.2 diff --git a/test/unit_tests/conftest.py b/test/unit_tests/conftest.py index e3dca1e..314b186 100644 --- a/test/unit_tests/conftest.py +++ b/test/unit_tests/conftest.py @@ -1,5 +1,8 @@ import random +import tempfile +from operator import itemgetter +import fpdf import numpy as np import pytest from PIL import Image @@ -126,19 +129,25 @@ def map_labels(numeric_labels, classes): @pytest.fixture def metadata(images): + page_idx = 0 + + def current_page_idx(): + nonlocal page_idx + page_idx += random.randint(0, 2) + return min(page_idx, len(images) - 1) + def build_image_metadata(image): width, height = image.size - page_width = 595.32 - page_height = 842.04 - x1 = random.uniform(0, page_width - width) + page_width = 595 + page_height = 842 + x1 = random.randint(0, page_width - width) x2 = x1 + width - y1 = random.uniform(0, page_height) - y2 = y1 - height - + y1 = random.randint(0, page_height - height) + y2 = y1 + height metadata = { "page_width": page_width, "page_height": page_height, - "page_idx": 0, + "page_idx": current_page_idx(), "width": width, "height": height, "x1": x1, @@ -146,5 +155,23 @@ def metadata(images): "y1": y1, "y2": y2 } + return metadata - return map(build_image_metadata, images) + return list(map(build_image_metadata, images)) + + +@pytest.fixture +def pdf(images, metadata): + pdf = fpdf.FPDF(unit="pt") + pdf.add_page() + for image, metadata in zip(images, metadata): + while metadata["page_idx"] > pdf.page - 1: + pdf.add_page() + x, y, w, h = itemgetter("x1", "y1", "width", "height")(metadata) + with tempfile.NamedTemporaryFile(suffix=".png") as temp_image: + image.save(temp_image.name) + pdf.image(temp_image.name, x=x, y=y, w=w, h=h) + with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf: + pdf.output(temp_pdf.name) + with open(temp_pdf.name, "rb") as open_temp_pdf: + yield open_temp_pdf.read() diff --git a/test/unit_tests/image_extractor_test.py b/test/unit_tests/image_extractor_test.py index 0c44ab2..9166da9 100644 --- a/test/unit_tests/image_extractor_test.py +++ b/test/unit_tests/image_extractor_test.py @@ -1,3 +1,5 @@ +import time + import pytest @@ -9,8 +11,8 @@ def test_image_extractor_mock(image_extractor, images): @pytest.mark.parametrize("extractor_type", ["parsable_pdf"]) -@pytest.mark.parametrize("batch_size", [1, 2, 4]) +@pytest.mark.parametrize("batch_size", [10]) def test_parsable_pdf_image_extractor(image_extractor, pdf, images, metadata): - images_extracted, metadata_extracted = map(list, zip(*image_extractor(images))) - assert images_extracted == images - assert metadata_extracted == metadata + images_extracted, metadata_extracted = map(list, zip(*image_extractor(pdf))) + # assert images_extracted == images + assert list(metadata_extracted) == metadata