Merge in RR/image-prediction from RED-5107-hotfix to release/3.4.1
Squashed commit of the following:
commit b7b99074054e67201537efc2f0a5b96f29bd1684
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Mon Aug 29 12:57:50 2022 +0200
RED-5107: move image normalization for predictor to image extraction step to be able to properly catch exceptions thrown from this step
85 lines
2.8 KiB
Python
85 lines
2.8 KiB
Python
import random
|
|
from operator import itemgetter
|
|
|
|
import fitz
|
|
import fpdf
|
|
import pytest
|
|
from PIL import Image
|
|
from funcy import first, rest, lmap
|
|
|
|
from image_prediction.extraction import extract_images_from_pdf
|
|
from image_prediction.image_extractor.extractor import ImageMetadataPair
|
|
from image_prediction.image_extractor.extractors.parsable import extract_pages, get_image_infos, has_alpha_channel
|
|
from image_prediction.info import Info
|
|
from test.utils.comparison import metadata_equal, image_sets_equal
|
|
from test.utils.generation.pdf import add_image, pdf_stream
|
|
|
|
|
|
@pytest.mark.parametrize("extractor_type", ["mock"])
@pytest.mark.parametrize("batch_size", [1, 2, 16])
def test_image_extractor_mock(image_extractor, images):
    """Check that the extractor yields exactly the images it was given, in order.

    NOTE(review): ``extractor_type`` and ``batch_size`` are not direct arguments
    of this test; presumably they are consumed by the fixtures - confirm.
    """
    # The extractor yields (image, metadata) pairs; transpose them into two lists.
    pairs = image_extractor(images)
    images_extracted, _metadata = (list(column) for column in zip(*pairs))

    assert images_extracted == images
|
|
|
|
|
|
@pytest.mark.parametrize("extractor_type", ["parsable_pdf", "default"])
@pytest.mark.parametrize("input_size", [{"depth": 3, "width": 170, "height": 220}], indirect=["input_size"])
@pytest.mark.parametrize("alpha", [False, True])
def test_parsable_pdf_image_extractor(image_extractor, pdf, images, metadata, input_size, alpha):
    """Extract images from ``pdf`` and compare them and their metadata with the expected fixtures."""
    # extract_images_from_pdf yields two iterables: the images and their metadata.
    images_extracted, metadata_extracted = map(list, extract_images_from_pdf(pdf, image_extractor))

    # TODO: move resize operation to expected images fixture
    def __resize_and_convert(image):
        # Bring an expected image to 224x224 RGB before it is compared.
        return image.resize((224, 224)).convert("RGB")

    images = lmap(__resize_and_convert, images)

    # Image contents are only compared when alpha is False.
    if not alpha:
        assert image_sets_equal(images_extracted, images)
    # NOTE(review): the original indentation was lost; the metadata check is assumed
    # to run for both alpha values (outside the `if`) - confirm against the repository.
    assert metadata_equal(metadata_extracted, metadata)
|
|
|
|
|
|
@pytest.mark.parametrize("batch_size", [1, 2, 16])
def test_extract_pages(pdf):
    """Check that ``extract_pages`` yields one ``fitz.Page`` per index of a random page range.

    A random half-open range ``[i, j)`` within the document is chosen; for a
    single-page document the range is empty and the test only checks that no
    pages are returned.
    """
    doc = fitz.Document(stream=pdf)
    # Close the document even when an assertion fails, so the handle is not leaked.
    try:
        max_index = max(0, doc.page_count - 1)
        i = random.randint(0, max(0, max_index - 1))
        j = random.randint(i + 1, max_index) if max_index > 0 else 0

        page_range = range(i, j)

        pages = list(extract_pages(doc, page_range))
        assert all(isinstance(p, fitz.Page) for p in pages)
        assert len(pages) == len(page_range)
    finally:
        doc.close()
|
|
|
|
|
|
@pytest.mark.parametrize("suffix", ["gif", "png", "jpeg"])
@pytest.mark.parametrize("mode", ["RGB", "RGBA"])
def test_has_alpha_channel(base_patch_metadata, suffix, mode):
    """Check that ``has_alpha_channel`` reports alpha only for an image created in RGBA mode.

    A single image is embedded into a fresh PDF, the PDF is reopened with fitz,
    and the xref of the image on the first page is passed to ``has_alpha_channel``.
    """
    # The jpeg case is always built from an RGB image, regardless of the requested mode.
    mode = "RGB" if suffix == "jpeg" else mode

    pdf = fpdf.FPDF(unit="pt")
    image = Image.new(mode, itemgetter(Info.WIDTH, Info.HEIGHT)(base_patch_metadata), color=(10, 10, 10))
    add_image(pdf, ImageMetadataPair(image, base_patch_metadata), suffix=suffix)

    doc = fitz.Document(stream=pdf_stream(pdf))
    # Close the document even when an assertion fails, so the handle is not leaked.
    try:
        page = first(doc)

        # Materialise the xrefs: with a one-shot iterator, first() would consume one
        # element and rest() would then skip another, so the "exactly one image"
        # check below would silently tolerate a second image.
        xrefs = list(map(itemgetter("xref"), get_image_infos(page)))

        result = has_alpha_channel(doc, first(xrefs))

        if mode == "RGBA":
            assert result
        if mode == "RGB":
            assert not result

        # Only the single image added above may be present on the page.
        assert not list(rest(xrefs))
    finally:
        doc.close()
|