Julius Unverfehrt 1796c1bcbb fix: RED-3813: ensure image hashes are always 25 chars long
The hashing algorithm omits leading bits without information. Since this
proves problematic for later processing, we restore this
information and ensure the hashes are always 25 characters long.
2024-08-22 11:15:41 +02:00

49 lines
1.6 KiB
Python

import random
from itertools import starmap
from operator import __eq__
from pathlib import Path
import pytest
from PIL.Image import Image
from funcy import compose, first
from image_prediction.encoder.encoders.hash_encoder import HashEncoder
from image_prediction.encoder.encoders.hash_encoder import hash_image
from image_prediction.image_extractor.extractors.parsable import ParsablePDFImageExtractor
from image_prediction.utils.generic import lift
def resize(image: Image):
    """Return ``image`` rescaled by a random factor.

    The factor is drawn uniformly from [0.3, 2] and applied to both entries
    of ``image.size``; each product is truncated to an integer.

    Args:
        image: Object exposing ``size`` (an iterable of dimensions) and a
            ``resize`` method accepting the new size.

    Returns:
        Whatever ``image.resize`` returns for the scaled size.
    """
    factor = random.uniform(0.3, 2)
    # Hand over a real tuple instead of a one-shot ``map`` iterator, and keep
    # every side at least 1 so that very small inputs combined with a factor
    # below 1 never request a zero-sized target.
    new_size = tuple(max(1, int(side * factor)) for side in image.size)
    return image.resize(new_size)
def close(a: str, b: str):
    """Tell whether two equally long strings agree in at least 75% of positions.

    Args:
        a: First string.
        b: Second string; must have the same length as ``a``.

    Returns:
        True when the share of positions holding equal characters is at
        least 0.75, otherwise False.

    Raises:
        AssertionError: If the two strings differ in length.
    """
    length = len(a)
    assert length == len(b)
    # Count the positions at which both strings carry the same character.
    matching = sum(1 for left, right in zip(a, b) if left == right)
    return matching / length >= 0.75
@pytest.mark.xfail(reason="Stochastic test, may fail some amount of the time.")
def test_hash_encoder(images, hashed_images, base_patch_image):
    """Check the encoder against reference hashes and against a resized image.

    First, encoding ``images`` must reproduce ``hashed_images`` exactly.
    Second, the hash of a randomly resized ``base_patch_image`` must be
    ``close`` to the hash of the unresized image. The resize factor is
    random, hence the ``xfail`` marker.
    """
    encoder = HashEncoder()

    # Exact match against the reference hashes.
    encoded = list(encoder(images))
    assert encoded == hashed_images

    # Same pipeline as compose(first, encoder, lift(resize)), spelled out
    # step by step: resize, encode, take the first result.
    resize_each = lift(resize)
    resized_images = resize_each([base_patch_image])
    hashed_resized = first(encoder(resized_images))

    hashed = hash_image(base_patch_image)
    assert close(hashed_resized, hashed)
def test_all_hashes_have_length_of_twentyfive():
    """See RED-3814: all hashes should have 25 characters."""
    # The sample PDF lives in the "data" directory one level above this
    # file's own directory.
    pdf_path = Path(__file__).parents[1] / "data" / "similarImages2.pdf"
    pdf_bytes = pdf_path.read_bytes()

    image_extractor = ParsablePDFImageExtractor()
    # The extractor yields (image, metadata) pairs; only the images are hashed.
    images = [image for image, _ in image_extractor.extract(pdf_bytes)]

    hash_encoder = HashEncoder()
    hashes = list(hash_encoder.encode(images))

    # ``all`` over an empty list is vacuously true, so make sure something
    # was actually hashed before checking lengths.
    assert hashes, "no hashes were produced, so nothing was checked"

    wrong_length = [h for h in hashes if len(h) != 25]
    assert not wrong_length, f"hashes with a length other than 25: {wrong_length}"