The hashing algorithm omits leading bits that carry no information. Since this proves problematic for later processing, we restore these bits and ensure the hashes are always 25 characters long.
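For illustration, the fix amounts to left-padding each hash string back to its full width. The sketch below is a minimal example of that idea, assuming the hash is serialised as a string whose leading zero characters get dropped; pad_hash and HASH_LENGTH are hypothetical names used only for this sketch, not the module's actual API. The test module itself follows.

# Illustrative sketch only -- not the project's hash_image implementation.
# Assumption: the raw hash is a string whose leading "0" characters
# (bits without information) were stripped, so its length varies.
HASH_LENGTH = 25  # the fixed length asserted by the tests below


def pad_hash(raw_hash: str, length: int = HASH_LENGTH) -> str:
    """Left-pad the hash with zeros so every hash has the same length."""
    return raw_hash.rjust(length, "0")


assert pad_hash("abc123") == "0" * 19 + "abc123"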

import random
from itertools import starmap
from operator import __eq__
from pathlib import Path

import pytest
from PIL.Image import Image
from funcy import compose, first

from image_prediction.encoder.encoders.hash_encoder import HashEncoder, hash_image
from image_prediction.image_extractor.extractors.parsable import ParsablePDFImageExtractor
from image_prediction.utils.generic import lift


def resize(image: Image):
    """Resize the image by a random factor to mimic rescaled inputs."""
    factor = random.uniform(0.3, 2)
    # image.resize expects a concrete 2-tuple, so materialise the scaled sizes.
    new_size = tuple(int(x * factor) for x in image.size)
    return image.resize(new_size)


def close(a: str, b: str):
    """Return True if at least 75% of the characters at equal positions match."""
    assert len(a) == len(b)
    return sum(starmap(__eq__, zip(a, b))) / len(a) >= 0.75


@pytest.mark.xfail(reason="Stochastic test, may fail some amount of the time.")
def test_hash_encoder(images, hashed_images, base_patch_image):
    encoder = HashEncoder()
    assert list(encoder(images)) == hashed_images

    # Hashing should be robust to rescaling: the hash of a randomly resized
    # image must stay close to the hash of the original.
    hashed_resized = compose(first, encoder, lift(resize))([base_patch_image])
    hashed = hash_image(base_patch_image)
    assert close(hashed_resized, hashed)


def test_all_hashes_have_length_of_twentyfive():
    """See RED-3814: all hashes should have 25 characters."""
    pdf_path = Path(__file__).parents[1] / "data" / "similarImages2.pdf"
    pdf_bytes = pdf_path.read_bytes()
    image_extractor = ParsablePDFImageExtractor()
    image_metadata_pairs = list(image_extractor.extract(pdf_bytes))
    images = [image for image, _ in image_metadata_pairs]

    hash_encoder = HashEncoder()
    hashes = list(hash_encoder.encode(images))

    assert all(len(h) == 25 for h in hashes)