diff --git a/src/image_prediction/encoder/encoders/hash_encoder.py b/src/image_prediction/encoder/encoders/hash_encoder.py
index c21bacd..0266a84 100644
--- a/src/image_prediction/encoder/encoders/hash_encoder.py
+++ b/src/image_prediction/encoder/encoders/hash_encoder.py
@@ -13,7 +13,7 @@ class HashEncoder(Encoder):
         yield from self.encode(images)
 
 
-def hash_image(image: Image.Image):
+def hash_image(image: Image.Image) -> str:
     """See: https://stackoverflow.com/a/49692185/3578468"""
     image = image.resize((10, 10), Image.ANTIALIAS)
     image = image.convert("L")
@@ -21,4 +21,7 @@ def hash_image(image: Image.Image):
     avg_pixel = sum(pixel_data) / len(pixel_data)
     bits = "".join(["1" if (px >= avg_pixel) else "0" for px in pixel_data])
     hex_representation = str(hex(int(bits, 2)))[2:][::-1].upper()
-    return hex_representation
+    # Note: For each 4 leading zero bits, the hex representation is shorter by one character (also see RED-3813).
+    # The string was reversed above, so the dropped digits belong at its END: pad on the right. Padding on the
+    # left (zfill) would shift every digit and make e.g. 0x00ABC and 0xABC00 produce the same hash.
+    return hex_representation.ljust(25, "0")
diff --git a/test/data.dvc b/test/data.dvc
index c7040fe..b7fe73d 100644
--- a/test/data.dvc
+++ b/test/data.dvc
@@ -1,5 +1,5 @@
 outs:
-- md5: 4b0fec291ce0661b3efbbd8b80f4f514.dir
-  size: 107332
-  nfiles: 4
+- md5: 11f42db1ef57bcac9547792c5506583c.dir
+  size: 1118031
+  nfiles: 5
   path: data
diff --git a/test/unit_tests/encoder_test.py b/test/unit_tests/encoder_test.py
index 5102aca..5524197 100644
--- a/test/unit_tests/encoder_test.py
+++ b/test/unit_tests/encoder_test.py
@@ -1,12 +1,15 @@
 import random
 from itertools import starmap
 from operator import __eq__
+from pathlib import Path
 
 import pytest
 from PIL.Image import Image
 from funcy import compose, first
 
-from image_prediction.encoder.encoders.hash_encoder import HashEncoder, hash_image
+from image_prediction.encoder.encoders.hash_encoder import HashEncoder
+from image_prediction.encoder.encoders.hash_encoder import hash_image
+from image_prediction.image_extractor.extractors.parsable import ParsablePDFImageExtractor
 from image_prediction.utils.generic import lift
 
 
@@ -29,3 +32,17 @@ def test_hash_encoder(images, hashed_images, base_patch_image):
     hashed_resized = compose(first, encoder, lift(resize))([base_patch_image])
     hashed = hash_image(base_patch_image)
     assert close(hashed_resized, hashed)
+
+
+def test_all_hashes_have_length_of_twentyfive():
+    """See RED-3814: all hashes should have 25 characters."""
+    pdf_path = Path(__file__).parents[1] / "data" / "similarImages2.pdf"
+    pdf_bytes = pdf_path.read_bytes()
+    image_extractor = ParsablePDFImageExtractor()
+    image_metadata_pairs = list(image_extractor.extract(pdf_bytes))
+    images = [image for image, _ in image_metadata_pairs]
+
+    hash_encoder = HashEncoder()
+    hashes = list(hash_encoder.encode(images))
+
+    assert all(len(h) == 25 for h in hashes)