fix: RED-3813: ensure image hashes are always 25 chars long

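The hashing algorithm drops leading zero bits when the bit string is converted to its hex representation, so some hashes come out shorter than 25 characters. Since this proves problematic for later processing, we pad the hex representation with zeros so that every hash is always 25 characters long.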
2024-08-22 11:09:18 +02:00 · 2024-08-22 11:09:18 +02:00 · 1796c1bcbb
commit 1796c1bcbb
parent 278b42e368
3 changed files with 25 additions and 6 deletions
--- a/src/image_prediction/encoder/encoders/hash_encoder.py
+++ b/src/image_prediction/encoder/encoders/hash_encoder.py
@ -13,7 +13,7 @@ class HashEncoder(Encoder):
        yield from self.encode(images)


-def hash_image(image: Image.Image):
+def hash_image(image: Image.Image) -> str:
    """See: https://stackoverflow.com/a/49692185/3578468"""
    image = image.resize((10, 10), Image.ANTIALIAS)
    image = image.convert("L")
@ -21,4 +21,6 @@ def hash_image(image: Image.Image):
    avg_pixel = sum(pixel_data) / len(pixel_data)
    bits = "".join(["1" if (px >= avg_pixel) else "0" for px in pixel_data])
    hex_representation = str(hex(int(bits, 2)))[2:][::-1].upper()
-    return hex_representation
+    # Note: Every 4 leading zero bits make the hex representation one character shorter.
+    # To give all hashes the same length, we pad with zeros (see RED-3813). NOTE(review): the string is already reversed here, so zfill pads the low-order end - confirm this is intended.
+    return hex_representation.zfill(25)
--- a/test/data.dvc
+++ b/test/data.dvc
@ -1,5 +1,5 @@
 outs:
- md5: 4b0fec291ce0661b3efbbd8b80f4f514.dir
-  size: 107332
-  nfiles: 4
+- md5: 11f42db1ef57bcac9547792c5506583c.dir
+  size: 1118031
+  nfiles: 5
  path: data
--- a/test/unit_tests/encoder_test.py
+++ b/test/unit_tests/encoder_test.py
@ -1,12 +1,15 @@
 import random
 from itertools import starmap
 from operator import __eq__
+from pathlib import Path

 import pytest
 from PIL.Image import Image
 from funcy import compose, first

-from image_prediction.encoder.encoders.hash_encoder import HashEncoder, hash_image
+from image_prediction.encoder.encoders.hash_encoder import HashEncoder
+from image_prediction.encoder.encoders.hash_encoder import hash_image
+from image_prediction.image_extractor.extractors.parsable import ParsablePDFImageExtractor
 from image_prediction.utils.generic import lift


@ -29,3 +32,17 @@ def test_hash_encoder(images, hashed_images, base_patch_image):
    hashed_resized = compose(first, encoder, lift(resize))([base_patch_image])
    hashed = hash_image(base_patch_image)
    assert close(hashed_resized, hashed)
+
+
+def test_all_hashes_have_length_of_twentyfive():
+    """Check that every hash of the images extracted from similarImages2.pdf has 25 characters (RED-3813; originally cited RED-3814 - TODO confirm)."""
+    pdf_path = Path(__file__).parents[1] / "data" / "similarImages2.pdf"
+    pdf_bytes = pdf_path.read_bytes()
+    image_extractor = ParsablePDFImageExtractor()
+    image_metadata_pairs = list(image_extractor.extract(pdf_bytes))
+    images = [image for image, _ in image_metadata_pairs]
+
+    hash_encoder = HashEncoder()
+    hashes = list(hash_encoder.encode(images))
+
+    assert all(len(h) == 25 for h in hashes)