fix: RED-3813: ensure image hashes are always 25 chars long

The hashing algorithm omits leading zero bits, since they carry no
information in the hex representation. Since the resulting shorter hashes
prove problematic for later processing, we restore this information by
zero-padding and ensure the hashes are always 25 characters long.
This commit is contained in:
Julius Unverfehrt 2024-08-22 11:09:18 +02:00
parent 278b42e368
commit 1796c1bcbb
3 changed files with 25 additions and 6 deletions

View File

@ -13,7 +13,7 @@ class HashEncoder(Encoder):
yield from self.encode(images)
def hash_image(image: Image.Image):
def hash_image(image: Image.Image) -> str:
"""See: https://stackoverflow.com/a/49692185/3578468"""
image = image.resize((10, 10), Image.ANTIALIAS)
image = image.convert("L")
@ -21,4 +21,6 @@ def hash_image(image: Image.Image):
avg_pixel = sum(pixel_data) / len(pixel_data)
bits = "".join(["1" if (px >= avg_pixel) else "0" for px in pixel_data])
hex_representation = str(hex(int(bits, 2)))[2:][::-1].upper()
return hex_representation
# Note: For each 4 leading zeros, the hex representation will be shorter by one character.
# To ensure that all hashes have the same length, we pad the hex representation with zeros (also see RED-3813).
return hex_representation.zfill(25)

View File

@ -1,5 +1,5 @@
outs:
- md5: 4b0fec291ce0661b3efbbd8b80f4f514.dir
size: 107332
nfiles: 4
- md5: 11f42db1ef57bcac9547792c5506583c.dir
size: 1118031
nfiles: 5
path: data

View File

@ -1,12 +1,15 @@
import random
from itertools import starmap
from operator import __eq__
from pathlib import Path
import pytest
from PIL.Image import Image
from funcy import compose, first
from image_prediction.encoder.encoders.hash_encoder import HashEncoder, hash_image
from image_prediction.encoder.encoders.hash_encoder import HashEncoder
from image_prediction.encoder.encoders.hash_encoder import hash_image
from image_prediction.image_extractor.extractors.parsable import ParsablePDFImageExtractor
from image_prediction.utils.generic import lift
@ -29,3 +32,17 @@ def test_hash_encoder(images, hashed_images, base_patch_image):
hashed_resized = compose(first, encoder, lift(resize))([base_patch_image])
hashed = hash_image(base_patch_image)
assert close(hashed_resized, hashed)
def test_all_hashes_have_length_of_twentyfive():
    """See RED-3814: all hashes should have 25 characters.

    Extracts the images from the ``similarImages2.pdf`` fixture, encodes them
    with ``HashEncoder`` and checks the length of every resulting hash.
    """
    # NOTE(review): the padding comment in hash_image cites RED-3813 while this
    # docstring cites RED-3814 -- confirm which ticket is the intended one.
    expected_length = 25

    pdf_path = Path(__file__).parents[1] / "data" / "similarImages2.pdf"
    image_metadata_pairs = ParsablePDFImageExtractor().extract(pdf_path.read_bytes())
    images = [image for image, _ in image_metadata_pairs]

    hashes = list(HashEncoder().encode(images))

    # all()/"no offenders" is trivially satisfied for an empty list, so make
    # sure the fixture actually produced hashes before checking their lengths.
    assert hashes, f"no hashes were produced for {pdf_path.name}"

    # Collect the offenders so a failure shows which hashes are malformed.
    wrong_length = [h for h in hashes if len(h) != expected_length]
    assert not wrong_length, f"hashes without {expected_length} characters: {wrong_length}"