fix: RED-3813: ensure image hashes are always 25 chars long
The hashing algorithm omits leading zero bits, which carry no information, so a hash can come out shorter than 25 characters. Since this proves problematic for later processing, we restore this information by zero-padding and thereby ensure the hashes are always 25 characters long.
This commit is contained in:
parent
278b42e368
commit
1796c1bcbb
@ -13,7 +13,7 @@ class HashEncoder(Encoder):
|
||||
yield from self.encode(images)
|
||||
|
||||
|
||||
def hash_image(image: Image.Image):
|
||||
def hash_image(image: Image.Image) -> str:
|
||||
"""See: https://stackoverflow.com/a/49692185/3578468"""
|
||||
image = image.resize((10, 10), Image.ANTIALIAS)
|
||||
image = image.convert("L")
|
||||
@ -21,4 +21,6 @@ def hash_image(image: Image.Image):
|
||||
avg_pixel = sum(pixel_data) / len(pixel_data)
|
||||
bits = "".join(["1" if (px >= avg_pixel) else "0" for px in pixel_data])
|
||||
hex_representation = str(hex(int(bits, 2)))[2:][::-1].upper()
|
||||
return hex_representation
|
||||
# Note: For each 4 leading zeros, the hex representation will be shorter by one character.
|
||||
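# To ensure that all hashes have the same length, we left-pad the (already reversed) hex representation with zeros (also see RED-3813).
# NOTE(review): the string is reversed before padding, so the dropped leading zeros would correspond to the right end here — confirm that left-padding is intended.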
|
||||
return hex_representation.zfill(25)
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
outs:
|
||||
- md5: 4b0fec291ce0661b3efbbd8b80f4f514.dir
|
||||
size: 107332
|
||||
nfiles: 4
|
||||
- md5: 11f42db1ef57bcac9547792c5506583c.dir
|
||||
size: 1118031
|
||||
nfiles: 5
|
||||
path: data
|
||||
|
||||
@ -1,12 +1,15 @@
|
||||
import random
|
||||
from itertools import starmap
|
||||
from operator import __eq__
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from PIL.Image import Image
|
||||
from funcy import compose, first
|
||||
|
||||
from image_prediction.encoder.encoders.hash_encoder import HashEncoder, hash_image
|
||||
from image_prediction.encoder.encoders.hash_encoder import HashEncoder
|
||||
from image_prediction.encoder.encoders.hash_encoder import hash_image
|
||||
from image_prediction.image_extractor.extractors.parsable import ParsablePDFImageExtractor
|
||||
from image_prediction.utils.generic import lift
|
||||
|
||||
|
||||
@ -29,3 +32,17 @@ def test_hash_encoder(images, hashed_images, base_patch_image):
|
||||
hashed_resized = compose(first, encoder, lift(resize))([base_patch_image])
|
||||
hashed = hash_image(base_patch_image)
|
||||
assert close(hashed_resized, hashed)
|
||||
|
||||
|
||||
def test_all_hashes_have_length_of_twentyfive():
    """See RED-3813: all hashes should have 25 characters.

    Extracts every image from the ``similarImages2.pdf`` fixture, hashes the
    images with ``HashEncoder`` and checks the length of each resulting hash.
    """
    expected_length = 25

    pdf_path = Path(__file__).parents[1] / "data" / "similarImages2.pdf"
    pdf_bytes = pdf_path.read_bytes()
    image_extractor = ParsablePDFImageExtractor()
    image_metadata_pairs = list(image_extractor.extract(pdf_bytes))
    images = [image for image, _ in image_metadata_pairs]

    hash_encoder = HashEncoder()
    hashes = list(hash_encoder.encode(images))

    # Guard against a vacuous pass: all() over an empty sequence is True, so
    # without this check the test would succeed if no image were extracted.
    assert hashes, "no hashes were produced from the fixture PDF"

    # Collect the offenders so a failure shows which hashes are malformed.
    wrong_length = [h for h in hashes if len(h) != expected_length]
    assert not wrong_length, f"hashes with unexpected length: {wrong_length}"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user