# Iconfinder 如何杜绝盗版，哈希算法检测图像重复

【伯乐在线导读】：Iconfinder 是一个图标搜索引擎，为设计师、开发者和其他创意工作者提供精美图标，目前托管超过 34 万枚图标，是全球最大的付费图标库。用户也可以在 Iconfinder 的交易板块上传出售原创作品。每个月都有成千上万的图标上传到Iconfinder，同时也伴随而来大量的盗版图。Iconfinder 工程师 Silviu Tantos 在本文中提出一个新颖巧妙的图像查重技术，以杜绝盗版。

# 选择一个哈希算法

Python

```python
>>> import hashlib

# Calculating the hash value of a string.
>>> hashlib.md5('The quick brown fox jumps over the lazy dog').hexdigest()
'9e107d9d372bb6826bd81d3542a419d6'

# Loading an image file into memory and calculating its hash value.
>>> image_file = open('data/cat_grumpy_orig.png').read()
>>> hashlib.md5(image_file).hexdigest()
'3e1f6e9f2689d59b9ed28bcdab73455f'
```

Python

```python
# Original text.
>>> hashlib.md5('The quick brown fox jumps over the lazy dog').hexdigest()
'9e107d9d372bb6826bd81d3542a419d6'

# Slight modification of the text.
>>> hashlib.md5('The quick brown fox jumps over the lazy dog.').hexdigest()
'e4d909c290d0fb1ca068ffaddf22cbd0'
```

Original image                                                                             Modified image

Python

```python
# Load the original image into memory and calculate its hash value.
>>> image_file = open('data/cat_grumpy_orig.png').read()
>>> hashlib.md5(image_file).hexdigest()
'3e1f6e9f2689d59b9ed28bcdab73455f'

# Load the modified image into memory and calculate its hash value.
>>> image_file_modified = open('data/cat_grumpy_modif.png').read()
>>> hashlib.md5(image_file_modified).hexdigest()
'12d1b9409c3e8e0361c24beaee9c0ab1'
```

# dHash

Test image

Python

```python
>>> from PIL import Image
>>> test_image = Image.open('data/test_image.jpg')

# The image is an RGB image with a size of 4×4 pixels.
>>> print 'Image Mode: %s' % test_image.mode
Image Mode: RGB
>>> print 'Width: %s px, Height: %s px' % (test_image.size[0], test_image.size[1])
Width: 4 px, Height: 4 px

# Get the pixel values from the image and print them into rows based on
# the image's width.
>>> width, height = test_image.size
>>> pixels = list(test_image.getdata())
>>> for row in xrange(height):
...   print pixels[row * width:(row + 1) * width]
...
[(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 255)]
[(0, 0, 0), (212, 45, 45), (51, 92, 154), (130, 183, 47)]
[(206, 210, 198), (131, 78, 8), (131, 156, 180), (117, 155, 201)]
[(104, 133, 170), (215, 130, 20), (153, 155, 155), (104, 142, 191)]
```

## 1.图像灰度化

Original image (after step 1)                                     Modified image (after step 1)

## 2.将图像缩小到一个常见大小

Original image (after step 2)                     Modified image (after step 2)

## 3.比较邻域像素

Python

```python
>>> from PIL import Image
>>> img = Image.open('data/cat_grumpy_orig_after_step_2.png')
>>> width, height = img.size
>>> pixels = list(img.getdata())
>>> for col in xrange(width):
...   print pixels[col:col+width]
...
[254, 254, 255, 253, 248, 254, 255, 254, 255]
[254, 255, 253, 248, 254, 255, 254, 255, 255]
[253, 248, 254, 255, 254, 255, 255, 255, 222]
[248, 254, 255, 254, 255, 255, 255, 222, 184]
[254, 255, 254, 255, 255, 255, 222, 184, 177]
[255, 254, 255, 255, 255, 222, 184, 177, 184]
[254, 255, 255, 255, 222, 184, 177, 184, 225]
[255, 255, 255, 222, 184, 177, 184, 225, 255]
```

Python

```python
>>> difference = []
>>> for row in xrange(height):
...   for col in xrange(width):
...     if col != width:
...       difference.append(pixels[col+row] > pixels[(col+row)+1])
...
>>> for col in xrange(width-1):
...   print difference[col:col+(width-1)]
...
[False, False, True, True, False, False, True, False]
[False, True, True, False, False, True, False, False]
[True, True, False, False, True, False, False, False]
[True, False, False, True, False, False, False, True]
[False, False, True, False, False, False, True, True]
[False, True, False, False, False, True, True, False]
[True, False, False, False, True, True, False, False]
[False, False, False, True, True, False, False, True]
```

# Python实现

Python

 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 def dhash(image, hash_size = 8):         # Grayscale and shrink the image in one step.         image = image.convert(‘L’).resize(             (hash_size + 1, hash_size),             Image.ANTIALIAS,         )           pixels = list(image.getdata())           # Compare adjacent pixels.         difference = []         for row in xrange(hash_size):             for col in xrange(hash_size):                 pixel_left = image.getpixel((col, row))                 pixel_right = image.getpixel((col + 1, row))                 difference.append(pixel_left > pixel_right)           # Convert the binary array to a hexadecimal string.         decimal_value = 0         hex_string = []         for index, value in enumerate(difference):             if value:                 decimal_value += 2**(index % 8)             if (index % 8) == 7:                 hex_string.append(hex(decimal_value)[2:].rjust(2, ‘0’))                 decimal_value = 0           return ”.join(hex_string)

Python

```python
>>> from PIL import Image
>>> from utility import dhash, hamming_distance
>>> orig = Image.open('data/cat_grumpy_orig.png')
>>> modif = Image.open('data/cat_grumpy_modif.png')
>>> dhash(orig)
'4c8e3366c275650f'
>>> dhash(modif)
'4c8e3366c275650f'
>>> dhash(orig) == dhash(modif)
True
```

SQL

```sql
SELECT pk, hash, file_path FROM image_hashes
    WHERE hash = '4c8e3366c275650f';
```

SQL

```sql
SELECT pk, hash, BIT_COUNT(
    CONV(hash, 16, 10) ^ CONV('4c8e3366c275650f', 16, 10)
) as hamming_distance
    FROM image_hashes
    HAVING hamming_distance < 4
    ORDER BY hamming_distance ASC;
```

1

（新浪微博：@Judyfish）