Skip to content

Instantly share code, notes, and snippets.

@hotzenklotz
Created May 1, 2016 11:45
Show Gist options
  • Save hotzenklotz/8ab241bc9c9e8c62aad860fba20f2604 to your computer and use it in GitHub Desktop.
Find and show all duplicate images from a directory. Duplicate detection works by measuring the Chi Squared Distance of two image histograms.
#!/usr/bin/env python
import os
import argparse
from multiprocessing import Pool
# install through pip
import numpy as np
import matplotlib.image as mpimg
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from scipy.misc import imresize
from orderedset import OrderedSet
def calc_distance(hist1, hist2):
    """Return the Chi-Squared distance between two histograms.

    Bins that are empty in BOTH histograms would produce 0/0 (NaN) and
    poison the whole sum; those bins are treated as contributing zero
    distance instead.
    For different distance measures read:
    http://www.ariel.ac.il/sites/ofirpele/publications/ECCV2010.pdf
    """
    hist1 = np.asarray(hist1, dtype=float)
    hist2 = np.asarray(hist2, dtype=float)
    numerator = np.square(hist2 - hist1)
    denominator = hist2 + hist1
    # Masked division: skip bins where the denominator is zero so no
    # RuntimeWarning / NaN is produced for empty bins.
    terms = np.divide(numerator, denominator,
                      out=np.zeros_like(numerator),
                      where=denominator != 0)
    return 0.5 * np.sum(terms)
def plot_histogram(hist):
    """Draw *hist* as a bar chart of color frequency and return it unchanged."""
    bin_positions = np.arange(255)
    plt.bar(bin_positions, hist)
    plt.xlabel('Color')
    plt.ylabel('Frequency')
    return hist
def show_image_grid(images):
    """Render every image in *images* on a roughly square grid of subplots.

    Loading and drawing can take a while for many images.
    """
    side = int(np.ceil(np.sqrt(len(images))))
    grid = gridspec.GridSpec(side, side,
                             top=1., bottom=0., right=1., left=0.,
                             hspace=0., wspace=0.)
    for index, path in enumerate(images):
        axis = plt.subplot(grid[index])
        axis.imshow(read_image(path))
        axis.set_xticks([])
        axis.set_yticks([])
def read_image(path, scale_Factor=0.1):
    """Load the image at *path* and down-scale it by *scale_Factor* for speed."""
    full_size_image = mpimg.imread(path)
    return imresize(full_size_image, scale_Factor)
def is_duplicate(img1, img2, threshold=0.03):
    """Decide whether two image files look like duplicates.

    Both images are down-scaled first (see read_image) so the histogram
    computation stays fast even for large files. The histograms are
    density-normalized, i.e. they behave like probability mass functions,
    which makes images of different sizes comparable.

    img1, img2 -- file paths of the two images to compare
    threshold  -- maximum Chi-Squared distance still counted as a
                  duplicate; fine-tune this for your image set
    Returns True when the histogram distance is below *threshold*.
    """
    image1 = read_image(img1)
    image2 = read_image(img2)
    hist1, _ = np.histogram(image1, bins=255, density=True)
    hist2, _ = np.histogram(image2, bins=255, density=True)
    dist = calc_distance(hist1, hist2)
    # Parenthesized print works under both Python 2 and Python 3
    # (the original `print dist` statement is Python 2 only).
    print(dist)
    return dist < threshold
def find_duplicates(img_tuple):
    """Worker function for the multiprocessing pool.

    Takes one (path, path) pair and returns an OrderedSet containing both
    paths when they are duplicates of each other, otherwise an empty set.
    """
    first_path, second_path = img_tuple
    matches = OrderedSet()
    if is_duplicate(first_path, second_path):
        matches.add(first_path)
        matches.add(second_path)
    return matches
if __name__ == '__main__':
    # Find and show all duplicate images from a directory.
    # Duplicate detection works by measuring the Chi-Squared distance of
    # two image histograms.
    # Inspiration: http://www.ariel.ac.il/sites/ofirpele/publications/ECCV2010.pdf
    parser = argparse.ArgumentParser(description='Find and show duplicate images.')
    parser.add_argument('--dir', '-d', metavar='image directory', type=str,
                        required=True, help='The root image directory.')
    args = parser.parse_args()

    # Read all image file names from the root directory. Sorting puts
    # near-identical shots taken in sequence next to each other, which is
    # what the neighbour-pair comparison below relies on.
    # List comprehensions (instead of map/filter) keep `images` indexable
    # on Python 3 as well.
    file_names = sorted(os.listdir(args.dir))
    images = [os.path.join(args.dir, name).lower() for name in file_names]
    images = [path for path in images
              if path.endswith(("jpg", "jpeg", "png", "bmp"))]

    # Compare each image with its sorted neighbour, in parallel.
    # The range runs up to len(images) so the final adjacent pair is
    # included (the original range(1, len(images) - 1) skipped it).
    image_pairs = [(images[i - 1], images[i]) for i in range(1, len(images))]
    pool = Pool(processes=4)
    result_set = pool.map(find_duplicates, image_pairs)

    # Merge the per-pair results from all worker processes.
    duplicates = OrderedSet()
    for result in result_set:
        duplicates |= result

    # Finally show all the duplicates on an image grid.
    # This may take a while.
    print("Found %s duplicates:" % len(duplicates))
    print(duplicates)
    show_image_grid(duplicates)
    plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment