Created
May 1, 2016 11:45
-
-
Save hotzenklotz/8ab241bc9c9e8c62aad860fba20f2604 to your computer and use it in GitHub Desktop.
Find and show all duplicate images from a directory. Duplicate detection works by measuring the Chi Squared Distance of two image histograms.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import os | |
import argparse | |
from multiprocessing import Pool | |
# install through pip | |
import numpy as np | |
import matplotlib.image as mpimg | |
import matplotlib.mlab as mlab | |
import matplotlib.pyplot as plt | |
import matplotlib.gridspec as gridspec | |
from scipy.misc import imresize | |
from orderedset import OrderedSet | |
def calc_distance(hist1, hist2):
    """Return the Chi-squared distance between two histograms.

    Computes ``0.5 * sum((hist2 - hist1)**2 / (hist2 + hist1))``.
    Bins where ``hist1 + hist2`` is zero are skipped: both histograms are
    empty there, so the bin contributes no distance. Without this, the
    0/0 division yields NaN and poisons the whole sum.

    For other distance measures see:
    http://www.ariel.ac.il/sites/ofirpele/publications/ECCV2010.pdf

    Args:
        hist1: Array-like histogram values.
        hist2: Array-like histogram values, broadcastable against ``hist1``.

    Returns:
        The Chi-squared distance as a float (0.0 for identical histograms).
    """
    hist1 = np.asarray(hist1, dtype=float)
    hist2 = np.asarray(hist2, dtype=float)

    bin_totals = hist1 + hist2
    squared_diffs = np.square(hist2 - hist1)

    # Only divide where the denominator is non-zero to avoid 0/0 -> NaN.
    populated = bin_totals != 0
    return 0.5 * np.sum(squared_diffs[populated] / bin_totals[populated])
def plot_histogram(hist):
    """Draw a histogram as a bar chart on the current matplotlib axes.

    The x positions are derived from the histogram's own length, so the
    function works for any number of bins (not only 255).

    Args:
        hist: Sequence of bar heights, one per bin.

    Returns:
        The ``hist`` argument, unchanged.
    """
    plt.bar(np.arange(len(hist)), hist)
    plt.xlabel('Color')
    plt.ylabel('Frequency')
    return hist
def show_image_grid(images):
    """Plot a collection of image files as a square grid.

    Each image is loaded (down-scaled) through ``read_image`` and drawn in
    its own tick-less subplot. This will take a while for many images.

    Args:
        images: Sized iterable of image file paths.

    Returns:
        None. Nothing is drawn when ``images`` is empty.
    """
    num_images = len(images)
    if num_images == 0:
        # A 0x0 GridSpec is invalid; there is nothing to draw anyway.
        return

    # Smallest square grid that can hold every image.
    num_cols = int(np.ceil(np.sqrt(num_images)))
    gs = gridspec.GridSpec(
        num_cols, num_cols,
        top=1., bottom=0., right=1., left=0., hspace=0., wspace=0.
    )

    for i, img in enumerate(images):
        resized_image = read_image(img)
        ax = plt.subplot(gs[i])
        ax.imshow(resized_image)
        ax.set_xticks([])
        ax.set_yticks([])
def read_image(path, scale_Factor=0.1):
    """Load the image at ``path`` and return a down-scaled copy.

    Down-scaling keeps the later histogram computation cheap.

    Args:
        path: Path of the image file to read.
        scale_Factor: Size argument forwarded to ``imresize``.

    Returns:
        The resized image as returned by ``imresize``.

    NOTE(review): ``scipy.misc.imresize`` is no longer shipped by recent
    SciPy releases -- confirm the pinned SciPy version still provides it.
    """
    full_size = mpimg.imread(path)
    return imresize(full_size, scale_Factor)
def is_duplicate(img1, img2, threshold=0.03):
    """Decide whether two image files are duplicates of each other.

    Both images are read down-scaled (which speeds up the histogram
    computation), turned into intensity histograms, and compared using
    ``calc_distance``. The computed distance is printed as a side effect.

    Args:
        img1: Path of the first image.
        img2: Path of the second image.
        threshold: Maximum distance at which the images still count as
            duplicates. Fine-tune it for your image collection.

    Returns:
        True if the histogram distance is below ``threshold``.
    """
    image1 = read_image(img1)
    image2 = read_image(img2)

    # Use one fixed value range for both histograms so that bin i covers the
    # same intensities in each image; with the default (per-image min/max)
    # range the bins of two images are not comparable. With 255 bins over
    # (0, 255) the bin width is 1, so density=True yields a probability mass
    # function. Assumes 8-bit pixel data as produced by imresize.
    hist1, _ = np.histogram(image1, bins=255, range=(0, 255), density=True)
    hist2, _ = np.histogram(image2, bins=255, range=(0, 255), density=True)

    dist = calc_distance(hist1, hist2)
    # Single-argument call form prints identically on Python 2 and 3.
    print(dist)
    return dist < threshold
def find_duplicates(img_tuple):
    """Compare one pair of images; helper mapped over the worker pool.

    Args:
        img_tuple: A ``(img1, img2)`` pair of image paths.

    Returns:
        An ``OrderedSet`` holding both paths when ``is_duplicate`` reports
        a match, otherwise an empty ``OrderedSet``.
    """
    first, second = img_tuple
    matches = OrderedSet()
    if not is_duplicate(first, second):
        return matches
    matches.add(first)
    matches.add(second)
    return matches
if __name__ == '__main__':
    # Find and show all duplicate images from a directory.
    # Duplicate detection works by measuring the Chi Squared Distance of
    # two image histograms.
    # Inspiration: http://www.ariel.ac.il/sites/ofirpele/publications/ECCV2010.pdf
    parser = argparse.ArgumentParser(description='Find and show duplicate images.')
    parser.add_argument('--dir', '-d', metavar='image directory', type=str, required=True, help='The root image directory.')
    args = parser.parse_args()

    # Collect all image files from the root directory. The extension check is
    # case-insensitive, but the path itself keeps its original case so the
    # file can still be opened on case-sensitive filesystems.
    image_extensions = ("jpg", "jpeg", "png", "bmp")
    images = [
        os.path.join(args.dir, name)
        for name in sorted(os.listdir(args.dir))
        if name.lower().endswith(image_extensions)
    ]

    # Compare every image with its successor in sorted order, in parallel.
    # Only adjacent files are compared (not all possible pairs); zip covers
    # every adjacent pair including the last one.
    # This may take a while.
    image_pairs = list(zip(images, images[1:]))

    pool = Pool(processes=4)
    try:
        result_set = pool.map(find_duplicates, image_pairs)
    finally:
        # Always release the worker processes, even if a comparison failed.
        pool.close()
        pool.join()

    # Combine the results of all worker processes.
    duplicates = OrderedSet()
    for result in result_set:
        duplicates |= result

    # Finally show all the duplicates on an image grid.
    # This may take a while.
    print("Found %s duplicates:" % len(duplicates))
    print(duplicates)
    if duplicates:
        # Skip plotting entirely when there is nothing to show.
        show_image_grid(duplicates)
        plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment