Skip to content

Instantly share code, notes, and snippets.

@hotzenklotz
Created May 1, 2016 11:45
Show Gist options
  • Save hotzenklotz/8ab241bc9c9e8c62aad860fba20f2604 to your computer and use it in GitHub Desktop.
Find and show all duplicate images from a directory. Duplicate detection works by measuring the Chi Squared Distance of two image histograms.
#!/usr/bin/env python
import os
import argparse
from multiprocessing import Pool
# install through pip
import numpy as np
import matplotlib.image as mpimg
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from scipy.misc import imresize
from orderedset import OrderedSet
def calc_distance(hist1, hist2):
    """Return the Chi-Squared distance between two histograms.

    Bins that are empty in BOTH histograms would produce 0/0 (NaN) and
    poison the whole sum; those bins are treated as contributing zero
    distance instead.
    For different distance measures read:
    http://www.ariel.ac.il/sites/ofirpele/publications/ECCV2010.pdf
    """
    hist1 = np.asarray(hist1, dtype=float)
    hist2 = np.asarray(hist2, dtype=float)
    numerator = np.square(hist2 - hist1)
    denominator = hist2 + hist1
    # Masked division: skip bins where the denominator is zero so no
    # RuntimeWarning / NaN is produced for empty bins.
    terms = np.divide(numerator, denominator,
                      out=np.zeros_like(numerator),
                      where=denominator != 0)
    return 0.5 * np.sum(terms)
def plot_histogram(hist):
    """Draw *hist* as a bar chart of color frequency and return it unchanged."""
    bin_positions = np.arange(255)
    plt.bar(bin_positions, hist)
    plt.xlabel('Color')
    plt.ylabel('Frequency')
    return hist
def show_image_grid(images):
    """Render every image in *images* on a roughly square grid of subplots.

    Loading and drawing can take a while for many images.
    """
    side = int(np.ceil(np.sqrt(len(images))))
    grid = gridspec.GridSpec(side, side,
                             top=1., bottom=0., right=1., left=0.,
                             hspace=0., wspace=0.)
    for index, path in enumerate(images):
        axis = plt.subplot(grid[index])
        axis.imshow(read_image(path))
        axis.set_xticks([])
        axis.set_yticks([])
def read_image(path, scale_Factor=0.1):
    """Load the image at *path* and down-scale it by *scale_Factor* for speed."""
    full_size_image = mpimg.imread(path)
    return imresize(full_size_image, scale_Factor)
def is_duplicate(img1, img2, threshold=0.03):
    """Decide whether two image files look like duplicates.

    Both images are down-scaled first (see read_image) so the histogram
    computation stays fast even for large files. The histograms are
    density-normalized, i.e. they behave like probability mass functions,
    which makes images of different sizes comparable.

    img1, img2 -- file paths of the two images to compare
    threshold  -- maximum Chi-Squared distance still counted as a
                  duplicate; fine-tune this for your image set
    Returns True when the histogram distance is below *threshold*.
    """
    image1 = read_image(img1)
    image2 = read_image(img2)
    hist1, _ = np.histogram(image1, bins=255, density=True)
    hist2, _ = np.histogram(image2, bins=255, density=True)
    dist = calc_distance(hist1, hist2)
    # Parenthesized print works under both Python 2 and Python 3
    # (the original `print dist` statement is Python 2 only).
    print(dist)
    return dist < threshold
def find_duplicates(img_tuple):
    """Worker function for the multiprocessing pool.

    Takes one (path, path) pair and returns an OrderedSet containing both
    paths when they are duplicates of each other, otherwise an empty set.
    """
    first_path, second_path = img_tuple
    matches = OrderedSet()
    if is_duplicate(first_path, second_path):
        matches.add(first_path)
        matches.add(second_path)
    return matches
if __name__ == '__main__':
    # Find and show all duplicate images from a directory.
    # Duplicate detection works by measuring the Chi-Squared distance of
    # two image histograms.
    # Inspiration: http://www.ariel.ac.il/sites/ofirpele/publications/ECCV2010.pdf
    parser = argparse.ArgumentParser(description='Find and show duplicate images.')
    parser.add_argument('--dir', '-d', metavar='image directory', type=str,
                        required=True, help='The root image directory.')
    args = parser.parse_args()

    # Read all image file names from the root directory. Sorting puts
    # near-identical shots taken in sequence next to each other, which is
    # what the neighbour-pair comparison below relies on.
    # List comprehensions (instead of map/filter) keep `images` indexable
    # on Python 3 as well.
    file_names = sorted(os.listdir(args.dir))
    images = [os.path.join(args.dir, name).lower() for name in file_names]
    images = [path for path in images
              if path.endswith(("jpg", "jpeg", "png", "bmp"))]

    # Compare each image with its sorted neighbour, in parallel.
    # The range runs up to len(images) so the final adjacent pair is
    # included (the original range(1, len(images) - 1) skipped it).
    image_pairs = [(images[i - 1], images[i]) for i in range(1, len(images))]
    pool = Pool(processes=4)
    result_set = pool.map(find_duplicates, image_pairs)

    # Merge the per-pair results from all worker processes.
    duplicates = OrderedSet()
    for result in result_set:
        duplicates |= result

    # Finally show all the duplicates on an image grid.
    # This may take a while.
    print("Found %s duplicates:" % len(duplicates))
    print(duplicates)
    show_image_grid(duplicates)
    plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment