-
-
Save bistaumanga/6023716 to your computer and use it in GitHub Desktop.
| # -*- coding: utf-8 -*- | |
| """ | |
| Created on Sat May 3 10:21:21 2014 | |
| @author: umb | |
| """ | |
| import numpy as np | |
| class GMM: | |
| def __init__(self, k = 3, eps = 0.0001): | |
| self.k = k ## number of clusters | |
| self.eps = eps ## threshold to stop `epsilon` | |
| # All parameters from fitting/learning are kept in a named tuple | |
| from collections import namedtuple | |
| def fit_EM(self, X, max_iters = 1000): | |
| # n = number of data-points, d = dimension of data points | |
| n, d = X.shape | |
| # randomly choose the starting centroids/means | |
| ## as 3 of the points from datasets | |
| mu = X[np.random.choice(n, self.k, False), :] | |
| # initialize the covariance matrices for each gaussians | |
| Sigma= [np.eye(d)] * self.k | |
| # initialize the probabilities/weights for each gaussians | |
| w = [1./self.k] * self.k | |
| # responsibility matrix is initialized to all zeros | |
| # we have responsibility for each of n points for eack of k gaussians | |
| R = np.zeros((n, self.k)) | |
| ### log_likelihoods | |
| log_likelihoods = [] | |
| P = lambda mu, s: np.linalg.det(s) ** -.5 ** (2 * np.pi) ** (-X.shape[1]/2.) \ | |
| * np.exp(-.5 * np.einsum('ij, ij -> i',\ | |
| X - mu, np.dot(np.linalg.inv(s) , (X - mu).T).T ) ) | |
| # Iterate till max_iters iterations | |
| while len(log_likelihoods) < max_iters: | |
| # E - Step | |
| ## Vectorized implementation of e-step equation to calculate the | |
| ## membership for each of k -gaussians | |
| for k in range(self.k): | |
| R[:, k] = w[k] * P(mu[k], Sigma[k]) | |
| ### Likelihood computation | |
| log_likelihood = np.sum(np.log(np.sum(R, axis = 1))) | |
| log_likelihoods.append(log_likelihood) | |
| ## Normalize so that the responsibility matrix is row stochastic | |
| R = (R.T / np.sum(R, axis = 1)).T | |
| ## The number of datapoints belonging to each gaussian | |
| N_ks = np.sum(R, axis = 0) | |
| # M Step | |
| ## calculate the new mean and covariance for each gaussian by | |
| ## utilizing the new responsibilities | |
| for k in range(self.k): | |
| ## means | |
| mu[k] = 1. / N_ks[k] * np.sum(R[:, k] * X.T, axis = 1).T | |
| x_mu = np.matrix(X - mu[k]) | |
| ## covariances | |
| Sigma[k] = np.array(1 / N_ks[k] * np.dot(np.multiply(x_mu.T, R[:, k]), x_mu)) | |
| ## and finally the probabilities | |
| w[k] = 1. / n * N_ks[k] | |
| # check for onvergence | |
| if len(log_likelihoods) < 2 : continue | |
| if np.abs(log_likelihood - log_likelihoods[-2]) < self.eps: break | |
| ## bind all results together | |
| from collections import namedtuple | |
| self.params = namedtuple('params', ['mu', 'Sigma', 'w', 'log_likelihoods', 'num_iters']) | |
| self.params.mu = mu | |
| self.params.Sigma = Sigma | |
| self.params.w = w | |
| self.params.log_likelihoods = log_likelihoods | |
| self.params.num_iters = len(log_likelihoods) | |
| return self.params | |
| def plot_log_likelihood(self): | |
| import pylab as plt | |
| plt.plot(self.params.log_likelihoods) | |
| plt.title('Log Likelihood vs iteration plot') | |
| plt.xlabel('Iterations') | |
| plt.ylabel('log likelihood') | |
| plt.show() | |
| def predict(self, x): | |
| p = lambda mu, s : np.linalg.det(s) ** - 0.5 * (2 * np.pi) **\ | |
| (-len(x)/2) * np.exp( -0.5 * np.dot(x - mu , \ | |
| np.dot(np.linalg.inv(s) , x - mu))) | |
| probs = np.array([w * p(mu, s) for mu, s, w in \ | |
| zip(self.params.mu, self.params.Sigma, self.params.w)]) | |
| return probs/np.sum(probs) | |
| def demo_2d(): | |
| # Load data | |
| #X = np.genfromtxt('data1.csv', delimiter=',') | |
| ### generate the random data | |
| np.random.seed(3) | |
| m1, cov1 = [9, 8], [[.5, 1], [.25, 1]] ## first gaussian | |
| data1 = np.random.multivariate_normal(m1, cov1, 90) | |
| m2, cov2 = [6, 13], [[.5, -.5], [-.5, .1]] ## second gaussian | |
| data2 = np.random.multivariate_normal(m2, cov2, 45) | |
| m3, cov3 = [4, 7], [[0.25, 0.5], [-0.1, 0.5]] ## third gaussian | |
| data3 = np.random.multivariate_normal(m3, cov3, 65) | |
| X = np.vstack((data1,np.vstack((data2,data3)))) | |
| np.random.shuffle(X) | |
| # np.savetxt('sample.csv', X, fmt = "%.4f", delimiter = ",") | |
| #### | |
| gmm = GMM(3, 0.000001) | |
| params = gmm.fit_EM(X, max_iters= 100) | |
| print params.log_likelihoods | |
| import pylab as plt | |
| from matplotlib.patches import Ellipse | |
| def plot_ellipse(pos, cov, nstd=2, ax=None, **kwargs): | |
| def eigsorted(cov): | |
| vals, vecs = np.linalg.eigh(cov) | |
| order = vals.argsort()[::-1] | |
| return vals[order], vecs[:,order] | |
| if ax is None: | |
| ax = plt.gca() | |
| vals, vecs = eigsorted(cov) | |
| theta = np.degrees(np.arctan2(*vecs[:,0][::-1])) | |
| # Width and height are "full" widths, not radius | |
| width, height = 2 * nstd * np.sqrt(abs(vals)) | |
| ellip = Ellipse(xy=pos, width=width, height=height, angle=theta, **kwargs) | |
| ax.add_artist(ellip) | |
| return ellip | |
| def show(X, mu, cov): | |
| plt.cla() | |
| K = len(mu) # number of clusters | |
| colors = ['b', 'k', 'g', 'c', 'm', 'y', 'r'] | |
| plt.plot(X.T[0], X.T[1], 'm*') | |
| for k in range(K): | |
| plot_ellipse(mu[k], cov[k], alpha=0.6, color = colors[k % len(colors)]) | |
| fig = plt.figure(figsize = (13, 6)) | |
| fig.add_subplot(121) | |
| show(X, params.mu, params.Sigma) | |
| fig.add_subplot(122) | |
| plt.plot(np.array(params.log_likelihoods)) | |
| plt.title('Log Likelihood vs iteration plot') | |
| plt.xlabel('Iterations') | |
| plt.ylabel('log likelihood') | |
| plt.show() | |
| print gmm.predict(np.array([1, 2])) | |
| if __name__ == "__main__": | |
| demo_2d() | |
| from optparse import OptionParser | |
| parser = OptionParser() | |
| parser.add_option("-f", "--file", dest="filepath", help="File path for data") | |
| parser.add_option("-k", "--clusters", dest="clusters", help="No. of gaussians") | |
| parser.add_option("-e", "--eps", dest="epsilon", help="Epsilon to stop") | |
| parser.add_option("-m", "--maxiters", dest="max_iters", help="Maximum no. of iteration") | |
| options, args = parser.parse_args() | |
| if not options.filepath : raise('File not provided') | |
| if not options.clusters : | |
| print("Used default number of clusters = 3" ) | |
| k = 3 | |
| else: k = int(options.clusters) | |
| if not options.epsilon : | |
| print("Used default eps = 0.0001" ) | |
| eps = 0.0001 | |
| else: eps = float(options.epsilon) | |
| if not options.max_iters : | |
| print("Used default maxiters = 1000" ) | |
| max_iters = 1000 | |
| else: eps = int(options.maxiters) | |
| X = np.genfromtxt(options.filepath, delimiter=',') | |
| gmm = GMM(k, eps) | |
| params = gmm.fit_EM(X, max_iters) | |
| print params.log_likelihoods | |
| gmm.plot_log_likelihood() | |
| print gmm.predict(np.array([1, 2])) |
Can this be done for images in CIELab spaces?
Good implementation. Thank you.
In you probability calculation on line 41, after the -.5, shouldn't there only be one '*' instead of '**'?
I am getting these warnings:
/home/yash/NCSU/Sem 2/CV/Project 1/gmm.py:48: RuntimeWarning: divide by zero encountered in log
log_likelihood = np.sum(np.log(np.sum(R, axis = 1)))
/home/yash/NCSU/Sem 2/CV/Project 1/gmm.py:53: RuntimeWarning: invalid value encountered in divide
R = (R.T / np.sum(R, axis = 1)).T
/home/yash/anaconda2/lib/python2.7/site-packages/numpy/linalg/linalg.py:1804: RuntimeWarning: invalid value encountered in det
r = _umath_linalg.det(a, signature=signature)
The dataset contains images(RGB) and the shape of train array is (1000, 10880). The original shape is (1000, 60, 60, 3) (1000 60x60 rgb images). I have flattened this array to (1000, 10880).
The log likelihood array that I get after running the program is
[-inf, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]
What's going wrong?
how to import new data set?
any suggestion
Could you please tell me how to use ENSEMBLE Technique on this?
Good stuff. Any license for using the code?
@raghughanapuram For 1D data, just set d=1 in fit_EM().