alextp · January 28, 2011 01:07 · alextp · Jan 28, 2011 · ogrisel · Jan 28, 2011
diff --git a/sparse_pca.py b/sparse_pca.py
 import numpy as np, scipy, scipy.sparse, numpy.linalg, scipy.optimize
 from scipy import weave


 def project_l1(lbda, sigma):
    "Project positive vector lbda to have l1 norm sigma"
    ll = -np.sort(-lbda)
    cs = 0.
    theta = 0
    prevtheta = 0
    cs = np.cumsum(ll)
    vtheta = (cs-sigma)/(np.arange(ll.shape[0])+1.)
    i = np.argmin(ll-vtheta >= 0)-1
    prevtheta = vtheta[i]
    a = lbda - prevtheta
    a *= a > 0                
    return a



 def norm(dif):
    "Using np.sum instead of np.dot to never ever have a 1-element array as result"
    return np.sum(dif*dif)

 def getrow(M, i):
    return np.asarray(M.getrow(i).todense()).reshape(-1)

 def setrow(M, i, v):
    J = v.nonzero()[0]
    Nj = len(J)
    weave.inline("""
 int j;
 for (j = 0; j < Nj; ++j) {
    double d = v(J(j));
    int ii = i;
    PyObject_CallMethod(M, "__setitem__", "((ll)d)", ii,j, d);
 }""", ["J", "M", "v", "Nj", "i"],
                 type_converters=weave.converters.blitz)



 def recerror(orig, t1, t2):
    "The error of reconstructing orig as t1*t2"
    err = 0.
    for i in xrange(orig.shape[0]):
        oi = getrow(orig, i)
        t1i = getrow(t1, i)
        err += norm(oi-t2*t1i)
    return err



 class MatModel(object):
    def __init__(self, M, alpha, Ntopics, lrate, nsteps):
        """M is the sparse matrix we want to decompose,
 alpha is the maximum l1 norm for each line of the factorized matrices
 Ntopics is the number of latent components to use
 lrate is the learning rate to use in the gradient descent
 nsteps is how many gradient-and-project steps are used at each iteration"""
        self.na, self.nb = M.shape
        self.Ntopics = Ntopics
        self.alpha  = alpha
        self.lrate = lrate
        self.nsteps = nsteps
        self.M = M
        self.Tl = scipy.sparse.csr_matrix(np.random.random((self.na,self.Ntopics)))
        self.Tr = scipy.sparse.csr_matrix(np.random.random((self.nb,self.Ntopics)))

    def iterate(self):
        print "Maximizing tl"
        self.optimize_l(self.M, self.Tl, self.Tr)
        print "error =", self.error()
        print "Maximizing tr"
        self.optimize_l(self.M.T, self.Tr, self.Tl)
        print "error =", self.error()


    def error(self):
        return recerror(self.M, self.Tl, self.Tr)


    def optimize_l(self, M, l1, l2):
        """Code to optimize either Tl or Tr. 
 The optimization is a gradient-descent step followed by an l1 projection step."""
        nt = np.dot(l2.T,l2).toarray()
        for i in xrange(M.shape[0]):
            ai = getrow(M, i)
            tai = getrow(l1, i)
            for j in xrange(self.nsteps):
                grad = (np.dot(nt,tai)-(l2.T*ai).reshape(-1))
                tai -= self.lrate*grad/np.sqrt(norm(grad))
                tai = project_l1(tai, self.alpha)
            setrow(l1, i, tai)
	import numpy as np, scipy, scipy.sparse, numpy.linalg, scipy.optimize
	from scipy import weave


	def project_l1(lbda, sigma):
	"Project positive vector lbda to have l1 norm sigma"
	ll = -np.sort(-lbda)
	cs = 0.
	theta = 0
	prevtheta = 0
	cs = np.cumsum(ll)
	vtheta = (cs-sigma)/(np.arange(ll.shape[0])+1.)
	i = np.argmin(ll-vtheta >= 0)-1
	prevtheta = vtheta[i]
	a = lbda - prevtheta
	a *= a > 0
	return a



	def norm(dif):
	"Using np.sum instead of np.dot to never ever have a 1-element array as result"
	return np.sum(dif*dif)

	def getrow(M, i):
	return np.asarray(M.getrow(i).todense()).reshape(-1)

	def setrow(M, i, v):
	J = v.nonzero()[0]
	Nj = len(J)
	weave.inline("""
	int j;
	for (j = 0; j < Nj; ++j) {
	double d = v(J(j));
	int ii = i;
	PyObject_CallMethod(M, "__setitem__", "((ll)d)", ii,j, d);
	}""", ["J", "M", "v", "Nj", "i"],
	type_converters=weave.converters.blitz)



	def recerror(orig, t1, t2):
	"The error of reconstructing orig as t1*t2"
	err = 0.
	for i in xrange(orig.shape[0]):
	oi = getrow(orig, i)
	t1i = getrow(t1, i)
	err += norm(oi-t2*t1i)
	return err



	class MatModel(object):
	def __init__(self, M, alpha, Ntopics, lrate, nsteps):
	"""M is the sparse matrix we want to decompose,
	alpha is the maximum l1 norm for each line of the factorized matrices
	Ntopics is the number of latent components to use
	lrate is the learning rate to use in the gradient descent
	nsteps is how many gradient-and-project steps are used at each iteration"""
	self.na, self.nb = M.shape
	self.Ntopics = Ntopics
	self.alpha = alpha
	self.lrate = lrate
	self.nsteps = nsteps
	self.M = M
	self.Tl = scipy.sparse.csr_matrix(np.random.random((self.na,self.Ntopics)))
	self.Tr = scipy.sparse.csr_matrix(np.random.random((self.nb,self.Ntopics)))

	def iterate(self):
	print "Maximizing tl"
	self.optimize_l(self.M, self.Tl, self.Tr)
	print "error =", self.error()
	print "Maximizing tr"
	self.optimize_l(self.M.T, self.Tr, self.Tl)
	print "error =", self.error()


	def error(self):
	return recerror(self.M, self.Tl, self.Tr)


	def optimize_l(self, M, l1, l2):
	"""Code to optimize either Tl or Tr.
	The optimization is a gradient-descent step followed by an l1 projection step."""
	nt = np.dot(l2.T,l2).toarray()
	for i in xrange(M.shape[0]):
	ai = getrow(M, i)
	tai = getrow(l1, i)
	for j in xrange(self.nsteps):
	grad = (np.dot(nt,tai)-(l2.T*ai).reshape(-1))
	tai -= self.lrate*grad/np.sqrt(norm(grad))
	tai = project_l1(tai, self.alpha)
	setrow(l1, i, tai)