Created August 31, 2010 00:09
Revisions
@@ -179,7 +179,7 @@ def e(self):
         for d in xrange(self.Nd):
             for i,w in enumerate(self.docs[d]):
                 zw = self.ptopics.T[w].copy()
                 zw *= self.docprobs[d]
-                zp = exp_digamma(zw)/exp_digamma(np.sum(zw))
+                zw = exp_digamma(zw)/exp_digamma(np.sum(zw))
                 self.ctopics.T[w] += zw
                 self.doccounts[d] += zw
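This one-line revision is a small fix with real consequences: previously the exp-digamma weight was assigned to a throwaway `zp` (visible in the initial version of the file below), so `VarLda.e` accumulated the same unweighted `zw` as `EmLda.e` and the variational correction was a no-op. The fixed line computes the usual variational responsibility, proportional to exp(ψ(γ_k)) / exp(ψ(Σ_j γ_j)). A toy illustration of that weighting (not part of the gist; this compact `exp_digamma` keeps only the leading terms of the series and folds in the small-argument guard from the next revision via `np.where`):

import numpy as np

def exp_digamma(x):
    # leading terms of the exp(digamma(x)) series, with the small-x
    # guard from the revision below folded in via np.where
    x = np.asarray(x, dtype=float)
    out = x - 0.5 + 1./(24*x)
    return np.where(x < 0.1, x/100, out)

gamma = np.array([0.4, 1.2, 3.1])     # made-up per-token topic pseudo-counts
zw = exp_digamma(gamma) / exp_digamma(np.sum(gamma))
print(zw / np.sum(zw))                # normalized variational responsibilities

Because exp(ψ(x)) ≈ x − 0.5 for moderate x, the weighting acts roughly like a half-count discount per topic, which is what distinguishes this update from the plain EM responsibility.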
@@ -4,10 +4,16 @@
 def symdirichlet(alpha, n):
     v = np.zeros(n)+alpha
     return np.random.dirichlet(v)

 def exp_digamma(x):
+    if x < 0.1:
+        return x/100
     a = x*x
     b = a*x
+    c = b*x
     return x - 0.5 + 1./(24*x) + 1/(48.*a) + 23/(5760*b)
+
+exp_digamma = np.vectorize(exp_digamma)

 import re
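This revision patches `exp_digamma` for the regime where the asymptotic series breaks down: exp(ψ(x)) → 0 as x → 0⁺, but the 1/(24x) and higher terms send the series to +∞, so arguments below 0.1 are short-circuited to a crude small value, x/100. Wrapping the scalar function in `np.vectorize` keeps it usable on the arrays that `VarLda.e` passes in, since the new `if x < 0.1` branch would otherwise raise on array input. A quick check of the divergence (this check assumes SciPy, which the gist itself does not use):

import numpy as np
from scipy.special import psi   # psi(x) is the digamma function

def series(x):
    # the unguarded series from the initial version of the file
    a = x*x
    b = a*x
    return x - 0.5 + 1./(24*x) + 1/(48.*a) + 23/(5760*b)

for x in (0.01, 0.05, 0.1, 0.5, 2.0):
    print("x=%.2f  series=%12.4f  exp(psi(x))=%.6f" % (x, series(x), np.exp(psi(x))))

The agreement is tight by x = 2 but collapses below x ≈ 0.1; x/100 is an arbitrary stand-in there, but any small positive value keeps the weights finite, positive, and near zero, roughly matching the true limit.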
@@ -0,0 +1,204 @@
import numpy as np

def symdirichlet(alpha, n):
    v = np.zeros(n)+alpha
    return np.random.dirichlet(v)

def exp_digamma(x):
    # series approximation of exp(digamma(x))
    a = x*x
    b = a*x
    return x - 0.5 + 1./(24*x) + 1/(48.*a) + 23/(5760*b)

import re
wre = re.compile(r"(\w)+")

def get_words(text, stop=True):
    "A simple tokenizer"
    # note: stoplist is not defined in this file; it is only referenced
    # on the stop=False path, which no caller here uses
    l = 0
    while l < len(text):
        s = wre.search(text, l)
        try:
            w = text[s.start():s.end()].lower()
            if stop:
                yield w
            elif w not in stoplist:
                yield w
            l = s.end()
        except:
            break

class EmLda(object):
    # EM-style LDA: e() computes soft per-token topic assignments,
    # m() re-estimates smoothed topic-word and document-topic distributions.
    def __init__(self, docs, nt):
        self.Nt = nt
        self.docs = []
        self.all_words = []
        self.reverse_map = {}
        self.Nd = 0
        for d in docs:
            doc = []
            self.docs.append(doc)
            self.Nd += 1
            for w in get_words(d):
                if len(w) < 5:
                    continue
                if not w in self.reverse_map:
                    self.reverse_map[w] = len(self.all_words)
                    self.all_words.append(w)
                doc.append(self.reverse_map[w])
        self.V = len(self.all_words)
        self.ctopics = np.zeros((self.Nt, self.V))
        self.doccounts = np.zeros((self.Nd, self.Nt))
        self.ptopics = np.zeros((self.Nt, self.V))
        self.docprobs = np.zeros((self.Nd, self.Nt))
        self.beta = 1.
        self.alpha = 1.
        # random Dirichlet initialization of the sufficient statistics
        for d in xrange(self.Nd):
            for i,w in enumerate(self.docs[d]):
                zw = symdirichlet(self.beta, self.Nt)
                self.ctopics.T[w] += zw
                self.doccounts[d] += zw
        self.m()

    def e(self):
        self.ctopics.fill(0)
        self.doccounts.fill(0)
        for d in xrange(self.Nd):
            for i,w in enumerate(self.docs[d]):
                zw = self.ptopics.T[w].copy()
                zw *= self.docprobs[d]
                zw /= np.sum(zw)
                self.ctopics.T[w] += zw
                self.doccounts[d] += zw

    def m(self):
        self.ptopics.fill(0)
        self.docprobs.fill(0)
        self.ptopics += self.ctopics + self.alpha
        self.ptopics /= np.sum(self.ctopics, axis=1).reshape((-1,1))
        self.docprobs += self.doccounts + self.beta
        self.docprobs /= np.sum(self.docprobs, axis=1).reshape((-1,1))

    def iterate(self):
        self.e()
        self.m()

    def run(self, n):
        for i in xrange(n):
            print "iter", i
            import sys
            sys.stdout.flush()
            self.iterate()
        for w in xrange(100):
            print "word", self.all_words[w],
            print "topics", self.ptopics.T[w]/np.sum(self.ptopics.T[w])
        for i in xrange(self.Nt):
            print
            print
            print "Topic", i
            print
            print_topic(self, self.ptopics[i], 40)

class PrLda(EmLda):
    # Variant with a per-word penalty vector lambda, fit by projected
    # gradient steps onto an l1 ball of diameter sigma (optimize_lambda).
    def __init__(self, *args):
        super(PrLda, self).__init__(*args)
        self.Nw = sum(len(d) for d in self.docs)
        c = 0
        self.sigma = 15.

    def e(self):
        self.do_lambda()
        self.do_z()

    def do_lambda(self):
        self.ptheta = [[] for i in xrange(self.V)]
        self.ctopics.fill(0)
        self.doccounts.fill(0)
        for d in xrange(self.Nd):
            for i,w in enumerate(self.docs[d]):
                zw = self.ptopics.T[w].copy()
                zw *= self.docprobs[d]
                zw /= np.sum(zw)
                self.ptheta[w].append(zw)
        self.lbda = []
        self.ptheta = map(np.array, self.ptheta)
        for w in xrange(self.V):
            self.lbda.append(self.optimize_lambda(self.ptheta[w]))

    def optimize_lambda(self, ptheta, steps=50, lrate=1.):
        lbda = np.ones(ptheta.shape[1])
        lbda /= np.sum(lbda).T
        lbda *= self.sigma
        prevobj = np.inf
        n = 0
        while True:
            obj = np.sum(np.sum(ptheta,axis=0).T*np.exp(-lbda))
            if n > 5 and prevobj - obj < 0.01*obj:
                break
            n += 1
            prevobj = obj
            # do the gradient descent
            lbda += lrate*np.sum(ptheta,axis=0)*np.exp(-lbda)
            # truncate
            lbda *= lbda>0
            # project it into the l1 ball with diameter sigma
            ll = -np.sort(-lbda)
            cs = np.argmin(((ll-self.sigma).cumsum()/(np.arange(len(ll))+1.))) >= 0
            theta = ll[cs-1]
            lbda -= theta
            lbda *= lbda > 0
        return lbda

    def do_z(self):
        indices = [0 for i in xrange(self.V)]
        for d in xrange(self.Nd):
            for i,w in enumerate(self.docs[d]):
                zw = self.ptheta[w][indices[w]]
                zw *= np.exp(-self.lbda[w])
                zw /= np.sum(zw)
                indices[w] += 1
                self.ctopics.T[w] += zw
                self.doccounts[d] += zw

class VarLda(EmLda):
    # Variational-flavored E-step: responsibilities pushed through the
    # exp(digamma) approximation.
    def e(self):
        self.ctopics.fill(0)
        self.doccounts.fill(0)
        for d in xrange(self.Nd):
            for i,w in enumerate(self.docs[d]):
                zw = self.ptopics.T[w].copy()
                zw *= self.docprobs[d]
                # note: zp is computed but never used; the newest
                # revision above changes this line to assign back to zw
                zp = exp_digamma(zw)/exp_digamma(np.sum(zw))
                self.ctopics.T[w] += zw
                self.doccounts[d] += zw

def print_topic(model, t, n):
    s = np.argsort(-t)
    for w in s[:n]:
        print " ", model.all_words[w]

if __name__ == '__main__':
    import sys
    docs = []
    nt = int(sys.argv[1])
    import os
    for fname in os.listdir(sys.argv[2]):
        if not fname.startswith("."):
            docs.append(file(os.path.join(sys.argv[2], fname)).read())
    el = VarLda(docs, nt)
    el.run(10)
    el = EmLda(docs, nt)
    el.run(10)
    el = PrLda(docs, nt)
    el.run(10)
    el = PrLda(docs, nt)
    el.sigma = 150
    el.run(10)
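As created, the script is driven from the command line: sys.argv[1] is the topic count and sys.argv[2] a directory of plain-text files, after which it fits VarLda, EmLda, and PrLda (the last twice, with sigma at 15 and 150) for 10 iterations each, e.g. `python lda.py 20 corpus/` (lda.py and corpus/ are placeholder names; the gist's actual file name is not shown above). For interactive use, a minimal sketch:

# Hypothetical driver for the classes above (Python 2, like the gist);
# the file names are made up, and EM is stepped by hand because run()
# prints the first 100 vocabulary entries and so assumes V >= 100.
docs = [file(fname).read() for fname in ("docs/a.txt", "docs/b.txt")]
model = EmLda(docs, 2)                     # 2 topics
for _ in xrange(5):                        # 5 EM sweeps
    model.iterate()
print_topic(model, model.ptopics[0], 10)   # top 10 words of topic 0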