Skip to content

Instantly share code, notes, and snippets.

@alextp
Created August 31, 2010 00:09

Revisions

  1. @invalid-email-address Anonymous revised this gist Aug 31, 2010. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion lda.py
    Original file line number Diff line number Diff line change
    @@ -179,7 +179,7 @@ def e(self):
    for i,w in enumerate(self.docs[d]):
    zw = self.ptopics.T[w].copy()
    zw *= self.docprobs[d]
    zp = exp_digamma(zw)/exp_digamma(np.sum(zw))
    zw = exp_digamma(zw)/exp_digamma(np.sum(zw))
    self.ctopics.T[w] += zw
    self.doccounts[d] += zw

  2. @invalid-email-address Anonymous revised this gist Aug 31, 2010. 1 changed file with 7 additions and 1 deletion.
    8 changes: 7 additions & 1 deletion lda.py
    Original file line number Diff line number Diff line change
    @@ -4,10 +4,16 @@ def symdirichlet(alpha, n):
    v = np.zeros(n)+alpha
    return np.random.dirichlet(v)


    def exp_digamma(x):
    if x < 0.1:
    return x/100
    a = x*x
    b = a*x
    return x - 0.5 + 1./(24*x) + 1/(48.*a) + 23/(5760*b)
    c = b*x
    return x - 0.5 + 1./(24*x)

    exp_digamma = np.vectorize(exp_digamma)


    import re
  3. @invalid-email-address Anonymous created this gist Aug 31, 2010.
    204 changes: 204 additions & 0 deletions lda.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,204 @@
    import numpy as np

    def symdirichlet(alpha, n):
    v = np.zeros(n)+alpha
    return np.random.dirichlet(v)

    def exp_digamma(x):
    a = x*x
    b = a*x
    return x - 0.5 + 1./(24*x) + 1/(48.*a) + 23/(5760*b)


    import re
    wre = re.compile(r"(\w)+")
    def get_words(text, stop=True):
    "A simple tokenizer"
    l = 0
    while l < len(text):
    s = wre.search(text,l)
    try:
    w = text[s.start():s.end()].lower()
    if stop:
    yield w
    elif w not in stoplist:
    yield w
    l = s.end()
    except:
    break


    class EmLda(object):
    def __init__(self, docs, nt):
    self.Nt = nt
    self.docs = []
    self.all_words = []
    self.reverse_map = {}
    self.Nd = 0
    for d in docs:
    doc = []
    self.docs.append(doc)
    self.Nd += 1
    for w in get_words(d):
    if len(w) < 5: continue
    if not w in self.reverse_map:
    self.reverse_map[w] = len(self.all_words)
    self.all_words.append(w)
    doc.append(self.reverse_map[w])
    self.V = len(self.all_words)
    self.ctopics = np.zeros((self.Nt, self.V))
    self.doccounts = np.zeros((self.Nd, self.Nt))
    self.ptopics = np.zeros((self.Nt, self.V))
    self.docprobs = np.zeros((self.Nd, self.Nt))
    self.beta = 1.
    self.alpha = 1.
    for d in xrange(self.Nd):
    for i,w in enumerate(self.docs[d]):
    zw = symdirichlet(self.beta, self.Nt)
    self.ctopics.T[w] += zw
    self.doccounts[d] += zw
    self.m()



    def e(self):
    self.ctopics.fill(0)
    self.doccounts.fill(0)
    for d in xrange(self.Nd):
    for i,w in enumerate(self.docs[d]):
    zw = self.ptopics.T[w].copy()
    zw *= self.docprobs[d]
    zw /= np.sum(zw)
    self.ctopics.T[w] += zw
    self.doccounts[d] += zw

    def m(self):
    self.ptopics.fill(0)
    self.docprobs.fill(0)
    self.ptopics += self.ctopics + self.alpha
    self.ptopics /= np.sum(self.ctopics, axis=1).reshape((-1,1))
    self.docprobs += self.doccounts + self.beta
    self.docprobs /= np.sum(self.docprobs, axis=1).reshape((-1,1))

    def iterate(self):
    self.e()
    self.m()


    def run(self, n):
    for i in xrange(n):
    print "iter", i
    import sys
    sys.stdout.flush()
    self.iterate()
    for w in xrange(100):
    print "word", self.all_words[w],
    print "topics", self.ptopics.T[w]/np.sum(self.ptopics.T[w])
    for i in xrange(self.Nt):
    print
    print
    print "Topic", i
    print
    print_topic(self, self.ptopics[i], 40)


    class PrLda(EmLda):

    def __init__(self, *args):
    super(PrLda, self).__init__(*args)
    self.Nw = sum(len(d) for d in self.docs)
    c = 0
    self.sigma = 15.

    def e(self):
    self.do_lambda()
    self.do_z()

    def do_lambda(self):
    self.ptheta = [[] for i in xrange(self.V)]
    self.ctopics.fill(0)
    self.doccounts.fill(0)
    for d in xrange(self.Nd):
    for i,w in enumerate(self.docs[d]):
    zw = self.ptopics.T[w].copy()
    zw *= self.docprobs[d]
    zw /= np.sum(zw)
    self.ptheta[w].append(zw)
    self.lbda = []
    self.ptheta = map(np.array, self.ptheta)
    for w in xrange(self.V):
    self.lbda.append(self.optimize_lambda(self.ptheta[w]))

    def optimize_lambda(self, ptheta, steps=50, lrate=1.):
    lbda = np.ones(ptheta.shape[1])
    lbda /= np.sum(lbda).T
    lbda *= self.sigma
    prevobj = np.inf
    n = 0
    while True:
    obj = np.sum(np.sum(ptheta,axis=0).T*np.exp(-lbda))
    if n > 5 and prevobj - obj < 0.01*obj:
    break
    n += 1
    prevobj = obj
    # do the gradient descent
    lbda += lrate*np.sum(ptheta,axis=0)*np.exp(-lbda)
    # truncate
    lbda *= lbda>0
    # project it into the l1 ball with diameter sigma
    ll = -np.sort(-lbda)
    cs = np.argmin(((ll-self.sigma).cumsum()/(np.arange(len(ll))+1.))) >= 0
    theta = ll[cs-1]
    lbda -= theta
    lbda *= lbda > 0
    return lbda

    def do_z(self):
    indices = [0 for i in xrange(self.V)]
    for d in xrange(self.Nd):
    for i,w in enumerate(self.docs[d]):
    zw = self.ptheta[w][indices[w]]
    zw *= np.exp(-self.lbda[w])
    zw /= np.sum(zw)
    indices[w] += 1
    self.ctopics.T[w] += zw
    self.doccounts[d] += zw


    class VarLda(EmLda):
    def e(self):
    self.ctopics.fill(0)
    self.doccounts.fill(0)
    for d in xrange(self.Nd):
    for i,w in enumerate(self.docs[d]):
    zw = self.ptopics.T[w].copy()
    zw *= self.docprobs[d]
    zp = exp_digamma(zw)/exp_digamma(np.sum(zw))
    self.ctopics.T[w] += zw
    self.doccounts[d] += zw



    def print_topic(model, t, n):
    s = np.argsort(-t)
    for w in s[:n]:
    print " ",model.all_words[w]


    if __name__ == '__main__':
    import sys
    docs = []
    nt = int(sys.argv[1])
    import os
    for fname in os.listdir(sys.argv[2]):
    if not fname.startswith("."):
    docs.append(file(os.path.join(sys.argv[2],fname)).read())
    el = VarLda(docs, nt)
    el.run(10)
    el = EmLda(docs, nt)
    el.run(10)
    el = PrLda(docs, nt)
    el.run(10)
    el = PrLda(docs, nt)
    el.sigma = 150
    el.run(10)