@johnsmith17th
Created May 27, 2013 05:30

base.py
'''
@author: John Smith
'''

import json


class Classifier(object):
    ''' base classifier '''

    def __init__(self, **kwargs):
        pass

    def loadm(self, data):
        ''' load the model from a dictionary object '''
        pass

    def savem(self):
        ''' return the model as a dictionary object '''
        return {}

    def load(self, filename):
        ''' load the model from a JSON file '''
        with open(filename, 'r') as finp:
            data = json.loads(finp.read())
        self.loadm(data)

    def save(self, filename):
        ''' save the model to a JSON file '''
        text = json.dumps(self.savem(), ensure_ascii=False, indent=4)
        with open(filename, 'w') as fout:
            fout.write(text)

    def model(self, X, Y):
        ''' build the classifier model
        X: the feature weight array of the samples
        Y: the label array of the samples
        '''
        pass

    def predict(self, x):
        ''' predict the label of a sample '''
        pass
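A minimal sketch of the intended subclassing and JSON persistence contract; the MajorityClassifier subclass and the file name below are invented for illustration:

from base import Classifier

class MajorityClassifier(Classifier):
    ''' toy subclass: always predicts the most frequent training label '''

    def model(self, X, Y):
        self.label = max(set(Y), key=Y.count)

    def savem(self):
        return {'label': self.label}

    def loadm(self, data):
        self.label = data['label']

    def predict(self, x):
        return self.label

mc = MajorityClassifier()
mc.model([[0], [1], [2]], [+1, +1, -1])
mc.save('mc.json')                # persists {"label": 1} as JSON
mc2 = MajorityClassifier()
mc2.load('mc.json')               # round-trip through loadm()
print(mc2.predict([0]))           # 1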

bayes.py
'''
@author: John Smith
'''

import math

from base import Classifier


class NaiveBayes(Classifier):
    ''' base naive bayes classifier '''

    @staticmethod
    def sampling(terms, docs):
        ''' build (X, Y) samples from documents; category '-' maps to +1 '''
        X = []  # features: term counts per document
        Y = []  # labels
        for d in docs:
            d.prepare()
            X.append([d.seglist.count(t) for t in terms])
            Y.append(1 if d.cat == '-' else -1)
        return (X, Y)

    def savem(self):
        data = {'probc': self.probc,
                'probf': self.probf,
                'cats': self.cats,
                'numx': self.numx,
                'numf': self.numf,
                'numc': self.numc}
        return data

    def loadm(self, data):
        self.probc = data['probc']
        self.probf = data['probf']
        self.cats = data['cats']
        self.numx = data['numx']
        self.numf = data['numf']
        self.numc = data['numc']

    def model(self, X, Y):
        Classifier.model(self, X, Y)
        self.probc = []  # P(c) for each category
        self.probf = []  # P(f|c) for each category
        self.cats = list(set(Y))
        self.numx = len(X)          # number of samples
        self.numf = len(X[0])       # number of features
        self.numc = len(self.cats)  # number of categories


class BernoulliBayes(NaiveBayes):
    ''' bernoulli naive bayes classifier '''

    def model(self, X, Y):
        NaiveBayes.model(self, X, Y)
        sit = range(self.numx)
        cit = range(self.numc)
        fit = range(self.numf)
        sta1 = lambda c: sum(1 for y in Y if y == c)  # N(cat)
        sta2 = lambda f, c: sum(1 for i in sit if Y[i] == c and X[i][f] > 0)  # N(feature, cat)
        for c in cit:
            ci = self.cats[c]
            nc = sta1(ci)  # N(c)
            pc = float(nc) / float(self.numx)  # P(c)
            pf = [0] * self.numf
            for f in fit:
                fn = sta2(f, ci)  # N(f, c)
                pf[f] = float(1 + fn) / float(self.numc + nc)  # P(f|c), add-one smoothed
            self.probc.append(pc)
            self.probf.append(pf)

    def predict(self, x):
        p = [0] * self.numc
        cit = range(self.numc)
        fit = range(self.numf)
        for c in cit:
            pi = 1
            for f in fit:
                # present features contribute P(f|c), absent ones 1 - P(f|c)
                if x[f] > 0:
                    pi *= self.probf[c][f]
                else:
                    pi *= (1.0 - self.probf[c][f])
            p[c] = pi * self.probc[c]
        return self.cats[p.index(max(p))]


class MultinomialBayes(NaiveBayes):
    ''' multinomial naive bayes classifier '''

    def model(self, X, Y):
        NaiveBayes.model(self, X, Y)
        sit = range(self.numx)
        cit = range(self.numc)
        fit = range(self.numf)
        sta1 = lambda c: sum(1 for y in Y if y == c)  # N(cat)
        sta2 = lambda f, c: sum(X[i][f] for i in sit if Y[i] == c)  # F(feature, cat)
        for c in cit:
            ci = self.cats[c]
            nc = sta1(ci)  # N(c)
            pc = float(nc) / float(self.numx)  # P(c)
            pf = [0] * self.numf
            for f in fit:
                pf[f] = sta2(f, ci)
            ss = sum(pf) + self.numx  # smoothing denominator
            for f in fit:
                pf[f] = float(1 + pf[f]) / float(ss)  # P(f|c), add-one smoothed
            self.probc.append(pc)
            self.probf.append(pf)

    def predict(self, x):
        p = [0] * self.numc
        cit = range(self.numc)
        fit = range(self.numf)
        for c in cit:
            pi = 1
            for f in fit:
                nf = x[f]
                # multinomial likelihood term; the factorial is constant
                # across categories, so it does not change the argmax
                pi *= float(self.probf[c][f] ** nf) / math.factorial(nf)
            p[c] = pi * self.probc[c]
        return self.cats[p.index(max(p))]
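A minimal usage sketch for BernoulliBayes; the feature vectors and labels are invented for illustration:

from bayes import BernoulliBayes

# toy samples: three binary features, labels +1 / -1
X = [[1, 0, 1], [1, 1, 0], [0, 0, 1], [0, 1, 0]]
Y = [+1, +1, -1, -1]

nb = BernoulliBayes()
nb.model(X, Y)
print(nb.predict([1, 0, 0]))  # expected: 1
nb.save('nb.json')            # persist the model via the base class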
bnetwork.py
'''
@author: Smith
'''

import random

from bayes import BernoulliBayes
from nnetwork import NeuralNetwork
from utils import udoc  # external helper module, not included in this gist


class BayesNetwork(object):
    ''' two-stage classifier: one naive bayes per feature group,
    with a neural network combining their outputs '''

    def __init__(self, features):
        tar = {+1: [+1], -1: [-1]}
        self.features = features
        self.bayes = {}
        for k in self.features:
            # NOTE: the original called NaiveBayes(len(self.features[k]), [+1, -1]),
            # a constructor signature that does not exist in bayes.py; a concrete
            # subclass such as BernoulliBayes is assumed here so predict() works
            self.bayes[k] = BernoulliBayes()
        self.network = NeuralNetwork(tar, len(self.bayes), len(self.bayes), 1, 0.01)

    def model(self, docs):
        # udoc.sampling is assumed to return an (X, Y) pair,
        # analogous to NaiveBayes.sampling
        sf = {}
        for k in self.features:
            s = udoc.sampling(self.features[k], docs, {'+': +1, '-': -1})
            sf[k] = s
            self.bayes[k].model(*s)
        SX = []  # network inputs: one bayes prediction per feature group
        SY = []  # network targets
        for i in range(len(docs)):
            d = docs[i]
            c = +1 if d.cat == '+' else -1
            SX.append([self.bayes[k].predict(sf[k][0][i]) for k in self.features])
            SY.append(c)
        self.network.model(SX, SY)

    def predict(self, doc):
        # the original also passed the true label doc.cat to the network,
        # which NeuralNetwork.predict neither expects nor should see
        sx = []
        for k in self.features:
            s = udoc.sampling_one(self.features[k], doc, {'+': +1, '-': -1})
            sx.append(self.bayes[k].predict(s))
        return self.network.predict(sx)

    def validate(self, docs, fold=10):
        ''' k-fold cross validation; returns averaged (P, R, A, F) '''
        p, r, a, f = [0] * 4
        s = list(docs)
        random.shuffle(s)
        for k in range(fold):
            trainset = [x for i, x in enumerate(s) if i % fold != k]
            testset = [x for i, x in enumerate(s) if i % fold == k]
            (pi, ri, ai, fi) = self.test(trainset, testset)
            print((pi, ri, ai, fi))
            p += pi; r += ri; a += ai; f += fi
        m = float(fold)
        print((p / m, r / m, a / m, f / m))
        return (p / m, r / m, a / m, f / m)

    def test(self, trainset, testset):
        self.model(trainset)
        pc = [(self.predict(x), +1 if x.cat == '+' else -1) for x in testset]
        mp, np, nr, ma = [0] * 4
        for x in pc:
            if x[0] == -1 and x[1] == -1: mp += 1  # true positives for '-'
            if x[0] == -1: np += 1  # predicted '-'
            if x[1] == -1: nr += 1  # actual '-'
            if x[0] == x[1]: ma += 1  # correct predictions
        p = (float(mp) / float(np)) if np > 0 else 0  # precision
        r = (float(mp) / float(nr)) if nr > 0 else 0  # recall
        a = float(ma) / float(len(pc))  # accuracy
        f = self.f_measure(p, r, 1.0)  # F-measure
        return (p, r, a, f)

    def f_measure(self, p, r, beta=1):
        b = beta ** 2
        try:
            return float((b + 1) * p * r) / float(b * p + r)
        except ZeroDivisionError:
            return 0
boost.py
'''
@author: John Smith
'''

import copy
import math

from base import Classifier


class AdaBoost(Classifier):
    ''' adaboost ensemble over a base classifier with {+1, -1} labels '''

    def __init__(self, classifier, T):
        self.classifier = classifier  # prototype weak classifier
        self.T = T                    # number of boosting rounds

    def model(self, X, Y):
        self.alpha = []
        self.classifiers = []
        N = len(X)
        nit = range(N)
        w = [1.0 / N] * N  # sample weights, kept normalized
        for t in range(self.T):
            cls = copy.deepcopy(self.classifier)
            # NOTE: the Classifier API has no way to pass the sample weights w
            # to the weak learner, so every round retrains on the same
            # unweighted data; a weight-aware (or resampling) weak learner is
            # needed for boosting to take effect
            cls.model(X, Y)
            Yp = [cls.predict(x) for x in X]
            # weighted error: w already sums to 1, so no division by N
            e = sum(w[i] for i in nit if Yp[i] != Y[i])
            # clamp to keep the log finite when a round is perfect or hopeless
            e = min(max(e, 1e-10), 1.0 - 1e-10)
            a = 0.5 * math.log((1 - e) / e)
            for i in nit:
                if Yp[i] == Y[i]:
                    w[i] *= math.exp(-a)
                else:
                    w[i] *= math.exp(a)
            z = sum(w) * 1.0
            for i in nit:
                w[i] /= z
            self.alpha.append(a)
            self.classifiers.append(cls)

    def predict(self, x):
        ''' weighted vote of the T weak classifiers '''
        p = sum(self.alpha[t] * self.classifiers[t].predict(x) for t in range(self.T))
        return 1 if p > 0 else -1
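A minimal usage sketch, assuming BernoulliBayes from bayes.py as the weak learner and toy data invented for illustration; as noted in the comments above, the weak learners cannot see the sample weights, so all rounds train identically here:

from bayes import BernoulliBayes
from boost import AdaBoost

X = [[1, 0], [1, 1], [0, 1], [0, 0]]
Y = [+1, +1, -1, -1]

booster = AdaBoost(BernoulliBayes(), T=5)
booster.model(X, Y)
print(booster.predict([1, 0]))  # expected: 1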
nnetwork.py
'''
@author: John Smith
'''

import math
import random

from base import Classifier


class NeuralNetwork(Classifier):
    ''' neural network classifier (single hidden layer, tanh activation) '''

    @staticmethod
    def sampling(terms, docs):
        ''' build (X, Y) samples from documents; category '-' maps to +1 '''
        X = []  # features
        Y = []  # labels
        for d in docs:
            d.prepare()
            X.append([d.seglist.count(t) for t in terms])
            Y.append(1 if d.cat == '-' else -1)
        return (X, Y)

    def __init__(self, target, i, h, o, r=0.01):
        self.target = target
        # network output vector for each category,
        # e.g. {+1: [+1, +1, +1], -1: [-1, -1, -1], ...}
        self.ni = i    # number of input nodes
        self.nh = h    # number of hidden nodes
        self.no = o    # number of output nodes
        self.rate = r  # learning rate

    def savem(self):
        data = {'target': self.target, 'rate': self.rate,
                'ni': self.ni, 'nh': self.nh, 'no': self.no,
                'vi': self.vi, 'vh': self.vh, 'vo': self.vo,
                'wi': self.wi, 'wo': self.wo}
        return data

    def loadm(self, data):
        self.target = data['target']
        self.rate = data['rate']
        self.ni = data['ni']
        self.nh = data['nh']
        self.no = data['no']
        self.vi = data['vi']
        self.vh = data['vh']
        self.vo = data['vo']
        self.wi = data['wi']
        self.wo = data['wo']

    def model(self, X, Y):
        ri = range(self.ni)
        rh = range(self.nh)
        ro = range(self.no)
        rx = range(len(X))

        self.vi = [1.0] * self.ni  # values of input nodes
        self.vh = [1.0] * self.nh  # values of hidden nodes
        self.vo = [1.0] * self.no  # values of output nodes

        # weight matrix from input nodes to hidden nodes
        self.wi = [[random.uniform(-self.rate, self.rate) for j in ri] for x in rh]
        # weight matrix from hidden nodes to output nodes
        self.wo = [[random.uniform(-self.rate, self.rate) for j in rh] for x in ro]

        # train the network: one forward/backward pass per sample
        for i in rx:
            self.forward(X[i])
            self.backpropagate(self.target[Y[i]])

    def predict(self, x):
        ''' return the category whose target vector is nearest
        (by euclidean distance) to the network output '''
        output = self.forward(x)
        d = {t: self.euclidean(output, self.target[t]) for t in self.target}
        return min(d, key=d.get)

    def forward(self, inputs):
        # activate input nodes
        for i in range(self.ni):
            self.vi[i] = inputs[i]
        # activate hidden nodes
        for i in range(self.nh):
            self.vh[i] = self.sigmoid(self.dotproduct(self.vi, self.wi[i]))
        # activate output nodes
        for i in range(self.no):
            self.vo[i] = self.sigmoid(self.dotproduct(self.vh, self.wo[i]))
        return self.vo[:]

    def backpropagate(self, outputs):
        ri = range(self.ni)
        rh = range(self.nh)
        ro = range(self.no)

        # error terms for the output layer
        o_deltas = [0.0] * self.no
        for i in ro:
            e = outputs[i] - self.vo[i]
            o_deltas[i] = self.dsigmoid(self.vo[i]) * e

        # error terms for the hidden layer
        h_deltas = [0.0] * self.nh
        for i in rh:
            e = 0.0
            for j in ro:
                e += o_deltas[j] * self.wo[j][i]
            h_deltas[i] = self.dsigmoid(self.vh[i]) * e

        # update hidden-to-output weights
        for i in ro:
            for j in rh:
                self.wo[i][j] += self.rate * o_deltas[i] * self.vh[j]

        # update input-to-hidden weights
        for i in rh:
            for j in ri:
                self.wi[i][j] += self.rate * h_deltas[i] * self.vi[j]

        # squared error of this sample
        return sum(0.5 * (outputs[i] - self.vo[i]) ** 2 for i in ro)

    def sigmoid(self, x):
        ''' activation function: tanh, not the logistic sigmoid '''
        return math.tanh(x)

    def dsigmoid(self, x):
        ''' derivative of tanh, expressed in terms of its output '''
        return 1.0 - x ** 2

    def dotproduct(self, x, y):
        ''' dot product of two vectors '''
        return math.fsum(x[i] * y[i] for i in range(len(x)))

    def euclidean(self, x, y):
        ''' euclidean distance between two vectors '''
        return math.sqrt(math.fsum((x[i] - y[i]) ** 2 for i in range(len(x))))
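A minimal training sketch on an invented, linearly separable toy problem (the label is the sign of the first coordinate). The network has no bias nodes, so only through-the-origin boundaries are learnable, and model() makes a single pass over the samples, so they are repeated to simulate epochs; the seed and hyper-parameters are arbitrary:

import random
from nnetwork import NeuralNetwork

random.seed(0)
target = {+1: [+1], -1: [-1]}  # desired output vector per category
net = NeuralNetwork(target, 2, 4, 1, 0.2)

# each block of four samples is one "epoch"
X = [[1.0, 0.5], [0.8, -0.2], [-1.0, 0.3], [-0.6, -0.8]] * 200
Y = [+1, +1, -1, -1] * 200
net.model(X, Y)
print([net.predict(x) for x in X[:4]])  # should print [1, 1, -1, -1]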
validation.py
'''
@author: John Smith
'''

import random


class CrossValidation(object):

    def shuffle(self, samples):
        ''' return a shuffled copy of the (X, Y) sample pair '''
        (X, Y) = samples
        array = list(range(len(X)))
        random.shuffle(array)
        return ([X[i] for i in array], [Y[i] for i in array])

    def validate(self, classifier, samples, cat, fold=10):
        ''' k-fold cross validation; cat is the positive category for
        precision/recall; returns averaged (P, R, A, F) '''
        p, r, a, f = [0] * 4
        X, Y = self.shuffle(samples)
        sit = range(len(X))
        for k in range(fold):
            trainx = [X[i] for i in sit if i % fold != k]
            trainy = [Y[i] for i in sit if i % fold != k]
            testx = [X[i] for i in sit if i % fold == k]
            testy = [Y[i] for i in sit if i % fold == k]
            (pi, ri, ai, fi) = self.test(classifier, trainx, trainy, testx, testy, cat)
            self.printf(pi, ri, ai, fi)
            p += pi; r += ri; a += ai; f += fi
        m = float(fold)
        self.printf(p / m, r / m, a / m, f / m)
        return (p / m, r / m, a / m, f / m)

    def test(self, classifier, trainx, trainy, testx, testy, cat):
        classifier.model(trainx, trainy)
        pc = [(classifier.predict(testx[i]), testy[i]) for i in range(len(testx))]
        mp, np, nr, ma = [0] * 4
        for x in pc:
            if x[0] == cat and x[1] == cat: mp += 1  # true positives
            if x[0] == cat: np += 1  # predicted positives
            if x[1] == cat: nr += 1  # actual positives
            if x[0] == x[1]: ma += 1  # correct predictions
        p = (float(mp) / float(np)) if np > 0 else 0  # precision
        r = (float(mp) / float(nr)) if nr > 0 else 0  # recall
        a = float(ma) / float(len(pc))  # accuracy
        f = self.f_measure(p, r, 1.0)  # F-measure
        return (p, r, a, f)

    def f_measure(self, p, r, beta=1):
        b = beta ** 2
        try:
            return float((b + 1) * p * r) / float(b * p + r)
        except ZeroDivisionError:
            return 0

    def printf(self, p, r, a, f):
        t = (p * 100.0, r * 100.0, a * 100.0, f * 100.0)
        print('P: %0.4f%% R: %0.4f%% A: %0.4f%% F: %0.4f%%' % t)
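A minimal usage sketch, pairing CrossValidation with BernoulliBayes from bayes.py on an invented toy dataset:

from bayes import BernoulliBayes
from validation import CrossValidation

X = [[1, 0, 1], [1, 1, 0], [0, 0, 1], [0, 1, 0]] * 5
Y = [+1, +1, -1, -1] * 5

cv = CrossValidation()
# cat=+1 is the positive class for precision/recall
p, r, a, f = cv.validate(BernoulliBayes(), (X, Y), cat=+1, fold=5)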
winnow.py
'''
@author: Smith
'''

from base import Classifier


class BalanceWinnow(Classifier):
    ''' balanced winnow classifier with labels in {1, 0} '''

    @staticmethod
    def sampling(terms, docs):
        ''' build (X, Y) samples from documents; category '-' maps to 1 '''
        X = []
        Y = []
        for d in docs:
            d.prepare()
            X.append([1 if t in d.segset else 0 for t in terms])
            Y.append(1 if d.cat == '-' else 0)
        return (X, Y)

    @staticmethod
    def theta(terms, docs):
        ''' average total term count per document, usable as a threshold '''
        a = [sum(d.seglist.count(t) for t in terms) for d in docs]
        return float(sum(a)) / len(a)

    def __init__(self, alpha, beta, theta):
        self.alpha = alpha  # promotion factor (> 1)
        self.beta = beta    # demotion factor (0 < beta < 1)
        self.theta = theta  # decision threshold

    def savem(self):
        data = {'numf': self.numf,
                'alpha': self.alpha,
                'beta': self.beta,
                'theta': self.theta,
                'w_pos': self.w_pos,
                'w_neg': self.w_neg}
        return data

    def loadm(self, data):
        self.numf = data['numf']
        self.alpha = data['alpha']
        self.beta = data['beta']
        self.theta = data['theta']
        self.w_pos = data['w_pos']
        self.w_neg = data['w_neg']

    def model(self, X, Y):
        self.numf = len(X[0])
        self.w_pos = [2.0] * self.numf
        self.w_neg = [1.0] * self.numf
        sit = range(len(X))
        fit = range(self.numf)
        for i in sit:
            p = self.predict(X[i])
            if p == 1 and p != Y[i]:
                # false positive: demote positive weights, promote negative
                for f in fit:
                    if X[i][f] != 0:
                        self.w_pos[f] *= self.beta
                        self.w_neg[f] *= self.alpha
            elif p == 0 and p != Y[i]:
                # false negative: promote positive weights, demote negative
                for f in fit:
                    if X[i][f] != 0:
                        self.w_pos[f] *= self.alpha
                        self.w_neg[f] *= self.beta

    def predict(self, x):
        r = range(self.numf)
        s = sum((self.w_pos[i] - self.w_neg[i]) * x[i] for i in r)
        return 1 if s > self.theta else 0
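A minimal usage sketch on invented toy data where the class is determined by the first feature; the alpha/beta/theta values are arbitrary:

from winnow import BalanceWinnow

X = [[1, 0, 1], [1, 1, 0], [0, 1, 1], [0, 0, 1]] * 10
Y = [1, 1, 0, 0] * 10

bw = BalanceWinnow(alpha=2.0, beta=0.5, theta=1.0)
bw.model(X, Y)
print(bw.predict([1, 1, 0]))  # expected: 1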