@johnsmith17th
Created May 27, 2013 05:30

base.py
'''
@author: John Smith
'''

import json


class Classifier(object):
    ''' base classifier '''

    def __init__(self, **kwargs):
        pass

    def loadm(self, data):
        ''' load the model from a dictionary object '''
        pass

    def savem(self):
        ''' return the model as a dictionary object '''
        return {}

    def load(self, filename):
        ''' load the model from a JSON file '''
        with open(filename, 'r') as finp:
            data = json.loads(finp.read())
        self.loadm(data)

    def save(self, filename):
        ''' save the model to a JSON file '''
        text = json.dumps(self.savem(), ensure_ascii=False, indent=4)
        with open(filename, 'w') as fout:
            fout.write(text)

    def model(self, X, Y):
        ''' build the classifier model
        X: the feature weight array of the samples
        Y: the label array of the samples
        '''
        pass

    def predict(self, x):
        ''' predict the label of a sample '''
        pass
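A minimal sketch of the intended subclassing and JSON persistence contract; the MajorityClassifier subclass and the file name below are invented for illustration:

from base import Classifier

class MajorityClassifier(Classifier):
    ''' toy subclass: always predicts the most frequent training label '''

    def model(self, X, Y):
        self.label = max(set(Y), key=Y.count)

    def savem(self):
        return {'label': self.label}

    def loadm(self, data):
        self.label = data['label']

    def predict(self, x):
        return self.label

mc = MajorityClassifier()
mc.model([[0], [1], [2]], [+1, +1, -1])
mc.save('mc.json')                # persists {"label": 1} as JSON
mc2 = MajorityClassifier()
mc2.load('mc.json')               # round-trip through loadm()
print(mc2.predict([0]))           # 1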

bayes.py
'''
@author: John Smith
'''

import math

from base import Classifier


class NaiveBayes(Classifier):
    ''' base naive bayes classifier '''

    @staticmethod
    def sampling(terms, docs):
        ''' build (X, Y) samples from documents; category '-' maps to +1 '''
        X = []  # features: term counts per document
        Y = []  # labels
        for d in docs:
            d.prepare()
            X.append([d.seglist.count(t) for t in terms])
            Y.append(1 if d.cat == '-' else -1)
        return (X, Y)

    def savem(self):
        data = {'probc': self.probc,
                'probf': self.probf,
                'cats': self.cats,
                'numx': self.numx,
                'numf': self.numf,
                'numc': self.numc}
        return data

    def loadm(self, data):
        self.probc = data['probc']
        self.probf = data['probf']
        self.cats = data['cats']
        self.numx = data['numx']
        self.numf = data['numf']
        self.numc = data['numc']

    def model(self, X, Y):
        Classifier.model(self, X, Y)
        self.probc = []  # P(c) for each category
        self.probf = []  # P(f|c) for each category
        self.cats = list(set(Y))
        self.numx = len(X)          # number of samples
        self.numf = len(X[0])       # number of features
        self.numc = len(self.cats)  # number of categories


class BernoulliBayes(NaiveBayes):
    ''' bernoulli naive bayes classifier '''

    def model(self, X, Y):
        NaiveBayes.model(self, X, Y)
        sit = range(self.numx)
        cit = range(self.numc)
        fit = range(self.numf)
        sta1 = lambda c: sum(1 for y in Y if y == c)  # N(cat)
        sta2 = lambda f, c: sum(1 for i in sit if Y[i] == c and X[i][f] > 0)  # N(feature, cat)
        for c in cit:
            ci = self.cats[c]
            nc = sta1(ci)  # N(c)
            pc = float(nc) / float(self.numx)  # P(c)
            pf = [0] * self.numf
            for f in fit:
                fn = sta2(f, ci)  # N(f, c)
                pf[f] = float(1 + fn) / float(self.numc + nc)  # P(f|c), add-one smoothed
            self.probc.append(pc)
            self.probf.append(pf)

    def predict(self, x):
        p = [0] * self.numc
        cit = range(self.numc)
        fit = range(self.numf)
        for c in cit:
            pi = 1
            for f in fit:
                # present features contribute P(f|c), absent ones 1 - P(f|c)
                if x[f] > 0:
                    pi *= self.probf[c][f]
                else:
                    pi *= (1.0 - self.probf[c][f])
            p[c] = pi * self.probc[c]
        return self.cats[p.index(max(p))]


class MultinomialBayes(NaiveBayes):
    ''' multinomial naive bayes classifier '''

    def model(self, X, Y):
        NaiveBayes.model(self, X, Y)
        sit = range(self.numx)
        cit = range(self.numc)
        fit = range(self.numf)
        sta1 = lambda c: sum(1 for y in Y if y == c)  # N(cat)
        sta2 = lambda f, c: sum(X[i][f] for i in sit if Y[i] == c)  # F(feature, cat)
        for c in cit:
            ci = self.cats[c]
            nc = sta1(ci)  # N(c)
            pc = float(nc) / float(self.numx)  # P(c)
            pf = [0] * self.numf
            for f in fit:
                pf[f] = sta2(f, ci)
            ss = sum(pf) + self.numx  # smoothing denominator
            for f in fit:
                pf[f] = float(1 + pf[f]) / float(ss)  # P(f|c), add-one smoothed
            self.probc.append(pc)
            self.probf.append(pf)

    def predict(self, x):
        p = [0] * self.numc
        cit = range(self.numc)
        fit = range(self.numf)
        for c in cit:
            pi = 1
            for f in fit:
                nf = x[f]
                # multinomial likelihood term; the factorial is constant
                # across categories, so it does not change the argmax
                pi *= float(self.probf[c][f] ** nf) / math.factorial(nf)
            p[c] = pi * self.probc[c]
        return self.cats[p.index(max(p))]
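A minimal usage sketch for BernoulliBayes; the feature vectors and labels are invented for illustration:

from bayes import BernoulliBayes

# toy samples: three binary features, labels +1 / -1
X = [[1, 0, 1], [1, 1, 0], [0, 0, 1], [0, 1, 0]]
Y = [+1, +1, -1, -1]

nb = BernoulliBayes()
nb.model(X, Y)
print(nb.predict([1, 0, 0]))  # expected: 1
nb.save('nb.json')            # persist the model via the base class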
bnetwork.py
'''
@author: Smith
'''

import random

from bayes import BernoulliBayes
from nnetwork import NeuralNetwork
from utils import udoc  # external helper module, not included in this gist


class BayesNetwork(object):
    ''' two-stage classifier: one naive bayes per feature group,
    with a neural network combining their outputs '''

    def __init__(self, features):
        tar = {+1: [+1], -1: [-1]}
        self.features = features
        self.bayes = {}
        for k in self.features:
            # NOTE: the original called NaiveBayes(len(self.features[k]), [+1, -1]),
            # a constructor signature that does not exist in bayes.py; a concrete
            # subclass such as BernoulliBayes is assumed here so predict() works
            self.bayes[k] = BernoulliBayes()
        self.network = NeuralNetwork(tar, len(self.bayes), len(self.bayes), 1, 0.01)

    def model(self, docs):
        # udoc.sampling is assumed to return an (X, Y) pair,
        # analogous to NaiveBayes.sampling
        sf = {}
        for k in self.features:
            s = udoc.sampling(self.features[k], docs, {'+': +1, '-': -1})
            sf[k] = s
            self.bayes[k].model(*s)
        SX = []  # network inputs: one bayes prediction per feature group
        SY = []  # network targets
        for i in range(len(docs)):
            d = docs[i]
            c = +1 if d.cat == '+' else -1
            SX.append([self.bayes[k].predict(sf[k][0][i]) for k in self.features])
            SY.append(c)
        self.network.model(SX, SY)

    def predict(self, doc):
        # the original also passed the true label doc.cat to the network,
        # which NeuralNetwork.predict neither expects nor should see
        sx = []
        for k in self.features:
            s = udoc.sampling_one(self.features[k], doc, {'+': +1, '-': -1})
            sx.append(self.bayes[k].predict(s))
        return self.network.predict(sx)

    def validate(self, docs, fold=10):
        ''' k-fold cross validation; returns averaged (P, R, A, F) '''
        p, r, a, f = [0] * 4
        s = list(docs)
        random.shuffle(s)
        for k in range(fold):
            trainset = [x for i, x in enumerate(s) if i % fold != k]
            testset = [x for i, x in enumerate(s) if i % fold == k]
            (pi, ri, ai, fi) = self.test(trainset, testset)
            print((pi, ri, ai, fi))
            p += pi; r += ri; a += ai; f += fi
        m = float(fold)
        print((p / m, r / m, a / m, f / m))
        return (p / m, r / m, a / m, f / m)

    def test(self, trainset, testset):
        self.model(trainset)
        pc = [(self.predict(x), +1 if x.cat == '+' else -1) for x in testset]
        mp, np, nr, ma = [0] * 4
        for x in pc:
            if x[0] == -1 and x[1] == -1: mp += 1  # true positives for '-'
            if x[0] == -1: np += 1  # predicted '-'
            if x[1] == -1: nr += 1  # actual '-'
            if x[0] == x[1]: ma += 1  # correct predictions
        p = (float(mp) / float(np)) if np > 0 else 0  # precision
        r = (float(mp) / float(nr)) if nr > 0 else 0  # recall
        a = float(ma) / float(len(pc))  # accuracy
        f = self.f_measure(p, r, 1.0)  # F-measure
        return (p, r, a, f)

    def f_measure(self, p, r, beta=1):
        b = beta ** 2
        try:
            return float((b + 1) * p * r) / float(b * p + r)
        except ZeroDivisionError:
            return 0
boost.py
'''
@author: John Smith
'''

import copy
import math

from base import Classifier


class AdaBoost(Classifier):
    ''' adaboost ensemble over a base classifier with {+1, -1} labels '''

    def __init__(self, classifier, T):
        self.classifier = classifier  # prototype weak classifier
        self.T = T                    # number of boosting rounds

    def model(self, X, Y):
        self.alpha = []
        self.classifiers = []
        N = len(X)
        nit = range(N)
        w = [1.0 / N] * N  # sample weights, kept normalized
        for t in range(self.T):
            cls = copy.deepcopy(self.classifier)
            # NOTE: the Classifier API has no way to pass the sample weights w
            # to the weak learner, so every round retrains on the same
            # unweighted data; a weight-aware (or resampling) weak learner is
            # needed for boosting to take effect
            cls.model(X, Y)
            Yp = [cls.predict(x) for x in X]
            # weighted error: w already sums to 1, so no division by N
            e = sum(w[i] for i in nit if Yp[i] != Y[i])
            # clamp to keep the log finite when a round is perfect or hopeless
            e = min(max(e, 1e-10), 1.0 - 1e-10)
            a = 0.5 * math.log((1 - e) / e)
            for i in nit:
                if Yp[i] == Y[i]:
                    w[i] *= math.exp(-a)
                else:
                    w[i] *= math.exp(a)
            z = sum(w) * 1.0
            for i in nit:
                w[i] /= z
            self.alpha.append(a)
            self.classifiers.append(cls)

    def predict(self, x):
        ''' weighted vote of the T weak classifiers '''
        p = sum(self.alpha[t] * self.classifiers[t].predict(x) for t in range(self.T))
        return 1 if p > 0 else -1
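A minimal usage sketch, assuming BernoulliBayes from bayes.py as the weak learner and toy data invented for illustration; as noted in the comments above, the weak learners cannot see the sample weights, so all rounds train identically here:

from bayes import BernoulliBayes
from boost import AdaBoost

X = [[1, 0], [1, 1], [0, 1], [0, 0]]
Y = [+1, +1, -1, -1]

booster = AdaBoost(BernoulliBayes(), T=5)
booster.model(X, Y)
print(booster.predict([1, 0]))  # expected: 1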
nnetwork.py
'''
@author: John Smith
'''

import math
import random

from base import Classifier


class NeuralNetwork(Classifier):
    ''' neural network classifier (single hidden layer, tanh activation) '''

    @staticmethod
    def sampling(terms, docs):
        ''' build (X, Y) samples from documents; category '-' maps to +1 '''
        X = []  # features
        Y = []  # labels
        for d in docs:
            d.prepare()
            X.append([d.seglist.count(t) for t in terms])
            Y.append(1 if d.cat == '-' else -1)
        return (X, Y)

    def __init__(self, target, i, h, o, r=0.01):
        self.target = target
        # network output vector for each category,
        # e.g. {+1: [+1, +1, +1], -1: [-1, -1, -1], ...}
        self.ni = i    # number of input nodes
        self.nh = h    # number of hidden nodes
        self.no = o    # number of output nodes
        self.rate = r  # learning rate

    def savem(self):
        data = {'target': self.target, 'rate': self.rate,
                'ni': self.ni, 'nh': self.nh, 'no': self.no,
                'vi': self.vi, 'vh': self.vh, 'vo': self.vo,
                'wi': self.wi, 'wo': self.wo}
        return data

    def loadm(self, data):
        self.target = data['target']
        self.rate = data['rate']
        self.ni = data['ni']
        self.nh = data['nh']
        self.no = data['no']
        self.vi = data['vi']
        self.vh = data['vh']
        self.vo = data['vo']
        self.wi = data['wi']
        self.wo = data['wo']

    def model(self, X, Y):
        ri = range(self.ni)
        rh = range(self.nh)
        ro = range(self.no)
        rx = range(len(X))

        self.vi = [1.0] * self.ni  # values of input nodes
        self.vh = [1.0] * self.nh  # values of hidden nodes
        self.vo = [1.0] * self.no  # values of output nodes

        # weight matrix from input nodes to hidden nodes
        self.wi = [[random.uniform(-self.rate, self.rate) for j in ri] for x in rh]
        # weight matrix from hidden nodes to output nodes
        self.wo = [[random.uniform(-self.rate, self.rate) for j in rh] for x in ro]

        # train the network: one forward/backward pass per sample
        for i in rx:
            self.forward(X[i])
            self.backpropagate(self.target[Y[i]])

    def predict(self, x):
        ''' return the category whose target vector is nearest
        (by euclidean distance) to the network output '''
        output = self.forward(x)
        d = {t: self.euclidean(output, self.target[t]) for t in self.target}
        return min(d, key=d.get)

    def forward(self, inputs):
        # activate input nodes
        for i in range(self.ni):
            self.vi[i] = inputs[i]
        # activate hidden nodes
        for i in range(self.nh):
            self.vh[i] = self.sigmoid(self.dotproduct(self.vi, self.wi[i]))
        # activate output nodes
        for i in range(self.no):
            self.vo[i] = self.sigmoid(self.dotproduct(self.vh, self.wo[i]))
        return self.vo[:]

    def backpropagate(self, outputs):
        ri = range(self.ni)
        rh = range(self.nh)
        ro = range(self.no)

        # error terms for the output layer
        o_deltas = [0.0] * self.no
        for i in ro:
            e = outputs[i] - self.vo[i]
            o_deltas[i] = self.dsigmoid(self.vo[i]) * e

        # error terms for the hidden layer
        h_deltas = [0.0] * self.nh
        for i in rh:
            e = 0.0
            for j in ro:
                e += o_deltas[j] * self.wo[j][i]
            h_deltas[i] = self.dsigmoid(self.vh[i]) * e

        # update hidden-to-output weights
        for i in ro:
            for j in rh:
                self.wo[i][j] += self.rate * o_deltas[i] * self.vh[j]

        # update input-to-hidden weights
        for i in rh:
            for j in ri:
                self.wi[i][j] += self.rate * h_deltas[i] * self.vi[j]

        # squared error of this sample
        return sum(0.5 * (outputs[i] - self.vo[i]) ** 2 for i in ro)

    def sigmoid(self, x):
        ''' activation function: tanh, not the logistic sigmoid '''
        return math.tanh(x)

    def dsigmoid(self, x):
        ''' derivative of tanh, expressed in terms of its output '''
        return 1.0 - x ** 2

    def dotproduct(self, x, y):
        ''' dot product of two vectors '''
        return math.fsum(x[i] * y[i] for i in range(len(x)))

    def euclidean(self, x, y):
        ''' euclidean distance between two vectors '''
        return math.sqrt(math.fsum((x[i] - y[i]) ** 2 for i in range(len(x))))
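A minimal training sketch on an invented, linearly separable toy problem (the label is the sign of the first coordinate). The network has no bias nodes, so only through-the-origin boundaries are learnable, and model() makes a single pass over the samples, so they are repeated to simulate epochs; the seed and hyper-parameters are arbitrary:

import random
from nnetwork import NeuralNetwork

random.seed(0)
target = {+1: [+1], -1: [-1]}  # desired output vector per category
net = NeuralNetwork(target, 2, 4, 1, 0.2)

# each block of four samples is one "epoch"
X = [[1.0, 0.5], [0.8, -0.2], [-1.0, 0.3], [-0.6, -0.8]] * 200
Y = [+1, +1, -1, -1] * 200
net.model(X, Y)
print([net.predict(x) for x in X[:4]])  # should print [1, 1, -1, -1]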
validation.py
'''
@author: John Smith
'''

import random


class CrossValidation(object):

    def shuffle(self, samples):
        ''' return a shuffled copy of the (X, Y) sample pair '''
        (X, Y) = samples
        array = list(range(len(X)))
        random.shuffle(array)
        return ([X[i] for i in array], [Y[i] for i in array])

    def validate(self, classifier, samples, cat, fold=10):
        ''' k-fold cross validation; cat is the positive category for
        precision/recall; returns averaged (P, R, A, F) '''
        p, r, a, f = [0] * 4
        X, Y = self.shuffle(samples)
        sit = range(len(X))
        for k in range(fold):
            trainx = [X[i] for i in sit if i % fold != k]
            trainy = [Y[i] for i in sit if i % fold != k]
            testx = [X[i] for i in sit if i % fold == k]
            testy = [Y[i] for i in sit if i % fold == k]
            (pi, ri, ai, fi) = self.test(classifier, trainx, trainy, testx, testy, cat)
            self.printf(pi, ri, ai, fi)
            p += pi; r += ri; a += ai; f += fi
        m = float(fold)
        self.printf(p / m, r / m, a / m, f / m)
        return (p / m, r / m, a / m, f / m)

    def test(self, classifier, trainx, trainy, testx, testy, cat):
        classifier.model(trainx, trainy)
        pc = [(classifier.predict(testx[i]), testy[i]) for i in range(len(testx))]
        mp, np, nr, ma = [0] * 4
        for x in pc:
            if x[0] == cat and x[1] == cat: mp += 1  # true positives
            if x[0] == cat: np += 1  # predicted positives
            if x[1] == cat: nr += 1  # actual positives
            if x[0] == x[1]: ma += 1  # correct predictions
        p = (float(mp) / float(np)) if np > 0 else 0  # precision
        r = (float(mp) / float(nr)) if nr > 0 else 0  # recall
        a = float(ma) / float(len(pc))  # accuracy
        f = self.f_measure(p, r, 1.0)  # F-measure
        return (p, r, a, f)

    def f_measure(self, p, r, beta=1):
        b = beta ** 2
        try:
            return float((b + 1) * p * r) / float(b * p + r)
        except ZeroDivisionError:
            return 0

    def printf(self, p, r, a, f):
        t = (p * 100.0, r * 100.0, a * 100.0, f * 100.0)
        print('P: %0.4f%% R: %0.4f%% A: %0.4f%% F: %0.4f%%' % t)
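A minimal usage sketch, pairing CrossValidation with BernoulliBayes from bayes.py on an invented toy dataset:

from bayes import BernoulliBayes
from validation import CrossValidation

X = [[1, 0, 1], [1, 1, 0], [0, 0, 1], [0, 1, 0]] * 5
Y = [+1, +1, -1, -1] * 5

cv = CrossValidation()
# cat=+1 is the positive class for precision/recall
p, r, a, f = cv.validate(BernoulliBayes(), (X, Y), cat=+1, fold=5)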
winnow.py
'''
@author: Smith
'''

from base import Classifier


class BalanceWinnow(Classifier):
    ''' balanced winnow classifier with labels in {1, 0} '''

    @staticmethod
    def sampling(terms, docs):
        ''' build (X, Y) samples from documents; category '-' maps to 1 '''
        X = []
        Y = []
        for d in docs:
            d.prepare()
            X.append([1 if t in d.segset else 0 for t in terms])
            Y.append(1 if d.cat == '-' else 0)
        return (X, Y)

    @staticmethod
    def theta(terms, docs):
        ''' average total term count per document, usable as a threshold '''
        a = [sum(d.seglist.count(t) for t in terms) for d in docs]
        return float(sum(a)) / len(a)

    def __init__(self, alpha, beta, theta):
        self.alpha = alpha  # promotion factor (> 1)
        self.beta = beta    # demotion factor (0 < beta < 1)
        self.theta = theta  # decision threshold

    def savem(self):
        data = {'numf': self.numf,
                'alpha': self.alpha,
                'beta': self.beta,
                'theta': self.theta,
                'w_pos': self.w_pos,
                'w_neg': self.w_neg}
        return data

    def loadm(self, data):
        self.numf = data['numf']
        self.alpha = data['alpha']
        self.beta = data['beta']
        self.theta = data['theta']
        self.w_pos = data['w_pos']
        self.w_neg = data['w_neg']

    def model(self, X, Y):
        self.numf = len(X[0])
        self.w_pos = [2.0] * self.numf
        self.w_neg = [1.0] * self.numf
        sit = range(len(X))
        fit = range(self.numf)
        for i in sit:
            p = self.predict(X[i])
            if p == 1 and p != Y[i]:
                # false positive: demote positive weights, promote negative
                for f in fit:
                    if X[i][f] != 0:
                        self.w_pos[f] *= self.beta
                        self.w_neg[f] *= self.alpha
            elif p == 0 and p != Y[i]:
                # false negative: promote positive weights, demote negative
                for f in fit:
                    if X[i][f] != 0:
                        self.w_pos[f] *= self.alpha
                        self.w_neg[f] *= self.beta

    def predict(self, x):
        r = range(self.numf)
        s = sum((self.w_pos[i] - self.w_neg[i]) * x[i] for i in r)
        return 1 if s > self.theta else 0
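A minimal usage sketch on invented toy data where the class is determined by the first feature; the alpha/beta/theta values are arbitrary:

from winnow import BalanceWinnow

X = [[1, 0, 1], [1, 1, 0], [0, 1, 1], [0, 0, 1]] * 10
Y = [1, 1, 0, 0] * 10

bw = BalanceWinnow(alpha=2.0, beta=0.5, theta=1.0)
bw.model(X, Y)
print(bw.predict([1, 1, 0]))  # expected: 1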