Created
May 27, 2013 05:30
Revisions
-
johnsmith17th created this gist
May 27, 2013 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,48 @@ ''' @author: John Smith ''' class Classifier(object): ''' base classifier ''' def __init__(self, **kwargs): pass def loadm(self, data): ''' load model from dictionary object ''' pass def savem(self): ''' get dictionary object of model ''' return { } def load(self, filename): ''' load model from file ''' import json finp = open(filename, 'r') text = finp.read() finp.close() data = json.loads(text) self.loadm(data) pass def save(self, filename): ''' save model to file ''' import json data = self.savem() text = json.dumps(data, ensure_ascii = False, indent = 4) fout = open(filename, 'w') fout.write(text) fout.close() pass def model(self, X, Y): ''' build classifier model ''' ''' X: the feature weight array of samples ''' ''' Y: the label array of samples ''' pass def predict(self, x): ''' to predict the label of sample ''' pass
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,114 @@ ''' @author: John Smith ''' from base import Classifier class NaiveBayes(Classifier): ''' base naive bayes classifier ''' @staticmethod def sampling(terms, docs): X = [] # features Y = [] # labels for d in docs: d.prepare() X.append([d.seglist.count(t) for t in terms]) Y.append(1 if d.cat == '-' else -1) return (X, Y) def savem(self): data = {'probc': self.probc, 'probf': self.probf, 'cats': self.cats, 'numx': self.numx, 'numf': self.numf, 'numc': self.numc, } return data def loadm(self, data): self.probc = data['probc'] self.probf = data['probf'] self.cats = data['cats'] self.numx = data['numx'] self.numf = data['numf'] self.numc = data['numc'] def model(self, X, Y): Classifier.model(self, X, Y) self.probc = [] self.probf = [] self.cats = list(set(Y)) self.numx = len(X) # number of sample self.numf = len(X[0]) # number of feature self.numc = len(self.cats) # number of category class BernoulliBayes(NaiveBayes): ''' bernoulli naive bayes classifier ''' def model(self, X, Y): NaiveBayes.model(self, X, Y) sit = range(self.numx) cit = range(self.numc) fit = range(self.numf) sta1 = lambda c: sum(1 for y in Y if y == c) # N(cat) sta2 = lambda f, c: sum(1 for i in sit if Y[i] == c and X[i][f] > 0) # N(feature, cat) for c in cit: ci = self.cats[c] nc = sta1(ci) # N(c) pc = float(nc) / float(self.numx) # P(c) pf = [0] * self.numf for f in fit: fn = sta2(f, ci) # N(f, c) pf[f] = float(1 + fn) / float(self.numc + nc) self.probc.append(pc) self.probf.append(pf) def predict(self, x): p = [0] * self.numc cit = range(self.numc) fit = range(self.numf) for c in cit: pi = 1 for f in fit: if x[f] > 0: pi *= self.probf[c][f] else: pi *= (1.0 - self.probf[c][f]) p[c] = pi * self.probc[c] return self.cats[p.index(max(p))] class MultinomialBayes(NaiveBayes): ''' multinomial naive bayes classifier ''' def model(self, X, Y): NaiveBayes.model(self, X, Y) sit = range(self.numx) cit = range(self.numc) fit = range(self.numf) sta1 = lambda c: sum(1 for y in Y if y == c) # N(cat) sta2 = lambda f, c: sum(X[i][f] for i in sit if Y[i] == c) # F(feature, cat) for c in cit: ci = self.cats[c] nc = sta1(ci) # N(c) pc = float(nc) / float(self.numx) # P(c) pf = [0] * self.numf for f in fit: pf[f] = sta2(f, ci) ss = sum(pf) + self.numx for f in fit: pf[f] = float(1 + pf[f]) / float(ss) self.probc.append(pc) self.probf.append(pf) def predict(self, x): p = [0] * self.numc cit = range(self.numc) fit = range(self.numf) fact = lambda x: x and x * fact(x - 1) or 1 for c in cit: pi = 1 for f in fit: nf = x[f] pi *= float(self.probf[c][f] ** nf) / float(fact(nf)) p[c] = pi * self.probc[c] return self.cats[p.index(max(p))] This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,80 @@ ''' @author: Smith ''' from bayes import NaiveBayes from nnetwork import NeuralNetwork from utils import udoc import random class BayesNetwork(object): def __init__(self, features): tar = {+1: [+1], -1:[-1]} self.features = features self.bayes = {} for k in self.features: self.bayes[k] = NaiveBayes(len(self.features[k]), [+1, -1]) self.network = NeuralNetwork(tar, len(self.bayes), len(self.bayes), 1, 0.01) pass def model(self, docs): sf = {} for k in self.features: s = udoc.sampling(self.features[k], docs, {'+': +1, '-': -1}) sf[k] = s self.bayes[k].model(s) for i in range(len(docs)): d = docs[i] c = +1 if d.cat == '+' else -1 sx = [] fx = [] for k in self.features: s = sf[k][i] fx.append(self.bayes[k].predict(s)) sx.append((fx, c)) self.network.model(sx) def predict(self, doc): sx = [] c = +1 if doc.cat == '+' else -1 for k in self.features: s = udoc.sampling_one(self.features[k], doc, {'+': +1, '-': -1}) sx.append(self.bayes[k].predict(s)) return self.network.predict((sx, c)) def validate(self, docs, fold = 10): p, r, a, f = [0] * 4 s = list(docs) random.shuffle(s) for k in range(fold): trainset = [x for i, x in enumerate(s) if i % fold != k] testset = [x for i, x in enumerate(s) if i % fold == k] (pi, ri, ai, fi) = self.test(trainset, testset) print (pi, ri, ai, fi) p += pi; r += ri; a += ai; f += fi; m = float(fold) print (p / m, r / m, a / m, f / m) return (p / m, r / m, a / m, f / m) def test(self, trainset, testset): self.model(trainset) pc = [(self.predict(x), +1 if x.cat == '+' else -1) for x in testset] mp, np, nr, ma = [0] * 4 for x in pc: if x[0] == -1 and x[1] == -1: mp += 1 if x[0] == -1: np += 1 if x[1] == -1: nr += 1 if x[0] == x[1]: ma += 1 p = (float(mp) / float(np)) if np > 0 else 0 # precision r = (float(mp) / float(nr)) if nr > 0 else 0 # recall a = float(ma) / float(len(pc)) # accuracy f = self.f_mesure(p, r, 1.0) # f-mesure return (p, r, a, f) def f_mesure(self, p, r, beta = 1): b = beta ** 2 try: f = float((b + 1) * p * r) / float(b * p + r) except: f = 0 return f This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,37 @@ ''' @author: John Smith ''' from base import Classifier class AdaBoost(Classifier): def __init__(self, classifier, T): self.classifier = classifier self.T = T pass def model(self, X, Y): import copy, math self.alpha = [] self.classifiers = [] N = len(X) nit = range(N) w = [1.0 / N] * N for t in range(self.T): cls = copy.deepcopy(self.classifier) cls.model(X, Y) Yp = [cls.predict(x) for x in X] e = sum(w[i] for i in nit if Yp[i] != Y[i]) / float(N) a = 0.5 * math.log((1 - e) / e) for i in nit: if Yp[i] == Y[i]: w[i] *= math.exp(-a) else: w[i] *= math.exp(a) z = sum(w) * 1.0 for i in nit: w[i] /= z self.alpha.append(a) self.classifiers.append(cls) def predict(self, x): p = sum(self.alpha[t] * self.classifiers[t].predict(x) for t in range(self.T)) return 1 if p > 0 else -1 This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,148 @@ ''' @author: John Smith ''' import math, random from base import Classifier class NeuralNetwork(Classifier): ''' neural network classifier ''' @staticmethod def sampling(terms, docs): X = [] # features Y = [] # labels for d in docs: d.prepare() X.append([d.seglist.count(t) for t in terms]) Y.append(1 if d.cat == '-' else -1) return (X, Y) def __init__(self, target, i, h, o, r = 0.01): self.target = target # network output for each category # e.g. {+1: [+1, +1, +1], [-1: [-1, -1, -1], ...} self.ni = i # number of input nodes self.nh = h # number of hidden nodes self.no = o # number of output nodes self.rate = r # learning rate def savem(self): data = {'target': self.target, 'rate': self.rate, 'ni': self.ni, 'nh': self.nh, 'no': self.no, 'vi': self.vi, 'vh': self.vh, 'vo': self.vo, 'wi': self.wi, 'wo': self.wo } return data def loadm(self, data): self.target = data['target'] self.rate = data['rate'] self.ni = data['ni'] self.nh = data['nh'] self.no = data['no'] self.vi = data['vi'] self.vh = data['vh'] self.vo = data['vo'] self.wi = data['wi'] self.wo = data['wo'] def model(self, X, Y): ri = range(self.ni) rh = range(self.nh) ro = range(self.no) rx = range(len(X)) self.vi = [1.0] * self.ni # values of input nodes self.vh = [1.0] * self.nh # values of hidden nodes self.vo = [1.0] * self.no # values of output nodes self.wi = [[random.uniform(-self.rate, self.rate) for j in ri] for x in rh] # weight matrix for input nodes to hidden nodes self.wo = [[random.uniform(-self.rate, self.rate) for j in rh] for x in ro] # weight matrix for hidden nodes to output nodes for i in rx: self.forword(X[i]) self.backpropagate(self.target[Y[i]]) # train the network def predict(self, x): output = self.forword(x) d = {t: self.euclidean(output, self.target[t]) for t in self.target} return min(d, key = d.get) def forword(self, inputs): for i in range(self.ni): self.vi[i] = inputs[i] # activate inputs nodes for i in range(self.nh): self.vh[i] = self.sigmod(self.dotproduct(self.vi, self.wi[i])) # activate hidden nodes for i in range(self.no): self.vo[i] = self.sigmod(self.dotproduct(self.vh, self.wo[i])) # activate output nodes return self.vo[:] def backpropagate(self, outputs): ri = range(self.ni) rh = range(self.nh) ro = range(self.no) o_deltas = [0.0] * self.no for i in ro: e = outputs[i] - self.vo[i] o_deltas[i] = self.dsigmoid(self.vo[i]) * e # calculate error for output h_deltas = [0.0] * self.nh for i in rh: e = 0.0 for j in ro: e += o_deltas[j] * self.wo[j][i] h_deltas[i] = self.dsigmoid(self.vh[i]) * e # calculate error for hidden for i in ro: for j in rh: self.wo[i][j] += self.rate * o_deltas[i] * self.vh[j] # update output weights for i in rh: for j in ri: self.wi[i][j] += self.rate * h_deltas[i] * self.vi[j] # update input weights e = sum([0.5 * (outputs[i] - self.vo[i]) ** 2 for i in ro]) #print e return e def sigmod(self, x): #return 1.0 / (1 + math.exp(-x)) return math.tanh(x) def dsigmoid(self, x): #return x * (1.0 - x) return 1.0 - x ** 2 def dotproduct(self, x, y): # calculate dot product of two vectors return math.fsum(x[i] * y[i] for i in range(len(x))) def euclidean(self, x, y): # calculate euclidean distance of two vector return math.sqrt(math.fsum([math.pow(x[i] - y[i], 2) for i in range(len(x))])) This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,54 @@ ''' @author: John Smith ''' class CrossValidation(object): def shuffle(self, samples): import random (X, Y) = samples array = range(len(X)) random.shuffle(array) samples = ([X[i] for i in array], [Y[i] for i in array]) return samples def validate(self, classifier, samples, cat, fold = 10): p, r, a, f = [0] * 4 X, Y = self.shuffle(samples) sit = range(len(X)) for k in range(fold): trainx = [X[i] for i in sit if i % fold != k] trainy = [Y[i] for i in sit if i % fold != k] testx = [X[i] for i in sit if i % fold == k] testy = [Y[i] for i in sit if i % fold == k] (pi, ri, ai, fi) = self.test(classifier, trainx, trainy, testx, testy, cat) self.printf(pi, ri, ai, fi) p += pi; r += ri; a += ai; f += fi; m = float(fold) self.printf(p / m, r / m, a / m, f / m) return (p / m, r / m, a / m, f / m) def test(self, classifier, trainx, trainy, testx, testy, cat): classifier.model(trainx, trainy) pc = [(classifier.predict(testx[i]), testy[i]) for i in range(len(testx))] mp, np, nr, ma = [0] * 4 for x in pc: if x[0] == cat and x[1] == cat: mp += 1 if x[0] == cat: np += 1 if x[1] == cat: nr += 1 if x[0] == x[1]: ma += 1 p = (float(mp) / float(np)) if np > 0 else 0 # precision r = (float(mp) / float(nr)) if nr > 0 else 0 # recall a = float(ma) / float(len(pc)) # accuracy f = self.f_mesure(p, r, 1.0) # f-mesure return (p, r, a, f) def f_mesure(self, p, r, beta = 1): b = beta ** 2 try: f = float((b + 1) * p * r) / float(b * p + r) except: f = 0 return f def printf(self, p, r, a, f): t = (p * 100.0, r * 100.0, a * 100.0, f * 100.0) print 'P: %0.4f%% R: %0.4f%% A: %0.4f%% F: %0.4f%%' % t This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,71 @@ ''' @author: Smith ''' from base import Classifier class BalanceWinnow(Classifier): @staticmethod def sampling(terms, docs): X = [] Y = [] for d in docs: d.prepare() X.append([1 if t in d.segset else 0 for t in terms]) Y.append(1 if d.cat == '-' else 0) return (X, Y) @staticmethod def theta(terms, docs): a = [sum(d.seglist.count(t) for t in terms) for d in docs] return sum(a) / len(a) def __init__(self, alpha, beta, theta): self.alpha = alpha self.beta = beta self.theta = theta def savem(self): data = {'numf': self.numf, 'alpha': self.alpha, 'beta': self.beta, 'theta': self.theta, 'w_pos': self.w_pos, 'w_neg': self.w_neg, } return data def loadm(self, data): self.features = data['numf'] self.alpha = data['alpha'] self.beta = data['beta'] self.theta = data['theta'] self.w_pos = data['w_pos'] self.w_neg = data['w_neg'] def model(self, X, Y): self.numf = len(X[0]) self.w_pos = [2] * self.numf self.w_neg = [1] * self.numf sit = range(len(X)) fit = range(self.numf) for i in sit: p = self.predict(X[i]) if p == 1 and p != Y[i]: # decrease the weights for f in fit: if X[i][f] != 0: self.w_pos[f] *= self.beta self.w_neg[f] *= self.alpha elif p == 0 and p != Y[i]: # increase the weights for f in fit: if X[i][f] != 0: self.w_pos[f] *= self.alpha self.w_neg[f] *= self.beta def predict(self, x): r = range(self.numf) s = sum((self.w_pos[i] - self.w_neg[i]) * x[i] for i in r) return 1 if s > self.theta else 0