Created
May 1, 2017 06:54
-
-
Save zhengyangchoong/cee404d7c8cf71419a3e7dc6874afedb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import random | |
import csv | |
import collections | |
import matplotlib.pyplot as plt | |
import math | |
from sklearn import linear_model | |
from sklearn import svm | |
import cPickle as pickle | |
plt.style.use('fivethirtyeight') | |
def parseCSV(filename, default_age=30):
    """Read a CSV file into one ordered list of values per column.

    The first row is taken as the header row; every later non-blank row
    contributes one entry to each column's list, in file order.

    Cell conversion rules:
      * "Sex": "male" becomes -1.0 and "female" becomes 1.0.
      * "Age": an empty cell is replaced by ``default_age``.
      * Any cell that parses as a float is stored as a float.
      * Any other empty cell is stored as None.
      * Everything else is kept as the original string.

    Args:
        filename: path of the CSV file. Its header row must contain the
            columns "Sex" and "Age".
        default_age: value substituted for a missing age (default 30).

    Returns:
        A ``(headers, predictors)`` tuple: the list of column names and a
        dict mapping each column name to its list of values. A completely
        empty file yields ``([], {})``.

    Raises:
        ValueError: if the header row has no "Sex" or "Age" column.
    """
    import sys  # local import: only needed to choose the file mode below

    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        handle = open(filename, 'rb')
    else:
        handle = open(filename, 'r', newline='')

    headers = []
    predictors = {}
    with handle as f:
        reader = csv.reader(f, delimiter=",")
        for line_no, row in enumerate(reader):
            if line_no == 0:
                headers = row
                for name in headers:
                    predictors[name] = []
                sex_col = headers.index("Sex")
                age_col = headers.index("Age")
                continue

            if not row:
                # csv.reader yields [] for a blank line; nothing to record.
                continue

            # Encode sex numerically so it can be used as a model feature.
            if row[sex_col] == "male":
                row[sex_col] = -1
            elif row[sex_col] == "female":
                row[sex_col] = 1

            # Impute missing ages instead of leaving a hole in the feature.
            if row[age_col] == '':
                row[age_col] = default_age

            for i, cell in enumerate(row):
                try:
                    value = float(cell)
                except (TypeError, ValueError):
                    # Not numeric: keep the raw cell (name, ticket, ...).
                    value = cell
                if value == '':
                    predictors[headers[i]].append(None)
                else:
                    predictors[headers[i]].append(value)
    return (headers, predictors)
# headers: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'] | |
def basicstuff(predictors):
    """Print headline counts for the data set.

    Prints the number of passengers (length of the "PassengerId" column)
    and the number of survivors (sum of the "Survived" column).

    Two plotting helpers for the age and class distributions are defined
    here as closures over ``predictors``; this function does not call them.

    Args:
        predictors: dict mapping column name to a list of values, as
            returned by parseCSV.
    """
    print("No. of people: {}".format(len(predictors['PassengerId'])))
    # On the training data the survival fraction is 38.4%.
    print("No. survived: {}".format(sum(predictors['Survived'])))

    def plotAgeHist():
        # Histogram of the known ages, saved to histogram_age.png.
        known_ages = [age for age in predictors['Age'] if age is not None]
        plt.hist(known_ages)
        plt.xlabel("Age")
        plt.ylabel("Frequency")
        plt.savefig("histogram_age.png", bbox_inches='tight', dpi=300)
        plt.clf()

    def plotClass():
        # Bar chart of passenger counts per class, saved to
        # histogram_class.png.
        class_counts = collections.Counter(predictors['Pclass'])
        print(class_counts)
        positions = [1.0, 2.0, 3.0]
        heights = [class_counts[float(position)] for position in positions]
        print(positions)
        print(heights)
        plt.ylabel("Frequency")
        plt.bar(positions, heights, color='C0', align='center')
        plt.xticks(positions, ["First class", "Second class", "Third class"])
        plt.savefig("histogram_class.png", bbox_inches='tight', dpi=300)
        plt.clf()
def crossplots(predictors):
    """Visually look for correlations between survival and other columns.

    Defines three closures over ``predictors``:
      * classSurvival: survival rate per passenger class,
      * ageSurvival: survival rate per 5-year age bucket,
      * genderSurvival: survival rate per sex.

    Only classSurvival and genderSurvival are run; each prints the
    correlation matrix from ``np.corrcoef``. ageSurvival is kept available
    but is not called.

    Args:
        predictors: dict mapping column name to a list of values, as
            returned by parseCSV ("Sex" encoded as -1.0 / 1.0, "Survived"
            as 0.0 / 1.0).
    """

    def _rate(outcomes):
        # Mean of a list of 0/1 outcomes. An empty group gives NaN rather
        # than a ZeroDivisionError, and float() keeps the division exact on
        # Python 2 even if the outcomes are ints.
        if not outcomes:
            return float('nan')
        return float(sum(outcomes)) / len(outcomes)

    def classSurvival():
        by_class = {1.0: [], 2.0: [], 3.0: []}
        for pclass, survived in zip(predictors["Pclass"],
                                    predictors["Survived"]):
            if pclass in by_class:
                by_class[pclass].append(survived)

        x = [1.0, 2.0, 3.0]
        y = [_rate(by_class[pclass]) for pclass in x]
        print(np.corrcoef(x, y))

        plt.plot(x, y)
        plt.xticks(x, ["First class", "Second class", "Third class"])
        plt.ylabel("Probability of survival")
        # Saving is currently disabled; the figure is built and discarded.
        #plt.savefig("hist_survivalbyclass.png", bbox_inches = 'tight', dpi = 300)
        plt.clf()

    def ageSurvival():
        # Bucket passengers with a known age into 5-year bins keyed by
        # floor(age / 5).
        by_bucket = collections.defaultdict(list)
        for age, survived in zip(predictors["Age"], predictors["Survived"]):
            if age is None:
                continue
            by_bucket[math.floor(age / 5.0)].append(survived)

        buckets = sorted(by_bucket)
        survivalwithage = [_rate(by_bucket[bucket]) for bucket in buckets]
        # Label each bucket by its upper age bound, hence the "<Age" axis.
        xes = [(bucket + 1) * 5 for bucket in buckets]

        plt.xlabel("<Age")
        plt.ylabel("Probability of survival")
        plt.plot(xes, survivalwithage)
        plt.savefig("agesurvival.png", bbox_inches='tight', dpi=300)
        plt.clf()

    def genderSurvival():
        m = []
        f = []
        for sex, survived in zip(predictors["Sex"], predictors["Survived"]):
            if sex == 1.0:
                f.append(survived)
            elif sex == -1.0:
                m.append(survived)
        y = [_rate(m), _rate(f)]
        print(np.corrcoef([-1, 1], y))

    classSurvival()
    genderSurvival()
    #ageSurvival()
def classify(predictors, testpredictors):
    """Predict survival for the test set and write the result to answer.csv.

    Each passenger becomes a feature row ``[Age, Sex, Pclass]`` (in that
    order). The last 50 training rows are held out as a validation split
    for logisticfit. Only svmfit is run: it loads a previously pickled
    classifier from model.p, predicts on the test features and writes
    "PassengerId,Survived" rows to answer.csv.

    Args:
        predictors: training columns, as returned by parseCSV; must include
            "PassengerId", "Age", "Sex", "Pclass" and "Survived".
        testpredictors: test columns, as returned by parseCSV; must include
            "PassengerId", "Age", "Sex" and "Pclass".
    """

    def _features(table):
        # One [Age, Sex, Pclass] row per passenger, in file order.
        return [list(row)
                for row in zip(table['Age'], table['Sex'], table['Pclass'])]

    x = _features(predictors)
    y = list(predictors['Survived'])
    realx = _features(testpredictors)
    ids = list(testpredictors["PassengerId"])
    print(len(realx))

    # Hold out the last 50 training rows for a quick accuracy check.
    x_train = x[:-50]
    y_train = y[:-50]
    x_test = x[-50:]
    y_test = y[-50:]

    def logisticfit():
        # Fit a logistic regression and print how many of the held-out rows
        # it gets right, followed by the estimator's parameters.
        logistic = linear_model.LogisticRegression(C=1e6)
        logistic.fit(x_train, y_train)
        testresult = logistic.predict(x_test)
        correct = sum(1 for predicted, actual in zip(testresult, y_test)
                      if predicted == actual)
        print(correct)
        print(logistic.get_params())

    def svmfit():
        # Training recipe that produced model.p, kept for reference:
        #svc = svm.SVC(kernel='poly', degree=2)
        #svc.fit(x_train, y_train)
        #testresult = svc.predict(x_test)
        #scores = [1 for i in xrange(len(testresult)) if testresult[i] == y_test[i]]
        #print scores
        #pickle.dump(svc, open("model.p", "wb"))

        # NOTE(review): unpickling executes whatever the file contains; only
        # load a model.p that you produced yourself.
        with open("model.p", "rb") as model_file:
            svc = pickle.load(model_file)
        final = svc.predict(realx)

        # 'with' guarantees the submission file is flushed and closed.
        with open("answer.csv", "w") as out:
            out.write("PassengerId,Survived\n")
            for passenger_id, label in zip(ids, final):
                out.write("{},{}\n".format(int(passenger_id), int(label)))

    #logisticfit()
    svmfit()
def main():
    """Load train.csv and run the exploratory cross-plots."""
    _, predictors = parseCSV('train.csv')
    crossplots(predictors)
    # To produce predictions instead, also load test.csv and classify:
    #print "---"
    #(_, testpredictors) = parseCSV('test.csv')
    #classify(predictors,testpredictors)


# Run only when executed as a script, so importing this module does not
# trigger file reads and plotting.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment