Created
February 22, 2018 01:18
-
-
Save tekeburak/d7702f798ffbf19440d647ac1ea9512b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
""" | |
Created on Sat Feb 3 17:13:19 2018 | |
@author: burak | |
""" | |
import numpy as np | |
#from sklearn.model_selection import train_test_split | |
from sklearn import preprocessing | |
from sklearn.svm import SVC | |
from sklearn.neighbors import KNeighborsClassifier | |
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis | |
from sklearn.metrics import accuracy_score | |
def load_train_data(): | |
# Change return line related to normal,time,frequency | |
x_train = np.load('/content/my_drive/tut_kaggle/X_train.npy') | |
#Compute the average over the frequency axis=1 | |
#return np.mean(x_train, axis=1) | |
#Compute the average over the time axis=2 | |
return np.mean(x_train, axis=2) | |
#20040-dimensional vector from each sample | |
#return np.resize(x_train,(4500,20040)) | |
def load_test_data(): | |
x_test = np.load('/content/my_drive/tut_kaggle/X_test.npy') | |
#Compute the average over the frequency axis=1 | |
#return np.mean(x_test, axis=1) | |
#Compute the average over the time axis=2 | |
return np.mean(x_test, axis=2) | |
#20040-dimensional vector from each sample | |
#return np.resize(x_test,(1500,20040)) | |
def load_train_labels(): | |
labels = [] | |
with open ('/content/my_drive/tut_kaggle/y_train.csv', 'r') as fp: | |
for line in fp: | |
# Skip the first line: | |
if "id,scene_label" in line: | |
continue | |
else: | |
values = line.split(",") | |
label = values[1] | |
labels.append(label) | |
return np.asarray(labels) | |
def load_cross_val(X_train, y_train): | |
train_index = [] | |
test_index = [] | |
train_data = [] | |
train_labels = [] | |
test_data = [] | |
test_labels = [] | |
with open ('/content/my_drive/tut_kaggle/crossvalidation_train.csv', 'r') as fp: | |
for line in fp: | |
# Skip the first line: | |
if "id,scene_label,set" in line: | |
continue | |
else: | |
values = line.strip().split(',') | |
if (values[2] == "train"): | |
train_index.append(values[0]) | |
else: | |
test_index.append(values[0]) | |
train_index = list(map(int, train_index)) | |
test_index = list(map(int, test_index)) | |
for line in train_index: | |
train_data.append(X_train[line,:]) | |
train_labels.append(y_train[line]) | |
for line in test_index: | |
test_data.append(X_train[line,:]) | |
test_labels.append(y_train[line]) | |
return train_data, test_data, train_labels, test_labels | |
#if __name__ == "__main__": should be removed before running it in Colab. | |
#if __name__ == "__main__": | |
# main code | |
X_train = load_train_data() | |
y_str_train = load_train_labels() # It's train data with class names. | |
# So convert the names to the number. | |
le = preprocessing.LabelEncoder() | |
le.fit(y_str_train) | |
y_train = le.transform(y_str_train) | |
X_train, X_test, y_train, y_test = load_cross_val(X_train, y_train) | |
# SVM Classifiers' parameters. I optimized C=13 and tol=0.01, penalty and tolerance respectively | |
clf_SVC = SVC(C=13, cache_size=200, class_weight=None, decision_function_shape='ovr', | |
degree=3, gamma= 'auto', kernel='rbf',max_iter=-1, probability=False, | |
random_state=0, shrinking=True,tol=0.01, verbose=0) | |
clf_SVC.fit(X_train, y_train) | |
y_pred_SVC = clf_SVC.predict(X_test) | |
acc_SVC = accuracy_score(y_test, y_pred_SVC) | |
# K-Neighbors Classifier | |
clf_KNN = KNeighborsClassifier() | |
clf_KNN.fit(X_train, y_train) | |
y_pred_KNN = clf_KNN.predict(X_test) | |
acc_KNN = accuracy_score(y_test, y_pred_KNN) | |
# Linear Discriminant Analysis | |
clf_LDA = LinearDiscriminantAnalysis() | |
clf_LDA.fit(X_train, y_train) | |
y_pred_LDA = clf_LDA.predict(X_test) | |
acc_LDA = accuracy_score(y_test, y_pred_LDA) | |
# As you can see that, Support Vector Machine Classifier's accuracy | |
# is the best. So we can use it for creating submission file. | |
secret_test = load_test_data() | |
y_pred = clf_SVC.predict(secret_test) | |
labels = list(le.inverse_transform(y_pred)) | |
with open("/content/my_drive/tut_kaggle/sub.csv", "w") as fp: | |
fp.write("Id,Scene_label\n") | |
for i, label in enumerate(labels): | |
fp.write("%d,%s" % (i, label)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment