import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import matthews_corrcoef as mc
import operator
NROWS = 25000
CHUNKSIZE = 5000
ID = 'Id'
RESPONSE = 'Response'
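# NROWS caps how many rows of the train/test date files are loaded for the
# start-time/duration features, and CHUNKSIZE controls the chunked reads below
# (assumption: the full files are too large to process comfortably in one pass).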
# analyze and understand the date data: auto-correlation graph
date = pd.read_csv('traindate.csv')
date_feature_cols = np.setdiff1d(date.columns.tolist(), [ID, RESPONSE])
times = []
for feature_name in date_feature_cols:
    df = date[[feature_name]].copy()
    df.columns = ['time']
    df['Station'] = feature_name.split('_')[1][1:]
    df = df.dropna()
    times.append(df)
times = pd.concat(times)
times['time'] = (times['time']*100).astype(int)
time_cnt = times.groupby('time').count()[['Station']].reset_index()
time_cnt.columns = ['time', 'cnt']
plt.plot(time_cnt['time'], time_cnt['cnt'])
plt.show()
time_tick = range(time_cnt['time'].min(), time_cnt['time'].max()+1, 1)
time_tick = pd.DataFrame({'time': time_tick})
time_station_cnt = (pd.merge(time_tick, time_cnt, on='time', how='left').fillna(0))['cnt'].tolist()
auto_corr = [1]+[np.corrcoef(time_station_cnt[:-k], time_station_cnt[k:])[0, 1] for k in range(1, 5000)]
plt.plot(auto_corr)
plt.show()
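# the auto-correlation of the per-tick record counts is expected to show a
# strong periodic peak; the dominant period (~1680 ticks) is treated below as
# a weekly cycle when deriving Startweek/Endweek (an interpretation assumed
# from the plot, not given by the data documentation)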
# read the date data
train = pd.read_csv('traindate.csv', usecols=[ID], nrows=NROWS)
test = pd.read_csv('testdate.csv', usecols=[ID], nrows=NROWS)
train['Starttime'] = -1.0
test['Starttime'] = -1.0
train['Duration'] = -1.0
test['Duration'] = -1.0
# find the starting time and duration for each Id
for tr, te in zip(pd.read_csv('traindate.csv', chunksize=CHUNKSIZE), pd.read_csv('testdate.csv', chunksize=CHUNKSIZE)):
    feature_columns = np.setdiff1d(np.array(tr.columns), [ID])
    tr['Min'] = tr[feature_columns].min(axis=1)
    tr['Duration'] = tr[feature_columns].max(axis=1) - tr[feature_columns].min(axis=1)
    te['Min'] = te[feature_columns].min(axis=1)
    te['Duration'] = te[feature_columns].max(axis=1) - te[feature_columns].min(axis=1)
    # restrict each chunk to the Ids actually loaded above, so the positional
    # .values assignments stay length-aligned with the boolean masks
    tr = tr[tr[ID].isin(train[ID])]
    te = te[te[ID].isin(test[ID])]
    train.loc[train[ID].isin(tr[ID]), 'Starttime'] = tr['Min'].values
    train.loc[train[ID].isin(tr[ID]), 'Duration'] = tr['Duration'].values
    test.loc[test[ID].isin(te[ID]), 'Starttime'] = te['Min'].values
    test.loc[test[ID].isin(te[ID]), 'Duration'] = te['Duration'].values
# concat train and test data
n_train = train.shape[0]
train_test = pd.concat((train, test)).reset_index(drop=True).reset_index(drop=False)
# Startweek and Endweek: the start and end time within a week
train_test = train_test[train_test['Starttime'].notnull()]
train_test['Starttime'] = (train_test['Starttime']*100).astype(int)
train_test['Duration'] = (train_test['Duration']*100).astype(int)
train_test['Endtime'] = train_test['Starttime']+train_test['Duration']
train_test['Startweek'] = train_test['Starttime'].mod(1680)
train_test['Endweek'] = train_test['Endtime'].mod(1680)
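# 1680 ticks (time * 100) is taken as one week here, matching the dominant
# period in the auto-correlation plot above; Startweek/Endweek are the
# positions of the start and end times within that weekly cycle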
# find the difference of Id, Starttime, Duration between neighboring records
train_test['Diff1'] = train_test[ID].diff().fillna(9999999).astype(int)
train_test['Diff2'] = train_test[ID].iloc[::-1].diff().fillna(9999999).astype(int)
train_test = train_test.sort_values(['Starttime'], ascending=True)
train_test['Diff3'] = train_test[ID].diff().fillna(9999999).astype(int)
train_test['Diff4'] = train_test[ID].iloc[::-1].diff().fillna(9999999).astype(int)
train_test['DiffStarttime1'] = train_test['Starttime'].diff().fillna(9999999).astype(int)
train_test['DiffStarttime2'] = train_test['Starttime'].iloc[::-1].diff().fillna(9999999).astype(int)
train_test['DiffDuration1'] = train_test['Duration'].diff().fillna(9999999).astype(int)
train_test['DiffDuration2'] = train_test['Duration'].iloc[::-1].diff().fillna(9999999).astype(int)
# count the number of records with the same Starttime, and the number of records in the next 2.5h
train_test['Samestartcnt'] = 1
num_samebegin = train_test.groupby('Starttime').sum()[['Samestartcnt']].reset_index(drop=False)
train_test = train_test.drop('Samestartcnt', axis=1).merge(num_samebegin, how='left', on='Starttime')
train_test['Next2.5hcnt'] = 1
num_next2_5h = train_test.groupby('Starttime').sum()[['Next2.5hcnt']].reset_index(drop=False)
for i in range(len(num_next2_5h)):
    num_next2_5h.loc[i, 'Next2.5hcnt'] = num_next2_5h[(num_next2_5h['Starttime'] > num_next2_5h['Starttime'][i]) & (num_next2_5h['Starttime'] < (num_next2_5h['Starttime'][i]+25))]['Next2.5hcnt'].sum()
train_test = train_test.drop('Next2.5hcnt', axis=1).merge(num_next2_5h, how='left', on='Starttime')
# separate train and test
train_test = train_test.sort_values(['index'], ascending=True).drop('index', axis=1)
train = train_test[:n_train].reset_index(drop=True)
test = train_test[n_train:].reset_index(drop=True)
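# 'index' is the row order preserved by reset_index(drop=False) before sorting,
# so sorting on it restores the original train-then-test ordering before the split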
# draw the shop-floor graph with graphviz
def cnt_pair(shop_floor, station_pair):
    # count how many parts move along each consecutive pair of stations
    for pair in station_pair:
        try:
            shop_floor[pair] += 1
        except KeyError:
            shop_floor[pair] = 1
    return shop_floor
from graphviz import Graph
date_cols = pd.read_csv('traindate.csv', nrows=2)
date_cols = date_cols.drop('Id', axis=1).count().reset_index()
date_cols['station'] = date_cols['index'].apply(lambda x: x.split('_')[1])
stations = date_cols.drop_duplicates('station')['station'].tolist()
date_cols = date_cols.drop_duplicates('station')['index'].tolist()
date = pd.read_csv('traindate.csv', usecols=date_cols)
date.columns = stations
shop_floor = {}
for i in range(len(date)):
    station_pair = date[i:i+1]
    station_pair = station_pair.columns[station_pair.notnull().any()].tolist()
    station_pair = zip(station_pair[:-1], station_pair[1:])
    shop_floor = cnt_pair(shop_floor, station_pair)
shop_floor = {key: value for key, value in shop_floor.items() if value > 100}
g = Graph(format='pdf')
key = shop_floor.keys()
for aa in key:
    g.edge(aa[0], aa[1], label=str(shop_floor[aa]))
g.render('img/shop_floor', view=True)
# according to the shop_floor graph, there are two parallel production lines: S0-S11 and
# S12-S23; we confirm that the corresponding station pairs have the same number of features,
# for example, both S0 and S12 have 12 features
date_cols = pd.read_csv('trainnumeric.csv', nrows=2)
date_cols = date_cols.drop(RESPONSE, axis=1)
date_cols = date_cols.drop(ID, axis=1).count().reset_index()
date_cols['Station'] = date_cols['index'].apply(lambda x: x.split('_')[1])
stations_feature = Counter(date_cols['Station'].tolist())
print(stations_feature)
# collect the feature columns of each station S0--S51 (used below to merge the parallel lines)
parallel_cols = {}
for i in range(52):
    parallel_cols['S'+str(i)] = date_cols[date_cols['Station'] == ('S'+str(i))]['index'].tolist()
# read the numerical data, and then merge the parallel cols
train_numeric = pd.read_csv('trainnumeric.csv')
for c1, c2 in zip(parallel_cols['S0'], parallel_cols['S12']):
    train_numeric[c1+c2] = train_numeric[[c1, c2]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2], axis=1)
for c1, c2 in zip(parallel_cols['S1'], parallel_cols['S13']):
    train_numeric[c1+c2] = train_numeric[[c1, c2]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2], axis=1)
for c1, c2, c3, c4 in zip(parallel_cols['S2'], parallel_cols['S3'], parallel_cols['S15'], parallel_cols['S14']):
    train_numeric[c1+c2+c3+c4] = train_numeric[[c1, c2, c3, c4]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2, c3, c4], axis=1)
for c1, c2, c3, c4 in zip(parallel_cols['S4'], parallel_cols['S5'], parallel_cols['S17'], parallel_cols['S16']):
    train_numeric[c1+c2+c3+c4] = train_numeric[[c1, c2, c3, c4]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2, c3, c4], axis=1)
for c1, c2, c3, c4 in zip(parallel_cols['S6'], parallel_cols['S7'], parallel_cols['S18'], parallel_cols['S19']):
    train_numeric[c1+c2+c3+c4] = train_numeric[[c1, c2, c3, c4]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2, c3, c4], axis=1)
for c1, c2 in zip(parallel_cols['S8'], parallel_cols['S20']):
    train_numeric[c1+c2] = train_numeric[[c1, c2]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2], axis=1)
for c1, c2, c3, c4, c5, c6 in zip(parallel_cols['S9'], parallel_cols['S10'], parallel_cols['S11'], parallel_cols['S21'], parallel_cols['S22'], parallel_cols['S23']):
    train_numeric[c1+c2+c3+c4+c5+c6] = train_numeric[[c1, c2, c3, c4, c5, c6]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2, c3, c4, c5, c6], axis=1)
for c1, c2 in zip(parallel_cols['S43'], parallel_cols['S44']):
    train_numeric[c1+c2] = train_numeric[[c1, c2]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2], axis=1)
for c1, c2 in zip(parallel_cols['S50'], parallel_cols['S49']):
    train_numeric[c1+c2] = train_numeric[[c1, c2]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2], axis=1)
for c1, c2 in zip(parallel_cols['S36'], parallel_cols['S35']):
    train_numeric[c1+c2] = train_numeric[[c1, c2]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2], axis=1)
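# merging corresponding columns with a row-wise max acts as a coalesce here:
# each part goes through only one of the parallel lines, so at most one of the
# merged columns should be non-NaN for a given row (assumption based on the
# shop-floor graph above)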
# merge the numerical features and the date features
train_data = train_numeric.merge(train, on=ID, how='left')
# cross-validation
mcc0, mcc1, mcc2 = [], [], []
kf = KFold(n_splits=5)
for cv_train_index, cv_test_index in kf.split(train_data):
    cv_train, cv_test = train_data.iloc[cv_train_index], train_data.iloc[cv_test_index]
    feature_cols = np.setdiff1d(cv_train.columns.tolist(), [RESPONSE, ID])
    prior = cv_train[RESPONSE].sum()*1.0/len(cv_train)
    xgb_params = {
        'seed': 0,
        'colsample_bytree': 0.7,
        'silent': 1,
        'subsample': 0.7,
        'learning_rate': 0.1,
        'objective': 'binary:logistic',
        'max_depth': 4,
        'num_parallel_tree': 1,
        'min_child_weight': 2,
        'eval_metric': 'auc',
        'base_score': prior
    }
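    # base_score is set to the positive-class rate of the training fold, a
    # common starting bias for a heavily imbalanced binary target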
    dtrain = xgb.DMatrix(cv_train[feature_cols], cv_train[RESPONSE])
    dtest = xgb.DMatrix(cv_test[feature_cols])
    num_boost_rounds = 50
    model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
    y_pred = model.predict(dtest)
    y_pred = y_pred/y_pred.max()
    critical_val = np.sort(y_pred)[int(len(y_pred)*(1.0-prior))]
    pred = [1 if i >= critical_val else 0 for i in y_pred]
    mcc0.append(mc(pred, cv_test[RESPONSE]))
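    # the cutoff is chosen so that the fraction of predicted positives matches
    # the training-fold prior, and MCC is then computed on those hard labels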
    # xgb importance of the features
    outputfile = open('xgb.fmap', 'w')
    i = 0
    for feat in feature_cols:
        outputfile.write('{0}\t{1}\tq\n'.format(i, feat))
        i += 1
    outputfile.close()
    importance = model.get_fscore(fmap='xgb.fmap')
    importance = sorted(importance.items(), key=operator.itemgetter(1))
    df = pd.DataFrame(importance, columns=['feature', 'fscore'])
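    # importance is sorted in ascending fscore order, so the last 300 rows
    # taken below are the 300 most important features of this fold's model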
    # try a different feature set: the 300 most important features
    new_feature = df['feature'][-300:].tolist()
    dtrain = xgb.DMatrix(cv_train[new_feature], cv_train[RESPONSE])
    dtest = xgb.DMatrix(cv_test[new_feature])
    num_boost_rounds = 50
    model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
    y_pred = model.predict(dtest)
    y_pred = y_pred/y_pred.max()
    critical_val = np.sort(y_pred)[int(len(y_pred)*(1.0-prior))]
    pred = [1 if i >= critical_val else 0 for i in y_pred]
    mcc1.append(mc(pred, cv_test[RESPONSE]))
    # try another feature set: stations S32/S33 plus a few date-derived features
    new_feature = parallel_cols['S32']+parallel_cols['S33']+['Diff4', 'Diff3', 'Duration', 'Endweek']
    dtrain = xgb.DMatrix(cv_train[new_feature], cv_train[RESPONSE])
    dtest = xgb.DMatrix(cv_test[new_feature])
    num_boost_rounds = 50
    model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
    y_pred = model.predict(dtest)
    y_pred = y_pred/y_pred.max()
    critical_val = np.sort(y_pred)[int(len(y_pred)*(1.0-prior))]
    pred = [1 if i >= critical_val else 0 for i in y_pred]
    mcc2.append(mc(pred, cv_test[RESPONSE]))
print('whole features MCC0')
print(mcc0)
print('part features MCC1')
print(mcc1)
print('part features MCC2')
print(mcc2)