@shengch02
Last active July 19, 2017
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import matthews_corrcoef as mc
import operator
NROWS = 25000
CHUNKSIZE = 5000
ID = 'Id'
RESPONSE = 'Response'
#analyze the date data: plot the record counts over time and their auto-correlation
date = pd.read_csv('traindate.csv')
date_feature_cols = np.setdiff1d(date.columns.tolist(), [ID, RESPONSE])
times = []
for feature_name in date_feature_cols:
    df = date[[feature_name]].copy()
    df.columns = ['time']
    df['Station'] = feature_name.split('_')[1][1:]
    df = df.dropna()
    times.append(df)
times = pd.concat(times)
times['time'] = (times['time']*100).astype(int)
time_cnt = times.groupby('time').count()[['Station']].reset_index()
time_cnt.columns = ['time', 'cnt']
plt.plot(time_cnt['time'], time_cnt['cnt'])
plt.show()
time_tick = range(time_cnt['time'].min(), time_cnt['time'].max()+1, 1)
time_tick = pd.DataFrame({'time': time_tick})
time_station_cnt = (pd.merge(time_tick, time_cnt, on='time', how='left').fillna(0))['cnt'].tolist()
auto_corr = [1]+[np.corrcoef(time_station_cnt[:-k],time_station_cnt[k:])[0, 1] for k in range(1, 5000)]
plt.plot(auto_corr)
plt.show()
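#The auto-correlation plot above is meant to reveal the period of the record-count
#series. A hedged sketch for reading off the dominant peak programmatically
#(assumption: skipping the first 100 lags avoids the initial short-range decay;
#the peak found this way is what motivates the mod(1680) weekly features below):
lags = np.arange(len(auto_corr))
peak_lag = int(lags[100:][np.argmax(auto_corr[100:])])
print('dominant auto-correlation peak at lag %d' % peak_lag)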
#read the date data
train = pd.read_csv('traindate.csv', usecols=[ID], nrows=NROWS)
test = pd.read_csv('testdate.csv', usecols=[ID], nrows=NROWS)
train['Starttime'] = -1.0
test['Starttime'] = -1.0
train['Duration'] = -1.0
test['Duration'] = -1.0
#find the starting time and duration for each ID
for tr, te in zip(pd.read_csv('traindate.csv', chunksize=CHUNKSIZE), pd.read_csv('testdate.csv', chunksize=CHUNKSIZE)):
    feature_columns = np.setdiff1d(np.array(tr.columns), [ID])
    tr['Min'] = tr[feature_columns].min(axis=1)
    tr['Duration'] = tr[feature_columns].max(axis=1) - tr[feature_columns].min(axis=1)
    te['Min'] = te[feature_columns].min(axis=1)
    te['Duration'] = te[feature_columns].max(axis=1) - te[feature_columns].min(axis=1)
    train.loc[train[ID].isin(tr[ID]), 'Starttime'] = tr['Min'].values
    train.loc[train[ID].isin(tr[ID]), 'Duration'] = tr['Duration'].values
    test.loc[test[ID].isin(te[ID]), 'Starttime'] = te['Min'].values
    test.loc[test[ID].isin(te[ID]), 'Duration'] = te['Duration'].values
#concat train and test data
n_train = train.shape[0]
train_test = pd.concat((train, test)).reset_index(drop=True).reset_index(drop=False)
#Startweek and Endweek: the start and end times within the week
train_test = train_test[train_test['Starttime'].notnull()]
train_test['Starttime'] = (train_test['Starttime']*100).astype(int)
train_test['Duration'] = (train_test['Duration']*100).astype(int)
train_test['Endtime'] = train_test['Starttime']+train_test['Duration']
train_test['Startweek'] = train_test['Starttime'].mod(1680)
train_test['Endweek'] = train_test['Endtime'].mod(1680)
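#Note (assumption): the modulus 1680, together with the '+25 = 2.5h' window used
#below, implies roughly 10 ticks per hour after the *100 scaling (0.01 raw time
#units ~ 6 minutes), so 1680 ticks correspond to about one week.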
#find the differences in ID, Starttime, and Duration between neighboring records
train_test['Diff1'] = train_test[ID].diff().fillna(9999999).astype(int)
train_test['Diff2'] = train_test[ID].iloc[::-1].diff().fillna(9999999).astype(int)
train_test = train_test.sort_values('Starttime', ascending=True)
train_test['Diff3'] = train_test[ID].diff().fillna(9999999).astype(int)
train_test['Diff4'] = train_test[ID].iloc[::-1].diff().fillna(9999999).astype(int)
train_test['DiffStarttime1'] = train_test['Starttime'].diff().fillna(9999999).astype(int)
train_test['DiffStarttime2'] = train_test['Starttime'].iloc[::-1].diff().fillna(9999999).astype(int)
train_test['DiffDuration1'] = train_test['Duration'].diff().fillna(9999999).astype(int)
train_test['DiffDuration2'] = train_test['Duration'].iloc[::-1].diff().fillna(9999999).astype(int)
#count the number of records with the same Starttime, and the number of records started in the next 2.5 hours
train_test['Samestartcnt'] = 1
num_samebegin = train_test.groupby('Starttime').sum()[['Samestartcnt']].reset_index(drop=False)
train_test = train_test.drop('Samestartcnt', axis=1).merge(num_samebegin, how='left', on='Starttime')
train_test['Next2.5hcnt'] = 1
num_next2_5h = train_test.groupby('Starttime').sum()[['Next2.5hcnt']].reset_index(drop=False)
for i in range(len(num_next2_5h)):
    num_next2_5h.loc[i, 'Next2.5hcnt'] = num_next2_5h[(num_next2_5h['Starttime']>num_next2_5h['Starttime'][i]) & (num_next2_5h['Starttime']<(num_next2_5h['Starttime'][i]+25))]['Next2.5hcnt'].sum()
train_test = train_test.drop('Next2.5hcnt', axis=1).merge(num_next2_5h, how='left', on='Starttime')
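#The per-row loop above is quadratic in the number of distinct start times, which can
#be slow on the full data. A vectorized sketch of the same count, assuming
#num_samebegin (the per-Starttime record counts computed above) is sorted by
#'Starttime', which groupby guarantees; the result is illustrative and not used below:
starts = num_samebegin['Starttime'].values
cnts = num_samebegin['Samestartcnt'].values
csum = np.concatenate(([0], np.cumsum(cnts)))
lo = np.searchsorted(starts, starts, side='right')       # first index with Starttime > t
hi = np.searchsorted(starts, starts + 25, side='left')   # first index with Starttime >= t+25
next2_5h_vec = csum[hi] - csum[lo]                       # records starting in (t, t+25), i.e. the next 2.5 hours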
#separate train and test
train_test = train_test.sort_values('index', ascending=True).drop('index', axis=1)
train = train_test[:n_train].reset_index(drop=True)
test = train_test[n_train:].reset_index(drop=True)
#draw the shop-floor station graph with graphviz
def cnt_pair(shop_floor, station_pair):
    #count how many times each consecutive station pair occurs
    for pair in station_pair:
        try:
            shop_floor[pair] += 1
        except KeyError:
            shop_floor[pair] = 1
    return shop_floor
from graphviz import Graph
date_cols = pd.read_csv('traindate.csv', nrows=2)
date_cols = date_cols.drop('Id', axis=1).count().reset_index()
date_cols['station'] = date_cols['index'].apply(lambda x: x.split('_')[1])
stations = date_cols.drop_duplicates('station')['station'].tolist()
date_cols = date_cols.drop_duplicates('station')['index'].tolist()
date = pd.read_csv('traindate.csv', usecols=date_cols)
date.columns = stations
shop_floor = {}
for i in range(len(date)):
    station_pair = date[i:i+1]
    station_pair = station_pair.columns[station_pair.notnull().any()].tolist()
    station_pair = zip(station_pair[:-1], station_pair[1:])
    shop_floor = cnt_pair(shop_floor, station_pair)
shop_floor = {key:value for key, value in shop_floor.items() if value>100}
g = Graph(format='pdf')
key = shop_floor.keys()
for aa in key:
    g.edge(aa[0], aa[1], label=str(shop_floor[aa]))
g.render('img/shop_floor', view=True)
#according to the shop_floor graph, there are two parallel production lines: S0-S11 and
#S12-S23; we confirm that the corresponding station pairs have the same number of features,
#for example, both S0 and S12 have 12 features
date_cols = pd.read_csv('trainnumeric.csv', nrows=2)
date_cols = date_cols.drop(RESPONSE, axis=1)
date_cols = date_cols.drop(ID, axis=1).count().reset_index()
date_cols['Station'] = date_cols['index'].apply(lambda x : x.split('_')[1])
stations_feature = Counter(date_cols['Station'].tolist())
print(stations_feature)
#group the numeric feature columns by station (S0-S51), to merge the parallel stations below
parallel_cols = {}
for i in range(52):
    parallel_cols['S'+str(i)] = date_cols[date_cols['Station']==('S'+str(i))]['index'].tolist()
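#Supplementary sanity check for the parallel-line claim above: paired stations should
#expose the same number of numeric features. A hedged sketch (the pairings follow the
#merges performed below):
for a, b in [('S0', 'S12'), ('S1', 'S13'), ('S8', 'S20'), ('S43', 'S44'), ('S49', 'S50'), ('S35', 'S36')]:
    print('%s: %d features, %s: %d features' % (a, len(parallel_cols[a]), b, len(parallel_cols[b])))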
#read the numerical data, and then merge the parallel cols
train_numeric = pd.read_csv('trainnumeric.csv')
for c1, c2 in zip(parallel_cols['S0'], parallel_cols['S12']):
    train_numeric[c1+c2] = train_numeric[[c1,c2]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2], axis=1)
for c1, c2 in zip(parallel_cols['S1'], parallel_cols['S13']):
    train_numeric[c1+c2] = train_numeric[[c1,c2]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2], axis=1)
for c1, c2, c3, c4 in zip(parallel_cols['S2'], parallel_cols['S3'], parallel_cols['S15'], parallel_cols['S14']):
    train_numeric[c1+c2+c3+c4] = train_numeric[[c1,c2,c3,c4]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2, c3, c4], axis=1)
for c1, c2, c3, c4 in zip(parallel_cols['S4'], parallel_cols['S5'], parallel_cols['S17'], parallel_cols['S16']):
    train_numeric[c1+c2+c3+c4] = train_numeric[[c1,c2,c3,c4]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2, c3, c4], axis=1)
for c1, c2, c3, c4 in zip(parallel_cols['S6'], parallel_cols['S7'], parallel_cols['S18'], parallel_cols['S19']):
    train_numeric[c1+c2+c3+c4] = train_numeric[[c1,c2,c3,c4]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2, c3, c4], axis=1)
for c1, c2 in zip(parallel_cols['S8'], parallel_cols['S20']):
    train_numeric[c1+c2] = train_numeric[[c1,c2]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2], axis=1)
for c1, c2, c3, c4, c5, c6 in zip(parallel_cols['S9'], parallel_cols['S10'], parallel_cols['S11'], parallel_cols['S21'], parallel_cols['S22'], parallel_cols['S23']):
    train_numeric[c1+c2+c3+c4+c5+c6] = train_numeric[[c1,c2,c3,c4,c5,c6]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2, c3, c4, c5, c6], axis=1)
for c1, c2 in zip(parallel_cols['S43'], parallel_cols['S44']):
    train_numeric[c1+c2] = train_numeric[[c1,c2]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2], axis=1)
for c1, c2 in zip(parallel_cols['S50'], parallel_cols['S49']):
    train_numeric[c1+c2] = train_numeric[[c1,c2]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2], axis=1)
for c1, c2 in zip(parallel_cols['S36'], parallel_cols['S35']):
    train_numeric[c1+c2] = train_numeric[[c1,c2]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2], axis=1)
#merge the numerical features and the date features
train_data = train_numeric.merge(train, on=ID, how='left')
#cross-validation
mcc0, mcc1, mcc2 = [], [], []
kf = KFold(n_splits=5)
for cv_train_index, cv_test_index in kf.split(train_data):
    cv_train, cv_test = train_data.iloc[cv_train_index], train_data.iloc[cv_test_index]
    feature_cols = np.setdiff1d(cv_train.columns.tolist(), [RESPONSE, ID])
    prior = cv_train[RESPONSE].sum()*1.0/len(cv_train)
    xgb_params = {
        'seed': 0,
        'colsample_bytree': 0.7,
        'silent': 1,
        'subsample': 0.7,
        'learning_rate': 0.1,
        'objective': 'binary:logistic',
        'max_depth': 4,
        'num_parallel_tree': 1,
        'min_child_weight': 2,
        'eval_metric': 'auc',
        'base_score': prior
    }
    dtrain = xgb.DMatrix(cv_train[feature_cols], cv_train[RESPONSE])
    dtest = xgb.DMatrix(cv_test[feature_cols])
    num_boost_rounds = 50
    model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
    y_pred = model.predict(dtest)
    y_pred = y_pred/y_pred.max()
    #threshold the predictions so that the fraction of predicted positives matches the prior
    critical_val = np.sort(y_pred)[int(len(y_pred)*(1.0-prior))]
    pred = [1 if i>=critical_val else 0 for i in y_pred]
    mcc0.append(mc(pred, cv_test[RESPONSE]))
    #xgboost feature importance
    outputfile = open('xgb.fmap', 'w')
    i = 0
    for feat in feature_cols:
        outputfile.write('{0}\t{1}\tq\n'.format(i, feat))
        i += 1
    outputfile.close()
    importance = model.get_fscore(fmap='xgb.fmap')
    importance = sorted(importance.items(), key=operator.itemgetter(1))
    df = pd.DataFrame(importance, columns=['feature', 'fscore'])
    #try the 300 most important features
    new_feature = df['feature'][-300:].tolist()
    dtrain = xgb.DMatrix(cv_train[new_feature], cv_train[RESPONSE])
    dtest = xgb.DMatrix(cv_test[new_feature])
    num_boost_rounds = 50
    model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
    y_pred = model.predict(dtest)
    y_pred = y_pred/y_pred.max()
    critical_val = np.sort(y_pred)[int(len(y_pred)*(1.0-prior))]
    pred = [1 if i>=critical_val else 0 for i in y_pred]
    mcc1.append(mc(pred, cv_test[RESPONSE]))
    #try another feature set
    new_feature = parallel_cols['S32']+parallel_cols['S33']+['Diff4', 'Diff3', 'Duration', 'Endweek']
    dtrain = xgb.DMatrix(cv_train[new_feature], cv_train[RESPONSE])
    dtest = xgb.DMatrix(cv_test[new_feature])
    num_boost_rounds = 50
    model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
    y_pred = model.predict(dtest)
    y_pred = y_pred/y_pred.max()
    critical_val = np.sort(y_pred)[int(len(y_pred)*(1.0-prior))]
    pred = [1 if i>=critical_val else 0 for i in y_pred]
    mcc2.append(mc(pred, cv_test[RESPONSE]))
print('whole features MCC0')
print(mcc0)
print('part features MCC1')
print(mcc1)
print('part features MCC2')
print(mcc2)
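#The thresholds above are chosen so that the predicted positive rate matches the class
#prior; an alternative is to pick the cut-off that maximizes MCC on the held-out fold.
#A hedged sketch (assumption: y_pred and cv_test still hold the last fold's predictions
#and labels from the loop above):
best_mcc, best_thr = -1.0, 0.5
for thr in np.unique(y_pred):
    cand = (y_pred >= thr).astype(int)
    score = mc(cv_test[RESPONSE], cand)
    if score > best_mcc:
        best_mcc, best_thr = score, thr
print('best threshold %.4f with MCC %.4f' % (best_thr, best_mcc))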