@shengch02
Last active July 19, 2017
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import matthews_corrcoef as mc
import operator
NROWS = 25000
CHUNKSIZE = 5000
ID = 'Id'
RESPONSE = 'Response'
#analyze the date data: plot the record counts over time and their auto-correlation
date = pd.read_csv('traindate.csv')
date_feature_cols = np.setdiff1d(date.columns.tolist(), [ID, RESPONSE])
times = []
for feature_name in date_feature_cols:
    df = date[[feature_name]].copy()
    df.columns = ['time']
    df['Station'] = feature_name.split('_')[1][1:]
    df = df.dropna()
    times.append(df)
times = pd.concat(times)
times['time'] = (times['time']*100).astype(int)
time_cnt = times.groupby('time').count()[['Station']].reset_index()
time_cnt.columns = ['time', 'cnt']
plt.plot(time_cnt['time'], time_cnt['cnt'])
plt.show()
time_tick = range(time_cnt['time'].min(), time_cnt['time'].max()+1, 1)
time_tick = pd.DataFrame({'time': time_tick})
time_station_cnt = (pd.merge(time_tick, time_cnt, on='time', how='left').fillna(0))['cnt'].tolist()
auto_corr = [1]+[np.corrcoef(time_station_cnt[:-k],time_station_cnt[k:])[0, 1] for k in range(1, 5000)]
plt.plot(auto_corr)
plt.show()
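#The auto-correlation plot above is meant to reveal the period of the record-count
#series. A hedged sketch for reading off the dominant peak programmatically
#(assumption: skipping the first 100 lags avoids the initial short-range decay;
#the peak found this way is what motivates the mod(1680) weekly features below):
lags = np.arange(len(auto_corr))
peak_lag = int(lags[100:][np.argmax(auto_corr[100:])])
print('dominant auto-correlation peak at lag %d' % peak_lag)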
#read the date data
train = pd.read_csv('traindate.csv', usecols=[ID], nrows=NROWS)
test = pd.read_csv('testdate.csv', usecols=[ID], nrows=NROWS)
train['Starttime'] = -1.0
test['Starttime'] = -1.0
train['Duration'] = -1.0
test['Duration'] = -1.0
#find the starting time and duration for each ID
for tr, te in zip(pd.read_csv('traindate.csv', chunksize=CHUNKSIZE), pd.read_csv('testdate.csv', chunksize=CHUNKSIZE)):
    feature_columns = np.setdiff1d(np.array(tr.columns), [ID])
    tr['Min'] = tr[feature_columns].min(axis=1)
    tr['Duration'] = tr[feature_columns].max(axis=1) - tr[feature_columns].min(axis=1)
    te['Min'] = te[feature_columns].min(axis=1)
    te['Duration'] = te[feature_columns].max(axis=1) - te[feature_columns].min(axis=1)
    train.loc[train[ID].isin(tr[ID]), 'Starttime'] = tr['Min'].values
    train.loc[train[ID].isin(tr[ID]), 'Duration'] = tr['Duration'].values
    test.loc[test[ID].isin(te[ID]), 'Starttime'] = te['Min'].values
    test.loc[test[ID].isin(te[ID]), 'Duration'] = te['Duration'].values
#concat train and test data
n_train = train.shape[0]
train_test = pd.concat((train, test)).reset_index(drop=True).reset_index(drop=False)
#Startweek and Endweek: the start and end times within the week
train_test = train_test[train_test['Starttime'].notnull()]
train_test['Starttime'] = (train_test['Starttime']*100).astype(int)
train_test['Duration'] = (train_test['Duration']*100).astype(int)
train_test['Endtime'] = train_test['Starttime']+train_test['Duration']
train_test['Startweek'] = train_test['Starttime'].mod(1680)
train_test['Endweek'] = train_test['Endtime'].mod(1680)
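#Note (assumption): the modulus 1680, together with the '+25 = 2.5h' window used
#below, implies roughly 10 ticks per hour after the *100 scaling (0.01 raw time
#units ~ 6 minutes), so 1680 ticks correspond to about one week.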
#find the differences in ID, Starttime, and Duration between neighboring records
train_test['Diff1'] = train_test[ID].diff().fillna(9999999).astype(int)
train_test['Diff2'] = train_test[ID].iloc[::-1].diff().fillna(9999999).astype(int)
train_test = train_test.sort_values('Starttime', ascending=True)
train_test['Diff3'] = train_test[ID].diff().fillna(9999999).astype(int)
train_test['Diff4'] = train_test[ID].iloc[::-1].diff().fillna(9999999).astype(int)
train_test['DiffStarttime1'] = train_test['Starttime'].diff().fillna(9999999).astype(int)
train_test['DiffStarttime2'] = train_test['Starttime'].iloc[::-1].diff().fillna(9999999).astype(int)
train_test['DiffDuration1'] = train_test['Duration'].diff().fillna(9999999).astype(int)
train_test['DiffDuration2'] = train_test['Duration'].iloc[::-1].diff().fillna(9999999).astype(int)
#count the number of records with the same Starttime, and the number of records started in the next 2.5 hours
train_test['Samestartcnt'] = 1
num_samebegin = train_test.groupby('Starttime').sum()[['Samestartcnt']].reset_index(drop=False)
train_test = train_test.drop('Samestartcnt', axis=1).merge(num_samebegin, how='left', on='Starttime')
train_test['Next2.5hcnt'] = 1
num_next2_5h = train_test.groupby('Starttime').sum()[['Next2.5hcnt']].reset_index(drop=False)
for i in range(len(num_next2_5h)):
    num_next2_5h.loc[i, 'Next2.5hcnt'] = num_next2_5h[(num_next2_5h['Starttime']>num_next2_5h['Starttime'][i]) & (num_next2_5h['Starttime']<(num_next2_5h['Starttime'][i]+25))]['Next2.5hcnt'].sum()
train_test = train_test.drop('Next2.5hcnt', axis=1).merge(num_next2_5h, how='left', on='Starttime')
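#The per-row loop above is quadratic in the number of distinct start times, which can
#be slow on the full data. A vectorized sketch of the same count, assuming
#num_samebegin (the per-Starttime record counts computed above) is sorted by
#'Starttime', which groupby guarantees; the result is illustrative and not used below:
starts = num_samebegin['Starttime'].values
cnts = num_samebegin['Samestartcnt'].values
csum = np.concatenate(([0], np.cumsum(cnts)))
lo = np.searchsorted(starts, starts, side='right')       # first index with Starttime > t
hi = np.searchsorted(starts, starts + 25, side='left')   # first index with Starttime >= t+25
next2_5h_vec = csum[hi] - csum[lo]                       # records starting in (t, t+25), i.e. the next 2.5 hours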
#separate train and test
train_test = train_test.sort_values('index', ascending=True).drop('index', axis=1)
train = train_test[:n_train].reset_index(drop=True)
test = train_test[n_train:].reset_index(drop=True)
#draw the shop-floor station graph with graphviz
def cnt_pair(shop_floor, station_pair):
    #count how many times each consecutive station pair occurs
    for pair in station_pair:
        try:
            shop_floor[pair] += 1
        except KeyError:
            shop_floor[pair] = 1
    return shop_floor
from graphviz import Graph
date_cols = pd.read_csv('traindate.csv', nrows=2)
date_cols = date_cols.drop('Id', axis=1).count().reset_index()
date_cols['station'] = date_cols['index'].apply(lambda x: x.split('_')[1])
stations = date_cols.drop_duplicates('station')['station'].tolist()
date_cols = date_cols.drop_duplicates('station')['index'].tolist()
date = pd.read_csv('traindate.csv', usecols=date_cols)
date.columns = stations
shop_floor = {}
for i in range(len(date)):
    station_pair = date[i:i+1]
    station_pair = station_pair.columns[station_pair.notnull().any()].tolist()
    station_pair = zip(station_pair[:-1], station_pair[1:])
    shop_floor = cnt_pair(shop_floor, station_pair)
shop_floor = {key:value for key, value in shop_floor.items() if value>100}
g = Graph(format='pdf')
key = shop_floor.keys()
for aa in key:
    g.edge(aa[0], aa[1], label=str(shop_floor[aa]))
g.render('img/shop_floor', view=True)
#according to the shop_floor graph, there are two parallel production lines: S0-S11 and
#S12-S23; we confirm that the corresponding station pairs have the same number of features,
#for example, both S0 and S12 have 12 features
date_cols = pd.read_csv('trainnumeric.csv', nrows=2)
date_cols = date_cols.drop(RESPONSE, axis=1)
date_cols = date_cols.drop(ID, axis=1).count().reset_index()
date_cols['Station'] = date_cols['index'].apply(lambda x : x.split('_')[1])
stations_feature = Counter(date_cols['Station'].tolist())
print(stations_feature)
#group the numeric feature columns by station (S0-S51), to merge the parallel stations below
parallel_cols = {}
for i in range(52):
    parallel_cols['S'+str(i)] = date_cols[date_cols['Station']==('S'+str(i))]['index'].tolist()
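#Supplementary sanity check for the parallel-line claim above: paired stations should
#expose the same number of numeric features. A hedged sketch (the pairings follow the
#merges performed below):
for a, b in [('S0', 'S12'), ('S1', 'S13'), ('S8', 'S20'), ('S43', 'S44'), ('S49', 'S50'), ('S35', 'S36')]:
    print('%s: %d features, %s: %d features' % (a, len(parallel_cols[a]), b, len(parallel_cols[b])))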
#read the numerical data, and then merge the parallel cols
train_numeric = pd.read_csv('trainnumeric.csv')
for c1, c2 in zip(parallel_cols['S0'], parallel_cols['S12']):
    train_numeric[c1+c2] = train_numeric[[c1,c2]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2], axis=1)
for c1, c2 in zip(parallel_cols['S1'], parallel_cols['S13']):
    train_numeric[c1+c2] = train_numeric[[c1,c2]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2], axis=1)
for c1, c2, c3, c4 in zip(parallel_cols['S2'], parallel_cols['S3'], parallel_cols['S15'], parallel_cols['S14']):
    train_numeric[c1+c2+c3+c4] = train_numeric[[c1,c2,c3,c4]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2, c3, c4], axis=1)
for c1, c2, c3, c4 in zip(parallel_cols['S4'], parallel_cols['S5'], parallel_cols['S17'], parallel_cols['S16']):
    train_numeric[c1+c2+c3+c4] = train_numeric[[c1,c2,c3,c4]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2, c3, c4], axis=1)
for c1, c2, c3, c4 in zip(parallel_cols['S6'], parallel_cols['S7'], parallel_cols['S18'], parallel_cols['S19']):
    train_numeric[c1+c2+c3+c4] = train_numeric[[c1,c2,c3,c4]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2, c3, c4], axis=1)
for c1, c2 in zip(parallel_cols['S8'], parallel_cols['S20']):
    train_numeric[c1+c2] = train_numeric[[c1,c2]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2], axis=1)
for c1, c2, c3, c4, c5, c6 in zip(parallel_cols['S9'], parallel_cols['S10'], parallel_cols['S11'], parallel_cols['S21'], parallel_cols['S22'], parallel_cols['S23']):
    train_numeric[c1+c2+c3+c4+c5+c6] = train_numeric[[c1,c2,c3,c4,c5,c6]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2, c3, c4, c5, c6], axis=1)
for c1, c2 in zip(parallel_cols['S43'], parallel_cols['S44']):
    train_numeric[c1+c2] = train_numeric[[c1,c2]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2], axis=1)
for c1, c2 in zip(parallel_cols['S50'], parallel_cols['S49']):
    train_numeric[c1+c2] = train_numeric[[c1,c2]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2], axis=1)
for c1, c2 in zip(parallel_cols['S36'], parallel_cols['S35']):
    train_numeric[c1+c2] = train_numeric[[c1,c2]].max(axis=1)
    train_numeric = train_numeric.drop([c1, c2], axis=1)
#merge the numerical features and the date features
train_data = train_numeric.merge(train, on=ID, how='left')
#cross-validation
mcc0, mcc1, mcc2 = [], [], []
kf = KFold(n_splits=5)
for cv_train_index, cv_test_index in kf.split(train_data):
    cv_train, cv_test = train_data.iloc[cv_train_index], train_data.iloc[cv_test_index]
    feature_cols = np.setdiff1d(cv_train.columns.tolist(), [RESPONSE, ID])
    prior = cv_train[RESPONSE].sum()*1.0/len(cv_train)
    xgb_params = {
        'seed': 0,
        'colsample_bytree': 0.7,
        'silent': 1,
        'subsample': 0.7,
        'learning_rate': 0.1,
        'objective': 'binary:logistic',
        'max_depth': 4,
        'num_parallel_tree': 1,
        'min_child_weight': 2,
        'eval_metric': 'auc',
        'base_score': prior
    }
    dtrain = xgb.DMatrix(cv_train[feature_cols], cv_train[RESPONSE])
    dtest = xgb.DMatrix(cv_test[feature_cols])
    num_boost_rounds = 50
    model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
    y_pred = model.predict(dtest)
    y_pred = y_pred/y_pred.max()
    #threshold the predictions so that the fraction of predicted positives matches the prior
    critical_val = np.sort(y_pred)[int(len(y_pred)*(1.0-prior))]
    pred = [1 if i>=critical_val else 0 for i in y_pred]
    mcc0.append(mc(pred, cv_test[RESPONSE]))
    #xgboost feature importance
    outputfile = open('xgb.fmap', 'w')
    i = 0
    for feat in feature_cols:
        outputfile.write('{0}\t{1}\tq\n'.format(i, feat))
        i += 1
    outputfile.close()
    importance = model.get_fscore(fmap='xgb.fmap')
    importance = sorted(importance.items(), key=operator.itemgetter(1))
    df = pd.DataFrame(importance, columns=['feature', 'fscore'])
    #try the 300 most important features
    new_feature = df['feature'][-300:].tolist()
    dtrain = xgb.DMatrix(cv_train[new_feature], cv_train[RESPONSE])
    dtest = xgb.DMatrix(cv_test[new_feature])
    num_boost_rounds = 50
    model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
    y_pred = model.predict(dtest)
    y_pred = y_pred/y_pred.max()
    critical_val = np.sort(y_pred)[int(len(y_pred)*(1.0-prior))]
    pred = [1 if i>=critical_val else 0 for i in y_pred]
    mcc1.append(mc(pred, cv_test[RESPONSE]))
    #try another feature set
    new_feature = parallel_cols['S32']+parallel_cols['S33']+['Diff4', 'Diff3', 'Duration', 'Endweek']
    dtrain = xgb.DMatrix(cv_train[new_feature], cv_train[RESPONSE])
    dtest = xgb.DMatrix(cv_test[new_feature])
    num_boost_rounds = 50
    model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
    y_pred = model.predict(dtest)
    y_pred = y_pred/y_pred.max()
    critical_val = np.sort(y_pred)[int(len(y_pred)*(1.0-prior))]
    pred = [1 if i>=critical_val else 0 for i in y_pred]
    mcc2.append(mc(pred, cv_test[RESPONSE]))
print('whole features MCC0')
print(mcc0)
print('part features MCC1')
print(mcc1)
print('part features MCC2')
print(mcc2)
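#The thresholds above are chosen so that the predicted positive rate matches the class
#prior; an alternative is to pick the cut-off that maximizes MCC on the held-out fold.
#A hedged sketch (assumption: y_pred and cv_test still hold the last fold's predictions
#and labels from the loop above):
best_mcc, best_thr = -1.0, 0.5
for thr in np.unique(y_pred):
    cand = (y_pred >= thr).astype(int)
    score = mc(cv_test[RESPONSE], cand)
    if score > best_mcc:
        best_mcc, best_thr = score, thr
print('best threshold %.4f with MCC %.4f' % (best_thr, best_mcc))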