Skip to content

Instantly share code, notes, and snippets.

@shengch02
Created July 19, 2017 01:42
Show Gist options
  • Save shengch02/8cd7a0366106b6819e50653ce1071952 to your computer and use it in GitHub Desktop.
Save shengch02/8cd7a0366106b6819e50653ce1071952 to your computer and use it in GitHub Desktop.
XGB
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.cross_validation import cross_val_score, train_test_split
import xgboost as xgb
from sklearn.metrics import r2_score
# Threshold sweep for an XGBoost regressor trained on ./data/train.csv.
#
# Steps performed by this script:
#   1. Load the CSV and label its 378 columns 0..377.
#   2. Integer-encode columns 2..9 and append one 0/1 indicator column per
#      distinct value found in each of them.
#   3. For every threshold in [0.0, 0.2) in steps of 0.005, keep only those
#      columns 10..377 whose share of 1-values reaches the threshold (all
#      indicator columns are always kept), then average the R^2 obtained
#      over 50 random 70/30 train/test splits.
# One line "<threshold> <mean R^2> <std of R^2>" is printed per threshold and
# the mean R^2 values are collected in ``r2``.

N_RAW_COLUMNS = 378            # columns present in the CSV
CATEGORICAL_COLUMNS = range(2, 10)
FIRST_FEATURE_COLUMN = 10      # first column that may be used as a predictor
TARGET_COLUMN = 1
N_SPLITS = 50                  # random train/test splits per threshold
TEST_FRACTION = 0.3
OUTLIER_STDS = 40.0            # predictions farther than this many stds -> mean

# header=0 consumes the first line of the file as a header and replaces it
# with ``names``.  The original ``header=False`` was read as 0 by old pandas
# releases but is rejected with a TypeError by current ones.
data0 = pd.read_csv('./data/train.csv', sep=',', header=0,
                    names=list(range(N_RAW_COLUMNS)))
data = data0  # same object: the encoding below also modifies data0

for col in CATEGORICAL_COLUMNS:
    first_new = data.shape[1]
    # Codes follow order of first appearance, as returned by unique().
    levels = list(data[col].unique())
    codes = data[col].map({level: code for code, level in enumerate(levels)})
    data[col] = codes
    # One whole-column assignment per level replaces the former row-by-row
    # chained assignment (data[c][i] = 1), which was slow and is not
    # guaranteed to write through to the frame.
    for code in range(len(levels)):
        data[first_new + code] = (codes == code).astype(int)

binary_columns = list(range(FIRST_FEATURE_COLUMN, N_RAW_COLUMNS))
# Derived from the frame instead of a hard-coded total of 572 so that every
# generated indicator column is used whatever the categorical cardinalities.
indicator_columns = list(range(N_RAW_COLUMNS, data.shape[1]))
# Share of rows equal to 1 per raw feature column; it does not depend on the
# threshold, so it is computed once rather than on every sweep step.
ones_share = (data[binary_columns] == 1).mean()
target = TARGET_COLUMN

r2 = []
for offset in np.arange(0.0, 0.2, 0.005):
    # Drop raw feature columns whose share of ones is below the threshold.
    # Indicator columns are never dropped (the original compared their share
    # against 0, which can never be undercut).
    predictors = [c for c in binary_columns if ones_share[c] >= offset]
    predictors += indicator_columns

    scores = []
    for _ in range(N_SPLITS):
        train, test = train_test_split(data, test_size=TEST_FRACTION)
        mn = train[target].mean()
        std = train[target].std()
        # NOTE(review): parameter names (silent, nthread, seed, 'reg:linear')
        # are kept exactly as in the original call -- confirm against the
        # installed xgboost version before renaming any of them.
        lr = xgb.XGBRegressor(
            max_depth=3, learning_rate=0.1, n_estimators=100, silent=True,
            objective='reg:linear', nthread=-1, gamma=0, min_child_weight=1,
            max_delta_step=0, subsample=1, colsample_bytree=1,
            colsample_bylevel=1, reg_alpha=0, reg_lambda=1,
            scale_pos_weight=1, base_score=0.5, seed=0, missing=None)
        lr.fit(train[predictors], train[target])
        prt = lr.predict(test[predictors])
        # Replace predictions farther than OUTLIER_STDS training standard
        # deviations from the training mean by that mean.
        too_far = np.abs(prt.astype(float) - mn) > OUTLIER_STDS * std
        prt[too_far] = mn
        scores.append(r2_score(test[target], prt))

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(str(offset) + ' ' + str(np.mean(scores)) + ' ' + str(np.std(scores)))
    r2.append(np.mean(scores))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment