XGB + Pipeline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#import seaborn as sns
#%matplotlib inline
from sklearn import model_selection, preprocessing
import xgboost as xgb
import datetime
import operator
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.decomposition import TruncatedSVD
from itertools import combinations
from sklearn.linear_model import ElasticNet
from sklearn.decomposition import PCA, FastICA
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.linear_model import ElasticNetCV, LassoLarsCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline, make_union
from sklearn.utils import check_array
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.preprocessing import LabelEncoder
class StackingEstimator(BaseEstimator, TransformerMixin):
    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y=None, **fit_params):
        self.estimator.fit(X, y, **fit_params)
        return self

    def transform(self, X):
        X = check_array(X)
        X_transformed = np.copy(X)
        # for classifiers, add class probabilities as synthetic features
        if issubclass(self.estimator.__class__, ClassifierMixin) and hasattr(self.estimator, 'predict_proba'):
            X_transformed = np.hstack((self.estimator.predict_proba(X), X))
        # add the estimator's prediction as a synthetic feature
        X_transformed = np.hstack((np.reshape(self.estimator.predict(X), (-1, 1)), X_transformed))
        return X_transformed
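
# Illustrative check (added; not in the original gist): for a plain regressor,
# StackingEstimator.transform passes the original features through and prepends
# the fitted estimator's prediction as one synthetic column, so a (10, 1)
# input becomes (10, 2).
from sklearn.linear_model import LinearRegression
_demo_X = np.arange(10.0).reshape(-1, 1)
_demo_y = 2.0 * _demo_X.ravel()
_demo = StackingEstimator(LinearRegression()).fit(_demo_X, _demo_y)
assert _demo.transform(_demo_X).shape == (10, 2)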
train0 = pd.read_csv('./data/train.csv')
r2mean = []
r2std = []
# sweep ea and, for each value, average R^2 over 5 random train/test splits
# (NOTE: ea itself is never used inside the loop body below)
for ea in np.arange(0.01, 0.03, 0.005):
    r2 = []
    for iij in range(5):
        train, test = train_test_split(train0, train_size=0.7, random_state=iij)
        y_train = train['y']
        y_test = test['y']
        test = test.drop(['y'], axis=1)
        for c in train.columns:
            if train[c].dtype == 'object':
                lbl = LabelEncoder()
                lbl.fit(list(train[c].values) + list(test[c].values))
                train[c] = lbl.transform(list(train[c].values))
                test[c] = lbl.transform(list(test[c].values))
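        # Added note (not in the original gist): fitting the encoder on
        # train + test values keeps the integer mapping consistent across both
        # sets; codes follow sorted label order, e.g.
        # LabelEncoder().fit(['b', 'a']).transform(['a', 'b']) -> [0, 1].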
        n_comp = 12
        # tSVD
        tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
        tsvd_results_train = tsvd.fit_transform(train.drop(["y"], axis=1))
        tsvd_results_test = tsvd.transform(test)
        # PCA
        pca = PCA(n_components=n_comp, random_state=420)
        pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
        pca2_results_test = pca.transform(test)
        # ICA
        ica = FastICA(n_components=n_comp, random_state=420)
        ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
        ica2_results_test = ica.transform(test)
        # GRP
        grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
        grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
        grp_results_test = grp.transform(test)
        # SRP
        srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
        srp_results_train = srp.fit_transform(train.drop(["y"], axis=1))
        srp_results_test = srp.transform(test)
        # save columns list before adding the decomposition components
        usable_columns = list(set(train.columns) - set(['y']))
        # Append decomposition components to datasets
        for i in range(1, n_comp + 1):
            train['pca_' + str(i)] = pca2_results_train[:, i - 1]
            test['pca_' + str(i)] = pca2_results_test[:, i - 1]
            train['ica_' + str(i)] = ica2_results_train[:, i - 1]
            test['ica_' + str(i)] = ica2_results_test[:, i - 1]
            train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
            test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]
            train['grp_' + str(i)] = grp_results_train[:, i - 1]
            test['grp_' + str(i)] = grp_results_test[:, i - 1]
            train['srp_' + str(i)] = srp_results_train[:, i - 1]
            test['srp_' + str(i)] = srp_results_test[:, i - 1]
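        # Added note (not in the original gist): each of the five reducers
        # contributes n_comp = 12 columns, so train and test each gain
        # 5 * 12 = 60 engineered features on top of the original ones.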
        #usable_columns = list(set(train.columns) - set(['y']))
        y_train = train['y'].values
        y_mean = np.mean(y_train)
        id_test = test['ID'].values
        # finaltrainset and finaltestset are used only by the stacked model
        # (they do not contain the PCA/SVD/... components)
        finaltrainset = train[usable_columns].values
        finaltestset = test[usable_columns].values
        xgb_params = {
            'n_trees': 520,  # not a core xgb.train parameter; kept from the original kernel
            'eta': 0.02,
            'max_depth': 4,
            'subsample': 0.93,
            'objective': 'reg:linear',
            'eval_metric': 'rmse',
            'base_score': y_mean,  # base prediction = mean(target)
            'silent': 1
        }
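        # Added note (not in the original gist): these names match the
        # 2017-era XGBoost API; releases >= 1.0 use
        # 'objective': 'reg:squarederror' and 'verbosity' instead of 'silent'.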
        # the regression target column is 'y'
        dtrain = xgb.DMatrix(train.drop('y', axis=1), y_train)
        dtest = xgb.DMatrix(test)
        num_boost_rounds = 1250
        # train model (silent=0 prints training progress)
        model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
        y_pred = model.predict(dtest)
        stacked_pipeline = make_pipeline(
            StackingEstimator(estimator=LassoLarsCV(normalize=True)),
            StackingEstimator(estimator=GradientBoostingRegressor(
                learning_rate=0.001, loss="huber", max_depth=3, max_features=0.55,
                min_samples_leaf=18, min_samples_split=14, subsample=0.7)),
            LassoLarsCV()
        )
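        # Added note (not in the original gist): each StackingEstimator passes
        # the original features through and prepends its model's predictions,
        # so the final LassoLarsCV sees the raw columns plus two stacked
        # prediction columns.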
        stacked_pipeline.fit(finaltrainset, y_train)
        results = stacked_pipeline.predict(finaltestset)
        sub = pd.DataFrame()
        sub['ID'] = id_test
        # blend: 12.5% XGBoost, 87.5% stacked pipeline
        sub['y'] = y_pred*0.125 + results*0.875
        r2.append(r2_score(y_test, sub['y']))
    r2mean.append(np.mean(r2))
    r2std.append(np.std(r2))
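
# Report the sweep (added; not in the original gist): one mean/std R^2 pair
# was collected per ea value, in the order np.arange produced them.
for _ea, _m, _s in zip(np.arange(0.01, 0.03, 0.005), r2mean, r2std):
    print('ea=%.3f: R^2 = %.4f +/- %.4f' % (_ea, _m, _s))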