My current environment is...
- Python 3.5.2
- Scikit-learn 0.18.1
- XGBoost (python-package) 0.6
- LightGBM (python-package) v2.0
# -*- coding: utf-8 -*-
#
# digits_xgb.py
# date. 1/2/2017, 5/29
#
# I'm going to learn how to tune an xgboost model.
# 1. K-fold cross-validation, 2. GridSearch
#
| import numpy as np | |
| import xgboost as xgb | |
| from sklearn.datasets import load_digits | |
| from sklearn.model_selection import train_test_split, KFold | |
| from sklearn.model_selection import GridSearchCV | |
| from sklearn.metrics import accuracy_score, confusion_matrix | |
| # from sklearn.linear_model import LogisticRegression | |
def load_data():
    """Load the scikit-learn digits set and return a 67/33 train/test split.

    Returns
    -------
    X_train, X_test, y_train, y_test : ndarrays
        Flattened 8x8 images (64 features per sample) and their labels.
    """
    digits = load_digits()
    labels = digits.target
    # Flatten each 8x8 image into a 64-dimensional feature vector.
    flat = digits.images.reshape((len(digits.images), -1))
    # random_state=0 keeps the split reproducible across runs.
    return train_test_split(flat, labels, test_size=0.33, random_state=0)
def xgb_gridsearch(X_train, X_test, y_train, y_test, n_folds=5):
    '''
    Tune an XGBoost classifier with grid-search cross-validation.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for training and held-out evaluation.
    y_train, y_test : array-like
        Corresponding label vectors.
    n_folds : int
        Number of cross-validation folds.

    Returns
    -------
    (y_pred_train, y_pred_test) : ndarrays
        Class-probability predictions from the best estimator,
        refit on the full training set.
    '''
    param_grid = {
        'max_depth': [3, 4, 5],
        'learning_rate': [0.1, 0.2],
        'n_estimators': [100]}
    xgbclf = xgb.XGBClassifier()
    # eval_set gives early stopping a held-out score to monitor.
    fit_params = {'eval_metric': 'mlogloss',
                  'verbose': False,
                  'early_stopping_rounds': 10,
                  'eval_set': [(X_test, y_test)]}
    gs_clf = GridSearchCV(xgbclf, param_grid,
                          n_jobs=1, cv=n_folds,
                          fit_params=fit_params,
                          scoring='accuracy')
    gs_clf.fit(X_train, y_train)
    # FIX: grid_scores_ was deprecated in scikit-learn 0.18 and removed
    # in 0.20; best_score_/best_params_ are the supported equivalents
    # (and match the style used by lgb_gridsearch in this file).
    best_parameters = gs_clf.best_params_
    print('score:', gs_clf.best_score_)
    for param_name in sorted(best_parameters.keys()):
        print('%s: %r' % (param_name, best_parameters[param_name]))
    # Refit a fresh model with the winning hyper-parameters.
    xgbclf_best = xgb.XGBClassifier(**best_parameters)
    xgbclf_best.fit(X_train, y_train)
    y_pred_train = xgbclf_best.predict_proba(X_train)
    y_pred_test = xgbclf_best.predict_proba(X_test)
    return y_pred_train, y_pred_test
| # | |
if __name__ == '__main__':
    X_train, X_test, y_train, y_test = load_data()
    print('XGBoost process:')
    proba_train, proba_test = xgb_gridsearch(X_train, X_test, y_train, y_test)
    # Convert class probabilities to hard label predictions.
    predicted = np.argmax(proba_test, axis=1)
    # Report accuracy and the confusion matrix on the test split.
    print('\nAveraged model:')
    print('accuracy = {:>.4f}'.format(accuracy_score(y_test, predicted)))
    print('\nconfusion matrix:')
    print(confusion_matrix(y_test, predicted))
#
# forest_fires_gs_lgb.py
# date. 5/29/2017
#
| import numpy as np | |
| import pandas as pd | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.model_selection import GridSearchCV | |
| from sklearn.metrics import mean_squared_error | |
| import lightgbm as lgb | |
def load_data(fn='../../Data/ForestFires/forestfires.csv'):
    '''
    Load the forest-fires CSV and split it into features and target.

    Parameters
    ----------
    fn : str
        Path to the CSV file; the last column is the regression target.
        Defaults to the original hard-coded location, so existing callers
        are unaffected.

    Returns
    -------
    X : ndarray
        Feature values, columns 4 .. second-to-last of the CSV.
    y : ndarray
        Target values (last column).
    feats : pandas Index
        Names of the columns kept in X.
    '''
    forestfires = pd.read_csv(fn)
    feats = forestfires.columns
    X = forestfires.iloc[:, :-1].values
    y = forestfires.iloc[:, -1].values
    # Keep only the subset of columns from index 4 onward
    # (drops the leading columns of the dataset).
    X = X[:, 4:]
    feats = feats[4:-1]
    return X, y, feats
def lgb_gridsearch(X_train, X_test, y_train, y_test, n_folds=5):
    '''
    Grid-search a LightGBM regressor with cross-validation.

    Prints the CV score for every parameter combination and the best
    parameter set, then returns an UNFITTED LGBMRegressor configured
    with the winning hyper-parameters (the caller re-fits it).
    '''
    search_space = {
        'objective': ['regression'],
        'num_leaves': [15, 23, 31],
        'learning_rate': [0.1, 0.2],
        'n_estimators': [100]}
    # Early stopping monitors the held-out l2 loss during each fit.
    fit_kwargs = dict(eval_metric='l2',
                      eval_set=[(X_test, y_test)],
                      verbose=False,
                      early_stopping_rounds=10)
    searcher = GridSearchCV(lgb.LGBMRegressor(), search_space,
                            n_jobs=1, cv=n_folds,
                            fit_params=fit_kwargs,
                            scoring='neg_mean_squared_error')  # important for Regression
    searcher.fit(X_train, y_train)
    # Report mean +/- 2*std of the CV score for every combination tried.
    results = searcher.cv_results_
    for avg, sd, combo in zip(results['mean_test_score'],
                              results['std_test_score'],
                              results['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (avg, sd * 2, combo))
    chosen = searcher.best_params_
    print('\nbest parameters:')
    for key in sorted(chosen.keys()):
        print('%s: %r' % (key, chosen[key]))
    return lgb.LGBMRegressor(**chosen)
if __name__ == '__main__':
    X, y, feats = load_data()
    # Fixed random_state keeps the 75/25 split reproducible.
    holdout = train_test_split(X, y, test_size=0.25, random_state=2017)
    X_train, X_test, y_train, y_test = holdout
    best_reg = lgb_gridsearch(X_train, X_test, y_train, y_test)
    # Re-fit the chosen configuration, with early stopping on the
    # held-out fold to pick the best boosting iteration.
    best_reg.fit(X_train, y_train,
                 eval_metric='l2',
                 eval_set=[(X_test, y_test)],
                 early_stopping_rounds=10)
    preds = best_reg.predict(X_test, num_iteration=best_reg.best_iteration)
    print('\nrmse = ', mean_squared_error(y_test, preds) ** 0.5)