My current environment is...
- Python 3.5.2
- Scikit-learn 0.18.1
- XGBoost (python-package) 0.6
- LightGBM (python-package) v2.0
# -*- coding: utf-8 -*-
#
# digits_xgb.py
# date. 1/2/2017, 5/29
#
# I'm going to learn how to tune an xgboost model.
# 1. K-fold cross-validation, 2. GridSearch
#
| import numpy as np | |
| import xgboost as xgb | |
| from sklearn.datasets import load_digits | |
| from sklearn.model_selection import train_test_split, KFold | |
| from sklearn.model_selection import GridSearchCV | |
| from sklearn.metrics import accuracy_score, confusion_matrix | |
| # from sklearn.linear_model import LogisticRegression | |
def load_data():
    """Load the scikit-learn digits set and return a 67/33 train/test split.

    Returns
    -------
    X_train, X_test, y_train, y_test : ndarrays
        Flattened 8x8 images (64 features per sample) and their labels.
    """
    digits = load_digits()
    labels = digits.target
    # Flatten each 8x8 image into a 64-dimensional feature vector.
    flat = digits.images.reshape((len(digits.images), -1))
    # random_state=0 keeps the split reproducible across runs.
    return train_test_split(flat, labels, test_size=0.33, random_state=0)
def xgb_gridsearch(X_train, X_test, y_train, y_test, n_folds=5):
    '''
    Tune an XGBoost classifier with grid-search cross-validation.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for training and held-out evaluation.
    y_train, y_test : array-like
        Corresponding label vectors.
    n_folds : int
        Number of cross-validation folds.

    Returns
    -------
    (y_pred_train, y_pred_test) : ndarrays
        Class-probability predictions from the best estimator,
        refit on the full training set.
    '''
    param_grid = {
        'max_depth': [3, 4, 5],
        'learning_rate': [0.1, 0.2],
        'n_estimators': [100]}
    xgbclf = xgb.XGBClassifier()
    # eval_set gives early stopping a held-out score to monitor.
    fit_params = {'eval_metric': 'mlogloss',
                  'verbose': False,
                  'early_stopping_rounds': 10,
                  'eval_set': [(X_test, y_test)]}
    gs_clf = GridSearchCV(xgbclf, param_grid,
                          n_jobs=1, cv=n_folds,
                          fit_params=fit_params,
                          scoring='accuracy')
    gs_clf.fit(X_train, y_train)
    # FIX: grid_scores_ was deprecated in scikit-learn 0.18 and removed
    # in 0.20; best_score_/best_params_ are the supported equivalents
    # (and match the style used by lgb_gridsearch in this file).
    best_parameters = gs_clf.best_params_
    print('score:', gs_clf.best_score_)
    for param_name in sorted(best_parameters.keys()):
        print('%s: %r' % (param_name, best_parameters[param_name]))
    # Refit a fresh model with the winning hyper-parameters.
    xgbclf_best = xgb.XGBClassifier(**best_parameters)
    xgbclf_best.fit(X_train, y_train)
    y_pred_train = xgbclf_best.predict_proba(X_train)
    y_pred_test = xgbclf_best.predict_proba(X_test)
    return y_pred_train, y_pred_test
| # | |
if __name__ == '__main__':
    X_train, X_test, y_train, y_test = load_data()
    print('XGBoost process:')
    proba_train, proba_test = xgb_gridsearch(X_train, X_test, y_train, y_test)
    # Convert class probabilities to hard label predictions.
    predicted = np.argmax(proba_test, axis=1)
    # Report accuracy and the confusion matrix on the test split.
    print('\nAveraged model:')
    print('accuracy = {:>.4f}'.format(accuracy_score(y_test, predicted)))
    print('\nconfusion matrix:')
    print(confusion_matrix(y_test, predicted))
#
# forest_fires_gs_lgb.py
# date. 5/29/2017
#
| import numpy as np | |
| import pandas as pd | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.model_selection import GridSearchCV | |
| from sklearn.metrics import mean_squared_error | |
| import lightgbm as lgb | |
def load_data(fn='../../Data/ForestFires/forestfires.csv'):
    '''
    Load the forest-fires CSV and split it into features and target.

    Parameters
    ----------
    fn : str
        Path to the CSV file; the last column is the regression target.
        Defaults to the original hard-coded location, so existing callers
        are unaffected.

    Returns
    -------
    X : ndarray
        Feature values, columns 4 .. second-to-last of the CSV.
    y : ndarray
        Target values (last column).
    feats : pandas Index
        Names of the columns kept in X.
    '''
    forestfires = pd.read_csv(fn)
    feats = forestfires.columns
    X = forestfires.iloc[:, :-1].values
    y = forestfires.iloc[:, -1].values
    # Keep only the subset of columns from index 4 onward
    # (drops the leading columns of the dataset).
    X = X[:, 4:]
    feats = feats[4:-1]
    return X, y, feats
def lgb_gridsearch(X_train, X_test, y_train, y_test, n_folds=5):
    '''
    Grid-search a LightGBM regressor with cross-validation.

    Prints the CV score for every parameter combination and the best
    parameter set, then returns an UNFITTED LGBMRegressor configured
    with the winning hyper-parameters (the caller re-fits it).
    '''
    search_space = {
        'objective': ['regression'],
        'num_leaves': [15, 23, 31],
        'learning_rate': [0.1, 0.2],
        'n_estimators': [100]}
    # Early stopping monitors the held-out l2 loss during each fit.
    fit_kwargs = dict(eval_metric='l2',
                      eval_set=[(X_test, y_test)],
                      verbose=False,
                      early_stopping_rounds=10)
    searcher = GridSearchCV(lgb.LGBMRegressor(), search_space,
                            n_jobs=1, cv=n_folds,
                            fit_params=fit_kwargs,
                            scoring='neg_mean_squared_error')  # important for Regression
    searcher.fit(X_train, y_train)
    # Report mean +/- 2*std of the CV score for every combination tried.
    results = searcher.cv_results_
    for avg, sd, combo in zip(results['mean_test_score'],
                              results['std_test_score'],
                              results['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (avg, sd * 2, combo))
    chosen = searcher.best_params_
    print('\nbest parameters:')
    for key in sorted(chosen.keys()):
        print('%s: %r' % (key, chosen[key]))
    return lgb.LGBMRegressor(**chosen)
if __name__ == '__main__':
    X, y, feats = load_data()
    # Fixed random_state keeps the 75/25 split reproducible.
    holdout = train_test_split(X, y, test_size=0.25, random_state=2017)
    X_train, X_test, y_train, y_test = holdout
    best_reg = lgb_gridsearch(X_train, X_test, y_train, y_test)
    # Re-fit the chosen configuration, with early stopping on the
    # held-out fold to pick the best boosting iteration.
    best_reg.fit(X_train, y_train,
                 eval_metric='l2',
                 eval_set=[(X_test, y_test)],
                 early_stopping_rounds=10)
    preds = best_reg.predict(X_test, num_iteration=best_reg.best_iteration)
    print('\nrmse = ', mean_squared_error(y_test, preds) ** 0.5)