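Setup (sketch)
-----
The snippets below assume train/test splits named X_train, X_test, y_train, y_test. A minimal, illustrative setup for the regression snippets; the dataset and the tree's hyperparameters here are assumptions, not from the original exercises:
# Any regression dataset works; make_regression is just a stand-in
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
X, y = make_regression(n_samples=1000, n_features=10, noise=10, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
# dt as used in the CV snippet below (hyperparameters assumed)
dt = DecisionTreeRegressor(max_depth=4, random_state=1)
The classifier snippets (Ensemble Learning, Bagging, OOB, Boosting) assume an analogous split of a binary-labeled dataset, and the feature-importance plot assumes X_train is a pandas DataFrame with named columns.
--------------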
Cross-Validation RMSE
-----
# Import cross_val_score
from sklearn.model_selection import cross_val_score
# Compute the array containing the 10-fold CV MSEs
MSE_CV_scores = - cross_val_score(dt, X_train, y_train, cv=10,
                                  scoring='neg_mean_squared_error',
                                  n_jobs=-1)
# Compute the 10-fold CV RMSE
RMSE_CV = (MSE_CV_scores.mean())**(1/2)
# Print RMSE_CV
print('CV RMSE: {:.2f}'.format(RMSE_CV))
--------------
Training RMSE
-----
# Import mean_squared_error from sklearn.metrics as MSE
from sklearn.metrics import mean_squared_error as MSE
# Fit dt to the training set
dt.fit(X_train, y_train)
# Predict the labels of the training set
y_pred_train = dt.predict(X_train)
# Evaluate the training set RMSE of dt
RMSE_train = (MSE(y_train, y_pred_train))**(1/2)
# Print RMSE_train
print('Train RMSE: {:.2f}'.format(RMSE_train))
# A train RMSE far below the CV RMSE above signals overfitting (high variance)
----------------------------------
Ensemble Learning
------------------------
# Import the models and accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Set seed for reproducibility
SEED = 1
# Instantiate lr
lr = LogisticRegression(random_state=SEED)
# Instantiate knn
knn = KNN(n_neighbors=27)
# Instantiate dt
dt = DecisionTreeClassifier(min_samples_leaf=0.13, random_state=SEED)
# Define the list classifiers
classifiers = [('Logistic Regression', lr),
               ('K Nearest Neighbours', knn),
               ('Classification Tree', dt)]
# Iterate over the pre-defined list of classifiers
for clf_name, clf in classifiers:
    # Fit clf to the training set
    clf.fit(X_train, y_train)
    # Predict y_pred
    y_pred = clf.predict(X_test)
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    # Evaluate clf's accuracy on the test set
    print('{:s} : {:.3f}'.format(clf_name, accuracy))
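A natural next step is to combine the three models by majority vote. A minimal sketch using sklearn's VotingClassifier (hard voting by default), reusing the classifiers list above:
from sklearn.ensemble import VotingClassifier
# Build a hard-voting ensemble from the (name, estimator) tuples
vc = VotingClassifier(estimators=classifiers)
# Fit vc to the training set and evaluate test set accuracy
vc.fit(X_train, y_train)
y_pred = vc.predict(X_test)
print('Voting Classifier: {:.3f}'.format(accuracy_score(y_test, y_pred)))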
---------
Bagging
---------
# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
# Import BaggingClassifier
from sklearn.ensemble import BaggingClassifier
# Import accuracy_score
from sklearn.metrics import accuracy_score
# Instantiate dt
dt = DecisionTreeClassifier(random_state=1)
# Instantiate bc: 50 trees, each trained on a bootstrap sample
# (base_estimator was renamed to estimator in scikit-learn 1.2)
bc = BaggingClassifier(base_estimator=dt, n_estimators=50, random_state=1)
# Fit bc to the training set
bc.fit(X_train, y_train)
# Predict test set labels
y_pred = bc.predict(X_test)
# Evaluate acc_test
acc_test = accuracy_score(y_test, y_pred)
print('Test set accuracy of bc: {:.2f}'.format(acc_test))
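To see the variance reduction bagging buys, a quick sketch scoring the standalone tree dt on the same split for comparison:
# Fit and evaluate the single tree
dt.fit(X_train, y_train)
acc_dt = accuracy_score(y_test, dt.predict(X_test))
print('Test set accuracy of dt: {:.2f}'.format(acc_dt))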
-----
OOB (Out-of-Bag) Evaluation
----
# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
# Import BaggingClassifier
from sklearn.ensemble import BaggingClassifier
# Import accuracy_score
from sklearn.metrics import accuracy_score
# Instantiate dt
dt = DecisionTreeClassifier(min_samples_leaf=8, random_state=1)
# Instantiate bc with OOB scoring enabled
bc = BaggingClassifier(base_estimator=dt,
                       n_estimators=50,
                       oob_score=True,
                       random_state=1)
# Fit bc to the training set
bc.fit(X_train, y_train)
# Predict test set labels and evaluate test accuracy
y_pred = bc.predict(X_test)
acc_test = accuracy_score(y_test, y_pred)
# OOB accuracy: each tree is scored on the samples left out of its bootstrap
acc_oob = bc.oob_score_
# Print acc_test and acc_oob
print('Test set accuracy: {:.3f}, OOB accuracy: {:.3f}'.format(acc_test, acc_oob))
-----------
Random Forest Regressor
-----
# Import RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
# Import pandas and matplotlib for the importances plot
import pandas as pd
import matplotlib.pyplot as plt
# Instantiate rf
rf = RandomForestRegressor(n_estimators=25,
                           random_state=2)
# Fit rf to the training set
rf.fit(X_train, y_train)
# Import mean_squared_error as MSE
from sklearn.metrics import mean_squared_error as MSE
# Predict the test set labels
y_pred = rf.predict(X_test)
# Evaluate the test set RMSE
rmse_test = MSE(y_test, y_pred)**(1/2)
# Print rmse_test
print('Test set RMSE of rf: {:.2f}'.format(rmse_test))
# Create a pd.Series of feature importances (assumes X_train is a DataFrame)
importances = pd.Series(data=rf.feature_importances_,
                        index=X_train.columns)
# Sort importances
importances_sorted = importances.sort_values()
# Draw a horizontal barplot of importances_sorted
importances_sorted.plot(kind='barh', color='lightgreen')
plt.title('Feature Importances')
plt.show()
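Impurity-based importances can be biased toward high-cardinality features. As a cross-check, a sketch of permutation importance on the test set (requires scikit-learn >= 0.22):
from sklearn.inspection import permutation_importance
# Shuffle each feature and measure the resulting drop in test score
perm = permutation_importance(rf, X_test, y_test, n_repeats=10, random_state=2)
perm_sorted = pd.Series(perm.importances_mean, index=X_train.columns).sort_values()
perm_sorted.plot(kind='barh', color='lightblue')
plt.title('Permutation Importances (test set)')
plt.show()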
----------
Boosting (AdaBoost)
---------
# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
# Import AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
# Instantiate dt: a shallow tree serves as the weak learner
dt = DecisionTreeClassifier(max_depth=2, random_state=1)
# Instantiate ada
ada = AdaBoostClassifier(base_estimator=dt, n_estimators=180, random_state=1)
# Fit ada to the training set
ada.fit(X_train, y_train)
# Compute the probabilities of obtaining the positive class
y_pred_proba = ada.predict_proba(X_test)[:,1]
# Import roc_auc_score
from sklearn.metrics import roc_auc_score
# Evaluate test-set roc_auc_score
ada_roc_auc = roc_auc_score(y_test, y_pred_proba)
# Print roc_auc_score
print('ROC AUC score: {:.2f}'.format(ada_roc_auc))
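Since the AUC summarizes the whole ROC curve, a sketch plotting the curve itself from the same predicted probabilities:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
# False/true positive rates across all probability thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
plt.plot(fpr, tpr, label='AdaBoost (AUC = {:.2f})'.format(ada_roc_auc))
plt.plot([0, 1], [0, 1], 'k--')  # chance level
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()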
-----
Gradient Boosting
---
# Import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor
# Instantiate gb
gb = GradientBoostingRegressor(max_depth=4,
                               n_estimators=200,
                               random_state=2)
# Fit gb to the training set
gb.fit(X_train, y_train)
# Predict test set labels
y_pred = gb.predict(X_test)
# Import mean_squared_error as MSE
from sklearn.metrics import mean_squared_error as MSE
# Compute MSE
mse_test = MSE(y_test, y_pred)
# Compute RMSE
rmse_test = mse_test**(1/2)
# Print RMSE
print('Test set RMSE of gb: {:.3f}'.format(rmse_test))
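n_estimators=200 is a guess. A sketch using gb.staged_predict to track test RMSE as trees are added, so a better value can be read off:
import numpy as np
# RMSE after each boosting stage
stage_rmse = [MSE(y_test, y_stage)**(1/2)
              for y_stage in gb.staged_predict(X_test)]
best_n = int(np.argmin(stage_rmse)) + 1
print('Lowest test RMSE {:.3f} at {} trees'.format(min(stage_rmse), best_n))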
-------
Stochastic Gradient Boosting
-------
# Import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor
# Instantiate sgbr: each tree is fit on 90% of the rows,
# and 75% of the features are considered at each split
sgbr = GradientBoostingRegressor(max_depth=4,
                                 subsample=0.9,
                                 max_features=0.75,
                                 n_estimators=200,
                                 random_state=2)
# Fit sgbr to the training set
sgbr.fit(X_train, y_train)
# Predict test set labels
y_pred = sgbr.predict(X_test)
# Import mean_squared_error as MSE
from sklearn.metrics import mean_squared_error as MSE
# Compute test set MSE
mse_test = MSE(y_test, y_pred)
# Compute test set RMSE
rmse_test = mse_test**(1/2)
# Print rmse_test
print('Test set RMSE of sgbr: {:.3f}'.format(rmse_test))
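A related sketch: the same stochastic model with built-in early stopping (scikit-learn >= 0.20), which holds out validation_fraction of the training data and stops once the validation score fails to improve for n_iter_no_change rounds:
sgbr_es = GradientBoostingRegressor(max_depth=4,
                                    subsample=0.9,
                                    max_features=0.75,
                                    n_estimators=200,
                                    validation_fraction=0.1,
                                    n_iter_no_change=10,
                                    random_state=2)
sgbr_es.fit(X_train, y_train)
# n_estimators_ holds the number of trees actually fitted
print('Stopped after {} trees'.format(sgbr_es.n_estimators_))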
-----
Random Forest Hyperparameter Tuning
----
# rf.get_params() returns a dict of rf's hyperparameters and their current values
# Define the dictionary 'params_rf'
params_rf = {
    'n_estimators': [100, 350, 500],
    'max_features': ['log2', 'auto', 'sqrt'],
    'min_samples_leaf': [2, 10, 30],
}
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV
# Instantiate grid_rf
grid_rf = GridSearchCV(estimator=rf,
                       param_grid=params_rf,
                       scoring='neg_mean_squared_error',
                       cv=3,
                       verbose=1,
                       n_jobs=-1)
# Fit grid_rf to the training set (required before best_estimator_ exists)
grid_rf.fit(X_train, y_train)
# Import mean_squared_error from sklearn.metrics as MSE
from sklearn.metrics import mean_squared_error as MSE
# Extract the best estimator
best_model = grid_rf.best_estimator_
# Predict test set labels
y_pred = best_model.predict(X_test)
# Compute rmse_test
rmse_test = MSE(y_test, y_pred)**(1/2)
# Print rmse_test
print('Test RMSE of best model: {:.3f}'.format(rmse_test))
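When the grid grows, exhaustive search gets expensive. A sketch of the same search with RandomizedSearchCV, which samples n_iter combinations instead of trying them all:
from sklearn.model_selection import RandomizedSearchCV
rand_rf = RandomizedSearchCV(estimator=rf,
                             param_distributions=params_rf,
                             n_iter=5,
                             scoring='neg_mean_squared_error',
                             cv=3,
                             random_state=2,
                             n_jobs=-1)
rand_rf.fit(X_train, y_train)
print('Best hyperparameters:', rand_rf.best_params_)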