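Setup (sketch)
-----
The snippets below assume train/test splits named X_train, X_test, y_train, y_test. A minimal, illustrative setup for the regression snippets; the dataset and the tree's hyperparameters here are assumptions, not from the original exercises:
# Any regression dataset works; make_regression is just a stand-in
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
X, y = make_regression(n_samples=1000, n_features=10, noise=10, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
# dt as used in the CV snippet below (hyperparameters assumed)
dt = DecisionTreeRegressor(max_depth=4, random_state=1)
The classifier snippets (Ensemble Learning, Bagging, OOB, Boosting) assume an analogous split of a binary-labeled dataset, and the feature-importance plot assumes X_train is a pandas DataFrame with named columns.
--------------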
Cross-Validation RMSE
-----
# Import cross_val_score
from sklearn.model_selection import cross_val_score
# Compute the array containing the 10-fold CV MSEs
MSE_CV_scores = - cross_val_score(dt, X_train, y_train, cv=10,
                                  scoring='neg_mean_squared_error',
                                  n_jobs=-1)
# Compute the 10-fold CV RMSE
RMSE_CV = (MSE_CV_scores.mean())**(1/2)
# Print RMSE_CV
print('CV RMSE: {:.2f}'.format(RMSE_CV))
--------------
Training RMSE
-----
# Import mean_squared_error from sklearn.metrics as MSE
from sklearn.metrics import mean_squared_error as MSE
# Fit dt to the training set
dt.fit(X_train, y_train)
# Predict the labels of the training set
y_pred_train = dt.predict(X_train)
# Evaluate the training set RMSE of dt
RMSE_train = (MSE(y_train, y_pred_train))**(1/2)
# Print RMSE_train
print('Train RMSE: {:.2f}'.format(RMSE_train))
# A train RMSE far below the CV RMSE above signals overfitting (high variance)
----------------------------------
Ensemble Learning
------------------------
# Import the models and accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Set seed for reproducibility
SEED = 1
# Instantiate lr
lr = LogisticRegression(random_state=SEED)
# Instantiate knn
knn = KNN(n_neighbors=27)
# Instantiate dt
dt = DecisionTreeClassifier(min_samples_leaf=0.13, random_state=SEED)
# Define the list classifiers
classifiers = [('Logistic Regression', lr),
               ('K Nearest Neighbours', knn),
               ('Classification Tree', dt)]
# Iterate over the pre-defined list of classifiers
for clf_name, clf in classifiers:
    # Fit clf to the training set
    clf.fit(X_train, y_train)
    # Predict y_pred
    y_pred = clf.predict(X_test)
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    # Evaluate clf's accuracy on the test set
    print('{:s} : {:.3f}'.format(clf_name, accuracy))
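A natural next step is to combine the three models by majority vote. A minimal sketch using sklearn's VotingClassifier (hard voting by default), reusing the classifiers list above:
from sklearn.ensemble import VotingClassifier
# Build a hard-voting ensemble from the (name, estimator) tuples
vc = VotingClassifier(estimators=classifiers)
# Fit vc to the training set and evaluate test set accuracy
vc.fit(X_train, y_train)
y_pred = vc.predict(X_test)
print('Voting Classifier: {:.3f}'.format(accuracy_score(y_test, y_pred)))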
---------
Bagging
---------
# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
# Import BaggingClassifier
from sklearn.ensemble import BaggingClassifier
# Import accuracy_score
from sklearn.metrics import accuracy_score
# Instantiate dt
dt = DecisionTreeClassifier(random_state=1)
# Instantiate bc: 50 trees, each trained on a bootstrap sample
# (base_estimator was renamed to estimator in scikit-learn 1.2)
bc = BaggingClassifier(base_estimator=dt, n_estimators=50, random_state=1)
# Fit bc to the training set
bc.fit(X_train, y_train)
# Predict test set labels
y_pred = bc.predict(X_test)
# Evaluate acc_test
acc_test = accuracy_score(y_test, y_pred)
print('Test set accuracy of bc: {:.2f}'.format(acc_test))
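To see the variance reduction bagging buys, a quick sketch scoring the standalone tree dt on the same split for comparison:
# Fit and evaluate the single tree
dt.fit(X_train, y_train)
acc_dt = accuracy_score(y_test, dt.predict(X_test))
print('Test set accuracy of dt: {:.2f}'.format(acc_dt))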
-----
OOB (Out-of-Bag) Evaluation
----
# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
# Import BaggingClassifier
from sklearn.ensemble import BaggingClassifier
# Import accuracy_score
from sklearn.metrics import accuracy_score
# Instantiate dt
dt = DecisionTreeClassifier(min_samples_leaf=8, random_state=1)
# Instantiate bc with OOB scoring enabled
bc = BaggingClassifier(base_estimator=dt,
                       n_estimators=50,
                       oob_score=True,
                       random_state=1)
# Fit bc to the training set
bc.fit(X_train, y_train)
# Predict test set labels and evaluate test accuracy
y_pred = bc.predict(X_test)
acc_test = accuracy_score(y_test, y_pred)
# OOB accuracy: each tree is scored on the samples left out of its bootstrap
acc_oob = bc.oob_score_
# Print acc_test and acc_oob
print('Test set accuracy: {:.3f}, OOB accuracy: {:.3f}'.format(acc_test, acc_oob))
-----------
Random Forest Regressor
-----
# Import RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
# Import pandas and matplotlib for the importances plot
import pandas as pd
import matplotlib.pyplot as plt
# Instantiate rf
rf = RandomForestRegressor(n_estimators=25,
                           random_state=2)
# Fit rf to the training set
rf.fit(X_train, y_train)
# Import mean_squared_error as MSE
from sklearn.metrics import mean_squared_error as MSE
# Predict the test set labels
y_pred = rf.predict(X_test)
# Evaluate the test set RMSE
rmse_test = MSE(y_test, y_pred)**(1/2)
# Print rmse_test
print('Test set RMSE of rf: {:.2f}'.format(rmse_test))
# Create a pd.Series of feature importances (assumes X_train is a DataFrame)
importances = pd.Series(data=rf.feature_importances_,
                        index=X_train.columns)
# Sort importances
importances_sorted = importances.sort_values()
# Draw a horizontal barplot of importances_sorted
importances_sorted.plot(kind='barh', color='lightgreen')
plt.title('Feature Importances')
plt.show()
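Impurity-based importances can be biased toward high-cardinality features. As a cross-check, a sketch of permutation importance on the test set (requires scikit-learn >= 0.22):
from sklearn.inspection import permutation_importance
# Shuffle each feature and measure the resulting drop in test score
perm = permutation_importance(rf, X_test, y_test, n_repeats=10, random_state=2)
perm_sorted = pd.Series(perm.importances_mean, index=X_train.columns).sort_values()
perm_sorted.plot(kind='barh', color='lightblue')
plt.title('Permutation Importances (test set)')
plt.show()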
----------
Boosting (AdaBoost)
---------
# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
# Import AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
# Instantiate dt: a shallow tree serves as the weak learner
dt = DecisionTreeClassifier(max_depth=2, random_state=1)
# Instantiate ada
ada = AdaBoostClassifier(base_estimator=dt, n_estimators=180, random_state=1)
# Fit ada to the training set
ada.fit(X_train, y_train)
# Compute the probabilities of obtaining the positive class
y_pred_proba = ada.predict_proba(X_test)[:,1]
# Import roc_auc_score
from sklearn.metrics import roc_auc_score
# Evaluate test-set roc_auc_score
ada_roc_auc = roc_auc_score(y_test, y_pred_proba)
# Print roc_auc_score
print('ROC AUC score: {:.2f}'.format(ada_roc_auc))
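Since the AUC summarizes the whole ROC curve, a sketch plotting the curve itself from the same predicted probabilities:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
# False/true positive rates across all probability thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
plt.plot(fpr, tpr, label='AdaBoost (AUC = {:.2f})'.format(ada_roc_auc))
plt.plot([0, 1], [0, 1], 'k--')  # chance level
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()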
-----
Gradient Boosting
---
# Import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor
# Instantiate gb
gb = GradientBoostingRegressor(max_depth=4,
                               n_estimators=200,
                               random_state=2)
# Fit gb to the training set
gb.fit(X_train, y_train)
# Predict test set labels
y_pred = gb.predict(X_test)
# Import mean_squared_error as MSE
from sklearn.metrics import mean_squared_error as MSE
# Compute MSE
mse_test = MSE(y_test, y_pred)
# Compute RMSE
rmse_test = mse_test**(1/2)
# Print RMSE
print('Test set RMSE of gb: {:.3f}'.format(rmse_test))
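n_estimators=200 is a guess. A sketch using gb.staged_predict to track test RMSE as trees are added, so a better value can be read off:
import numpy as np
# RMSE after each boosting stage
stage_rmse = [MSE(y_test, y_stage)**(1/2)
              for y_stage in gb.staged_predict(X_test)]
best_n = int(np.argmin(stage_rmse)) + 1
print('Lowest test RMSE {:.3f} at {} trees'.format(min(stage_rmse), best_n))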
-------
Stochastic Gradient Boosting
-------
# Import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor
# Instantiate sgbr: each tree is fit on 90% of the rows,
# and 75% of the features are considered at each split
sgbr = GradientBoostingRegressor(max_depth=4,
                                 subsample=0.9,
                                 max_features=0.75,
                                 n_estimators=200,
                                 random_state=2)
# Fit sgbr to the training set
sgbr.fit(X_train, y_train)
# Predict test set labels
y_pred = sgbr.predict(X_test)
# Import mean_squared_error as MSE
from sklearn.metrics import mean_squared_error as MSE
# Compute test set MSE
mse_test = MSE(y_test, y_pred)
# Compute test set RMSE
rmse_test = mse_test**(1/2)
# Print rmse_test
print('Test set RMSE of sgbr: {:.3f}'.format(rmse_test))
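A related sketch: the same stochastic model with built-in early stopping (scikit-learn >= 0.20), which holds out validation_fraction of the training data and stops once the validation score fails to improve for n_iter_no_change rounds:
sgbr_es = GradientBoostingRegressor(max_depth=4,
                                    subsample=0.9,
                                    max_features=0.75,
                                    n_estimators=200,
                                    validation_fraction=0.1,
                                    n_iter_no_change=10,
                                    random_state=2)
sgbr_es.fit(X_train, y_train)
# n_estimators_ holds the number of trees actually fitted
print('Stopped after {} trees'.format(sgbr_es.n_estimators_))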
-----
Random Forest Hyperparameter Tuning
----
# rf.get_params() returns a dict of rf's hyperparameters and their current values
# Define the dictionary 'params_rf'
params_rf = {
    'n_estimators': [100, 350, 500],
    'max_features': ['log2', 'auto', 'sqrt'],
    'min_samples_leaf': [2, 10, 30],
}
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV
# Instantiate grid_rf
grid_rf = GridSearchCV(estimator=rf,
                       param_grid=params_rf,
                       scoring='neg_mean_squared_error',
                       cv=3,
                       verbose=1,
                       n_jobs=-1)
# Fit grid_rf to the training set (required before best_estimator_ exists)
grid_rf.fit(X_train, y_train)
# Import mean_squared_error from sklearn.metrics as MSE
from sklearn.metrics import mean_squared_error as MSE
# Extract the best estimator
best_model = grid_rf.best_estimator_
# Predict test set labels
y_pred = best_model.predict(X_test)
# Compute rmse_test
rmse_test = MSE(y_test, y_pred)**(1/2)
# Print rmse_test
print('Test RMSE of best model: {:.3f}'.format(rmse_test))
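When the grid grows, exhaustive search gets expensive. A sketch of the same search with RandomizedSearchCV, which samples n_iter combinations instead of trying them all:
from sklearn.model_selection import RandomizedSearchCV
rand_rf = RandomizedSearchCV(estimator=rf,
                             param_distributions=params_rf,
                             n_iter=5,
                             scoring='neg_mean_squared_error',
                             cv=3,
                             random_state=2,
                             n_jobs=-1)
rand_rf.fit(X_train, y_train)
print('Best hyperparameters:', rand_rf.best_params_)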