----
Starting
----
# Import the required libraries
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
# Create arrays for the features and the target: X, y (churn_data is a pandas DataFrame loaded in the exercise)
X, y = churn_data.iloc[:, :-1], churn_data.iloc[:, -1]
# Create the training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
# Instantiate the XGBClassifier: xg_cl
xg_cl = xgb.XGBClassifier(objective='binary:logistic', n_estimators=10, seed=123)
# Fit the classifier to the training set
xg_cl.fit(X_train, y_train)
# Predict the labels of the test set: preds
preds = xg_cl.predict(X_test)
# Compute the accuracy: accuracy
accuracy = float(np.sum(preds == y_test)) / y_test.shape[0]
print("accuracy: %f" % (accuracy))
----
XGBoost Cross-Validation
----
# Create arrays for the features and the target: X, y
X, y = churn_data.iloc[:, :-1], churn_data.iloc[:, -1]
# Create the DMatrix from X and y: churn_dmatrix
churn_dmatrix = xgb.DMatrix(data=X, label=y)
# Create the parameter dictionary: params
params = {"objective": "reg:logistic", "max_depth": 3}
# Perform cross-validation: cv_results
cv_results = xgb.cv(dtrain=churn_dmatrix, params=params,
                    nfold=3, num_boost_round=5,
                    metrics="error", as_pandas=True, seed=123)
# Print cv_results
print(cv_results)
# Print the accuracy (1 - mean test error from the final boosting round)
print(((1 - cv_results["test-error-mean"]).iloc[-1]))
----
AUC
----
# Perform cross-validation with AUC as the metric: cv_results
cv_results = xgb.cv(dtrain=churn_dmatrix, params=params,
                    nfold=3, num_boost_round=5,
                    metrics="auc", as_pandas=True, seed=123)
# Print cv_results
print(cv_results)
# Print the AUC from the final boosting round
print((cv_results["test-auc-mean"]).iloc[-1])
----
XGBoost Regression
----
# Import the metric used to evaluate the regressor
from sklearn.metrics import mean_squared_error
# Create the training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
# Instantiate the XGBRegressor: xg_reg
xg_reg = xgb.XGBRegressor(objective="reg:linear", n_estimators=10, seed=123)
# Fit the regressor to the training set
xg_reg.fit(X_train, y_train)
# Predict the labels of the test set: preds
preds = xg_reg.predict(X_test)
# Compute the RMSE: rmse
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))
----
XGBoost Linear Base Learner (gblinear): this booster is less common, so use the learning API with a DMatrix
----
# Convert the training and testing sets into DMatrixes: DM_train, DM_test
DM_train = xgb.DMatrix(data=X_train, label=y_train)
DM_test = xgb.DMatrix(data=X_test, label=y_test)
# Create the parameter dictionary: params
params = {"booster": "gblinear", "objective": "reg:linear"}
# Train the model: xg_reg
xg_reg = xgb.train(params=params, dtrain=DM_train, num_boost_round=5)
# Predict the labels of the test set: preds
preds = xg_reg.predict(DM_test)
# Compute and print the RMSE
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))
----
Cross-Validation Folds with XGBoost
----
# Create the DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)
# Create the parameter dictionary: params
params = {"objective": "reg:linear", "max_depth": 4}
# Perform cross-validation: cv_results
cv_results = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=4, num_boost_round=5, metrics="mae", as_pandas=True, seed=123)
# Print cv_results
print(cv_results)
# Extract and print the final boosting round metric
print((cv_results["test-mae-mean"]).tail(1))
----
Regularization
----
# pandas is used below to display the results
import pandas as pd
# Create the DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)
# L2 regularization strengths to try
reg_params = [1, 10, 100]
# Create the initial parameter dictionary for varying l2 strength: params
params = {"objective": "reg:linear", "max_depth": 3}
# Create an empty list for storing rmses as a function of l2 complexity
rmses_l2 = []
# Iterate over reg_params
for reg in reg_params:
    # Update l2 strength
    params["lambda"] = reg
    # Pass this updated param dictionary into cv
    cv_results_rmse = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=2, num_boost_round=5, metrics="rmse", as_pandas=True, seed=123)
    # Append best rmse (final round) to rmses_l2
    rmses_l2.append(cv_results_rmse["test-rmse-mean"].tail(1).values[0])
# Look at best rmse per l2 param
print("Best rmse as a function of l2:")
print(pd.DataFrame(list(zip(reg_params, rmses_l2)), columns=["l2", "rmse"]))
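A parallel sketch for L1 regularization (not part of the original exercise, assuming the same housing_dmatrix and reg_params): the loop is identical except that the "alpha" parameter is varied instead of "lambda".
# Reset the parameter dictionary so the previous "lambda" setting is not carried over
params = {"objective": "reg:linear", "max_depth": 3}
rmses_l1 = []
for reg in reg_params:
    # Update l1 strength
    params["alpha"] = reg
    cv_results_rmse = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=2, num_boost_round=5, metrics="rmse", as_pandas=True, seed=123)
    rmses_l1.append(cv_results_rmse["test-rmse-mean"].tail(1).values[0])
print("Best rmse as a function of l1:")
print(pd.DataFrame(list(zip(reg_params, rmses_l1)), columns=["l1", "rmse"]))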
----
Visualizing Feature Importance
----
# matplotlib is needed to display the plot
import matplotlib.pyplot as plt
# Create the DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)
# Create the parameter dictionary: params
params = {"objective": "reg:linear", "max_depth": 4}
# Train the model: xg_reg
xg_reg = xgb.train(params=params, dtrain=housing_dmatrix, num_boost_round=10)
# Plot the feature importances
xgb.plot_importance(xg_reg)
plt.show()
----
Tuning XGBoost: number of boosting rounds
----
# Create the DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)
# Create the parameter dictionary for each tree: params
params = {"objective": "reg:linear", "max_depth": 3}
# Create list of number of boosting rounds
num_rounds = [5, 10, 15]
# Empty list to store final round rmse per XGBoost model
final_rmse_per_round = []
# Iterate over num_rounds and build one model per num_boost_round parameter
for curr_num_rounds in num_rounds:
    # Perform cross-validation: cv_results
    cv_results = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=3, num_boost_round=curr_num_rounds, metrics="rmse", as_pandas=True, seed=123)
    # Append final round RMSE
    final_rmse_per_round.append(cv_results["test-rmse-mean"].tail().values[-1])
# Print the resultant DataFrame
num_rounds_rmses = list(zip(num_rounds, final_rmse_per_round))
print(pd.DataFrame(num_rounds_rmses, columns=["num_boosting_rounds", "rmse"]))
----
Automated boosting round selection using early_stopping_rounds
----
# Create your housing DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)
# Create the parameter dictionary for each tree: params
params = {"objective": "reg:linear", "max_depth": 4}
# Perform cross-validation with early stopping: cv_results
cv_results = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=3, num_boost_round=50, early_stopping_rounds=10, metrics="rmse", as_pandas=True, seed=123)
# Print cv_results
print(cv_results)
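With early_stopping_rounds set, xgb.cv stops adding rounds once the held-out RMSE stops improving, so (as a rough check, not part of the original exercise) the number of rows in the returned DataFrame shows how many boosting rounds were actually kept:
print(len(cv_results))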
----
Tuning eta
----
# Create your housing DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)
# Create the parameter dictionary for each tree (boosting round)
params = {"objective": "reg:linear", "max_depth": 3}
# Create list of eta values and empty list to store final round rmse per xgboost model
eta_vals = [0.001, 0.01, 0.1]
best_rmse = []
# Systematically vary the eta
for curr_val in eta_vals:
    params["eta"] = curr_val
    # Perform cross-validation: cv_results
    cv_results = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=3,
                        num_boost_round=10, early_stopping_rounds=5,
                        metrics="rmse", as_pandas=True, seed=123)
    # Append the final round rmse to best_rmse
    best_rmse.append(cv_results["test-rmse-mean"].tail().values[-1])
# Print the resultant DataFrame
print(pd.DataFrame(list(zip(eta_vals, best_rmse)), columns=["eta", "best_rmse"]))
----
Tuning max_depth
----
# Create your housing DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)
# Create the parameter dictionary
params = {"objective": "reg:linear"}
# Create list of max_depth values
max_depths = [2, 5, 10, 20]
best_rmse = []
# Systematically vary the max_depth
for curr_val in max_depths:
    params["max_depth"] = curr_val
    # Perform cross-validation
    cv_results = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=2,
                        num_boost_round=10, early_stopping_rounds=5,
                        metrics="rmse", as_pandas=True, seed=123)
    # Append the final round rmse to best_rmse
    best_rmse.append(cv_results["test-rmse-mean"].tail().values[-1])
# Print the resultant DataFrame
print(pd.DataFrame(list(zip(max_depths, best_rmse)), columns=["max_depth", "best_rmse"]))
----
Tuning colsample_bytree
----
# Create your housing DMatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)
# Create the parameter dictionary
params = {"objective": "reg:linear", "max_depth": 3}
# Create list of hyperparameter values
colsample_bytree_vals = [0.1, 0.5, 0.8, 1]
best_rmse = []
# Systematically vary the hyperparameter value
for curr_val in colsample_bytree_vals:
    params["colsample_bytree"] = curr_val
    # Perform cross-validation
    cv_results = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=2,
                        num_boost_round=10, early_stopping_rounds=5,
                        metrics="rmse", as_pandas=True, seed=123)
    # Append the final round rmse to best_rmse
    best_rmse.append(cv_results["test-rmse-mean"].tail().values[-1])
# Print the resultant DataFrame
print(pd.DataFrame(list(zip(colsample_bytree_vals, best_rmse)), columns=["colsample_bytree", "best_rmse"]))
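The same loop pattern extends to other sampling hyperparameters; a hypothetical variation (not from the original notes) tuning subsample, the fraction of rows sampled per boosting round, instead of colsample_bytree:
subsample_vals = [0.3, 0.6, 0.9, 1]
best_rmse = []
for curr_val in subsample_vals:
    # Vary the row-sampling fraction for each cross-validated model
    params["subsample"] = curr_val
    cv_results = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=2,
                        num_boost_round=10, early_stopping_rounds=5,
                        metrics="rmse", as_pandas=True, seed=123)
    best_rmse.append(cv_results["test-rmse-mean"].tail().values[-1])
print(pd.DataFrame(list(zip(subsample_vals, best_rmse)), columns=["subsample", "best_rmse"]))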
----
Grid search with XGBoost
----
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV
# Create the parameter grid: gbm_param_grid
gbm_param_grid = {
    'colsample_bytree': [0.3, 0.7],
    'n_estimators': [50],
    'max_depth': [2, 5]
}
# Instantiate the regressor: gbm
gbm = xgb.XGBRegressor()
# Perform grid search: grid_mse
grid_mse = GridSearchCV(estimator=gbm, param_grid=gbm_param_grid,
                        scoring='neg_mean_squared_error', cv=4, verbose=1)
grid_mse.fit(X, y)
# Print the best parameters and lowest RMSE
print("Best parameters found: ", grid_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))
----
Random search with XGBoost
----
# Import RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
# Create the parameter grid: gbm_param_grid
gbm_param_grid = {
    'n_estimators': [25],
    'max_depth': range(2, 12)
}
# Instantiate the regressor: gbm
gbm = xgb.XGBRegressor(n_estimators=10)
# Perform random search: randomized_mse
randomized_mse = RandomizedSearchCV(estimator=gbm, param_distributions=gbm_param_grid,
                                    n_iter=5, scoring='neg_mean_squared_error', cv=4, verbose=1)
randomized_mse.fit(X, y)
# Print the best parameters and lowest RMSE
print("Best parameters found: ", randomized_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(randomized_mse.best_score_)))
----
Encoding categorical columns I: LabelEncoder
----
# Import LabelEncoder
from sklearn.preprocessing import LabelEncoder
# Fill missing values with 0
df.LotFrontage = df.LotFrontage.fillna(0)
# Create a boolean mask for categorical columns
categorical_mask = (df.dtypes == object)
# Get list of categorical column names
categorical_columns = df.columns[categorical_mask].tolist()
# Print the head of the categorical columns
print(df[categorical_columns].head())
# Create LabelEncoder object: le
le = LabelEncoder()
# Apply LabelEncoder to each categorical column
df[categorical_columns] = df[categorical_columns].apply(lambda x: le.fit_transform(x))
# Print the head of the LabelEncoded categorical columns
print(df[categorical_columns].head())
----
Encoding categorical columns II: OneHotEncoder
----
# Import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
# Create OneHotEncoder: ohe
# (the categorical_features argument requires an older scikit-learn release where it still exists)
ohe = OneHotEncoder(categorical_features=categorical_mask, sparse=False)
# Apply OneHotEncoder to categorical columns - output is no longer a DataFrame: df_encoded
df_encoded = ohe.fit_transform(df)
# Print the first 5 rows of the resulting dataset - again, this is no longer a pandas DataFrame
print(df_encoded[:5, :])
# Print the shape of the original DataFrame
print(df.shape)
# Print the shape of the transformed array
print(df_encoded.shape)
----
Encoding categorical columns III: DictVectorizer - the LabelEncoder-then-OneHotEncoder steps can be simplified into a single DictVectorizer
----
# Import DictVectorizer
from sklearn.feature_extraction import DictVectorizer
# Convert df into a list of row dictionaries: df_dict
df_dict = df.to_dict("records")
# Create the DictVectorizer object: dv
dv = DictVectorizer(sparse=False)
# Apply dv on df: df_encoded
df_encoded = dv.fit_transform(df_dict)
# Print the resulting first five rows
print(df_encoded[:5, :])
# Print the vocabulary (feature name to column index mapping)
print(dv.vocabulary_)
----
Preprocessing within a pipeline
----
# Import necessary modules
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
# Fill LotFrontage missing values with 0
X.LotFrontage = X.LotFrontage.fillna(0)
# Setup the pipeline steps: steps
steps = [("ohe_onestep", DictVectorizer(sparse=False)),
         ("xgb_model", xgb.XGBRegressor())]
# Create the pipeline: xgb_pipeline
xgb_pipeline = Pipeline(steps)
# Fit the pipeline
xgb_pipeline.fit(X.to_dict("records"), y)
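A short usage sketch (not in the original exercise): once fitted, the pipeline handles the dict conversion, encoding and prediction in a single call; here it simply predicts back on the training rows for illustration.
preds = xgb_pipeline.predict(X.to_dict("records"))
print(preds[:5])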
----
Cross-validating your XGBoost model (IMPORTANT)
----
# Import necessary modules
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
# Fill LotFrontage missing values with 0
X.LotFrontage = X.LotFrontage.fillna(0)
# Setup the pipeline steps: steps
steps = [("ohe_onestep", DictVectorizer(sparse=False)),
         ("xgb_model", xgb.XGBRegressor(max_depth=2, objective="reg:linear"))]
# Create the pipeline: xgb_pipeline
xgb_pipeline = Pipeline(steps)
# Cross-validate the model
cross_val_scores = cross_val_score(xgb_pipeline, X.to_dict("records"), y, cv=10, scoring="neg_mean_squared_error")
# Print the 10-fold RMSE
print("10-fold RMSE: ", np.mean(np.sqrt(np.abs(cross_val_scores))))
----
KIDNEY CASE STUDY - continued in the next gist
----
Note (from the gist comments): consider setting the "scale_pos_weight" parameter for imbalanced binary classification tasks, such as churn datasets.
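A minimal sketch of that suggestion (not part of the original exercises, assuming the churn X_train/y_train split from the first snippet): scale_pos_weight is commonly set to the ratio of negative to positive examples.
# Weight the positive (churn) class by the negative/positive ratio in the training labels
ratio = float(np.sum(y_train == 0)) / np.sum(y_train == 1)
xg_cl_weighted = xgb.XGBClassifier(objective='binary:logistic', n_estimators=10,
                                   scale_pos_weight=ratio, seed=123)
xg_cl_weighted.fit(X_train, y_train)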