----
Starting
----
# Import the required libraries
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
# Create arrays for the features and the target: X, y (churn_data is a pandas DataFrame loaded in the exercise)
X, y = churn_data.iloc[:, :-1], churn_data.iloc[:, -1]
# Create the training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
# Instantiate the XGBClassifier: xg_cl
xg_cl = xgb.XGBClassifier(objective='binary:logistic', n_estimators=10, seed=123)
# Fit the classifier to the training set
xg_cl.fit(X_train, y_train)
# Predict the labels of the test set: preds
preds = xg_cl.predict(X_test)
# Compute the accuracy: accuracy
accuracy = float(np.sum(preds == y_test)) / y_test.shape[0]
print("accuracy: %f" % (accuracy))
----
XGBoost Cross-Validation
----
# Create arrays for the features and the target: X, y
X, y = churn_data.iloc[:, :-1], churn_data.iloc[:, -1]
# Create the DMatrix from X and y: churn_dmatrix
churn_dmatrix = xgb.DMatrix(data=X, label=y)
# Create the parameter dictionary: params
params = {"objective": "reg:logistic", "max_depth": 3}
# Perform cross-validation: cv_results
cv_results = xgb.cv(dtrain=churn_dmatrix, params=params,
                    nfold=3, num_boost_round=5,
                    metrics="error", as_pandas=True, seed=123)
# Print cv_results
print(cv_results)
# Print the accuracy (1 - mean test error from the final boosting round)
print(((1 - cv_results["test-error-mean"]).iloc[-1]))
----
AUC
----
# Perform cross-validation with AUC as the metric: cv_results
cv_results = xgb.cv(dtrain=churn_dmatrix, params=params,
                    nfold=3, num_boost_round=5,
                    metrics="auc", as_pandas=True, seed=123)
# Print cv_results
print(cv_results)
# Print the AUC from the final boosting round
print((cv_results["test-auc-mean"]).iloc[-1])
----
XGBoost Regression
----
# Import the metric used to evaluate the regressor
from sklearn.metrics import mean_squared_error
# Create the training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
# Instantiate the XGBRegressor: xg_reg
xg_reg = xgb.XGBRegressor(objective="reg:linear", n_estimators=10, seed=123)
# Fit the regressor to the training set
xg_reg.fit(X_train, y_train)
# Predict the labels of the test set: preds
preds = xg_reg.predict(X_test)
# Compute the RMSE: rmse
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))
----
XGBoost Linear Base Learner (gblinear): this booster is less common, so use the learning API with a DMatrix
----
# Convert the training and testing sets into DMatrixes: DM_train, DM_test
DM_train = xgb.DMatrix(data=X_train, label=y_train)
DM_test = xgb.DMatrix(data=X_test, label=y_test)
# Create the parameter dictionary: params
params = {"booster": "gblinear", "objective": "reg:linear"}
# Train the model: xg_reg
xg_reg = xgb.train(params=params, dtrain=DM_train, num_boost_round=5)
# Predict the labels of the test set: preds
preds = xg_reg.predict(DM_test)
# Compute and print the RMSE
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))
----
Cross-Validation Folds with XGBoost
----
# Create the DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)
# Create the parameter dictionary: params
params = {"objective": "reg:linear", "max_depth": 4}
# Perform cross-validation: cv_results
cv_results = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=4, num_boost_round=5, metrics="mae", as_pandas=True, seed=123)
# Print cv_results
print(cv_results)
# Extract and print the final boosting round metric
print((cv_results["test-mae-mean"]).tail(1))
----
Regularization
----
# pandas is used below to display the results
import pandas as pd
# Create the DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)
# L2 regularization strengths to try
reg_params = [1, 10, 100]
# Create the initial parameter dictionary for varying l2 strength: params
params = {"objective": "reg:linear", "max_depth": 3}
# Create an empty list for storing rmses as a function of l2 complexity
rmses_l2 = []
# Iterate over reg_params
for reg in reg_params:
    # Update l2 strength
    params["lambda"] = reg
    # Pass this updated param dictionary into cv
    cv_results_rmse = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=2, num_boost_round=5, metrics="rmse", as_pandas=True, seed=123)
    # Append best rmse (final round) to rmses_l2
    rmses_l2.append(cv_results_rmse["test-rmse-mean"].tail(1).values[0])
# Look at best rmse per l2 param
print("Best rmse as a function of l2:")
print(pd.DataFrame(list(zip(reg_params, rmses_l2)), columns=["l2", "rmse"]))
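A parallel sketch for L1 regularization (not part of the original exercise, assuming the same housing_dmatrix and reg_params): the loop is identical except that the "alpha" parameter is varied instead of "lambda".
# Reset the parameter dictionary so the previous "lambda" setting is not carried over
params = {"objective": "reg:linear", "max_depth": 3}
rmses_l1 = []
for reg in reg_params:
    # Update l1 strength
    params["alpha"] = reg
    cv_results_rmse = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=2, num_boost_round=5, metrics="rmse", as_pandas=True, seed=123)
    rmses_l1.append(cv_results_rmse["test-rmse-mean"].tail(1).values[0])
print("Best rmse as a function of l1:")
print(pd.DataFrame(list(zip(reg_params, rmses_l1)), columns=["l1", "rmse"]))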
----
Visualizing Feature Importance
----
# matplotlib is needed to display the plot
import matplotlib.pyplot as plt
# Create the DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)
# Create the parameter dictionary: params
params = {"objective": "reg:linear", "max_depth": 4}
# Train the model: xg_reg
xg_reg = xgb.train(params=params, dtrain=housing_dmatrix, num_boost_round=10)
# Plot the feature importances
xgb.plot_importance(xg_reg)
plt.show()
----
Tuning XGBoost: number of boosting rounds
----
# Create the DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)
# Create the parameter dictionary for each tree: params
params = {"objective": "reg:linear", "max_depth": 3}
# Create list of number of boosting rounds
num_rounds = [5, 10, 15]
# Empty list to store final round rmse per XGBoost model
final_rmse_per_round = []
# Iterate over num_rounds and build one model per num_boost_round parameter
for curr_num_rounds in num_rounds:
    # Perform cross-validation: cv_results
    cv_results = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=3, num_boost_round=curr_num_rounds, metrics="rmse", as_pandas=True, seed=123)
    # Append final round RMSE
    final_rmse_per_round.append(cv_results["test-rmse-mean"].tail().values[-1])
# Print the resultant DataFrame
num_rounds_rmses = list(zip(num_rounds, final_rmse_per_round))
print(pd.DataFrame(num_rounds_rmses, columns=["num_boosting_rounds", "rmse"]))
----
Automated boosting round selection using early_stopping_rounds
----
# Create your housing DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)
# Create the parameter dictionary for each tree: params
params = {"objective": "reg:linear", "max_depth": 4}
# Perform cross-validation with early stopping: cv_results
cv_results = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=3, num_boost_round=50, early_stopping_rounds=10, metrics="rmse", as_pandas=True, seed=123)
# Print cv_results
print(cv_results)
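With early_stopping_rounds set, xgb.cv stops adding rounds once the held-out RMSE stops improving, so (as a rough check, not part of the original exercise) the number of rows in the returned DataFrame shows how many boosting rounds were actually kept:
print(len(cv_results))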
----
Tuning eta
----
# Create your housing DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)
# Create the parameter dictionary for each tree (boosting round)
params = {"objective": "reg:linear", "max_depth": 3}
# Create list of eta values and empty list to store final round rmse per xgboost model
eta_vals = [0.001, 0.01, 0.1]
best_rmse = []
# Systematically vary the eta
for curr_val in eta_vals:
    params["eta"] = curr_val
    # Perform cross-validation: cv_results
    cv_results = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=3,
                        num_boost_round=10, early_stopping_rounds=5,
                        metrics="rmse", as_pandas=True, seed=123)
    # Append the final round rmse to best_rmse
    best_rmse.append(cv_results["test-rmse-mean"].tail().values[-1])
# Print the resultant DataFrame
print(pd.DataFrame(list(zip(eta_vals, best_rmse)), columns=["eta", "best_rmse"]))
----
Tuning max_depth
----
# Create your housing DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)
# Create the parameter dictionary
params = {"objective": "reg:linear"}
# Create list of max_depth values
max_depths = [2, 5, 10, 20]
best_rmse = []
# Systematically vary the max_depth
for curr_val in max_depths:
    params["max_depth"] = curr_val
    # Perform cross-validation
    cv_results = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=2,
                        num_boost_round=10, early_stopping_rounds=5,
                        metrics="rmse", as_pandas=True, seed=123)
    # Append the final round rmse to best_rmse
    best_rmse.append(cv_results["test-rmse-mean"].tail().values[-1])
# Print the resultant DataFrame
print(pd.DataFrame(list(zip(max_depths, best_rmse)), columns=["max_depth", "best_rmse"]))
----
Tuning colsample_bytree
----
# Create your housing DMatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)
# Create the parameter dictionary
params = {"objective": "reg:linear", "max_depth": 3}
# Create list of hyperparameter values
colsample_bytree_vals = [0.1, 0.5, 0.8, 1]
best_rmse = []
# Systematically vary the hyperparameter value
for curr_val in colsample_bytree_vals:
    params["colsample_bytree"] = curr_val
    # Perform cross-validation
    cv_results = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=2,
                        num_boost_round=10, early_stopping_rounds=5,
                        metrics="rmse", as_pandas=True, seed=123)
    # Append the final round rmse to best_rmse
    best_rmse.append(cv_results["test-rmse-mean"].tail().values[-1])
# Print the resultant DataFrame
print(pd.DataFrame(list(zip(colsample_bytree_vals, best_rmse)), columns=["colsample_bytree", "best_rmse"]))
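The same loop pattern extends to other sampling hyperparameters; a hypothetical variation (not from the original notes) tuning subsample, the fraction of rows sampled per boosting round, instead of colsample_bytree:
subsample_vals = [0.3, 0.6, 0.9, 1]
best_rmse = []
for curr_val in subsample_vals:
    # Vary the row-sampling fraction for each cross-validated model
    params["subsample"] = curr_val
    cv_results = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=2,
                        num_boost_round=10, early_stopping_rounds=5,
                        metrics="rmse", as_pandas=True, seed=123)
    best_rmse.append(cv_results["test-rmse-mean"].tail().values[-1])
print(pd.DataFrame(list(zip(subsample_vals, best_rmse)), columns=["subsample", "best_rmse"]))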
----
Grid search with XGBoost
----
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV
# Create the parameter grid: gbm_param_grid
gbm_param_grid = {
    'colsample_bytree': [0.3, 0.7],
    'n_estimators': [50],
    'max_depth': [2, 5]
}
# Instantiate the regressor: gbm
gbm = xgb.XGBRegressor()
# Perform grid search: grid_mse
grid_mse = GridSearchCV(estimator=gbm, param_grid=gbm_param_grid,
                        scoring='neg_mean_squared_error', cv=4, verbose=1)
grid_mse.fit(X, y)
# Print the best parameters and lowest RMSE
print("Best parameters found: ", grid_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))
----
Random search with XGBoost
----
# Import RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
# Create the parameter grid: gbm_param_grid
gbm_param_grid = {
    'n_estimators': [25],
    'max_depth': range(2, 12)
}
# Instantiate the regressor: gbm
gbm = xgb.XGBRegressor(n_estimators=10)
# Perform random search: randomized_mse
randomized_mse = RandomizedSearchCV(estimator=gbm, param_distributions=gbm_param_grid,
                                    n_iter=5, scoring='neg_mean_squared_error', cv=4, verbose=1)
randomized_mse.fit(X, y)
# Print the best parameters and lowest RMSE
print("Best parameters found: ", randomized_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(randomized_mse.best_score_)))
----
Encoding categorical columns I: LabelEncoder
----
# Import LabelEncoder
from sklearn.preprocessing import LabelEncoder
# Fill missing values with 0
df.LotFrontage = df.LotFrontage.fillna(0)
# Create a boolean mask for categorical columns
categorical_mask = (df.dtypes == object)
# Get list of categorical column names
categorical_columns = df.columns[categorical_mask].tolist()
# Print the head of the categorical columns
print(df[categorical_columns].head())
# Create LabelEncoder object: le
le = LabelEncoder()
# Apply LabelEncoder to each categorical column
df[categorical_columns] = df[categorical_columns].apply(lambda x: le.fit_transform(x))
# Print the head of the LabelEncoded categorical columns
print(df[categorical_columns].head())
----
Encoding categorical columns II: OneHotEncoder
----
# Import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
# Create OneHotEncoder: ohe
# (the categorical_features argument requires an older scikit-learn release where it still exists)
ohe = OneHotEncoder(categorical_features=categorical_mask, sparse=False)
# Apply OneHotEncoder to categorical columns - output is no longer a DataFrame: df_encoded
df_encoded = ohe.fit_transform(df)
# Print the first 5 rows of the resulting dataset - again, this is no longer a pandas DataFrame
print(df_encoded[:5, :])
# Print the shape of the original DataFrame
print(df.shape)
# Print the shape of the transformed array
print(df_encoded.shape)
----
Encoding categorical columns III: DictVectorizer - the LabelEncoder-then-OneHotEncoder steps can be simplified into a single DictVectorizer
----
# Import DictVectorizer
from sklearn.feature_extraction import DictVectorizer
# Convert df into a list of row dictionaries: df_dict
df_dict = df.to_dict("records")
# Create the DictVectorizer object: dv
dv = DictVectorizer(sparse=False)
# Apply dv on df: df_encoded
df_encoded = dv.fit_transform(df_dict)
# Print the resulting first five rows
print(df_encoded[:5, :])
# Print the vocabulary (feature name to column index mapping)
print(dv.vocabulary_)
----
Preprocessing within a pipeline
----
# Import necessary modules
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
# Fill LotFrontage missing values with 0
X.LotFrontage = X.LotFrontage.fillna(0)
# Setup the pipeline steps: steps
steps = [("ohe_onestep", DictVectorizer(sparse=False)),
         ("xgb_model", xgb.XGBRegressor())]
# Create the pipeline: xgb_pipeline
xgb_pipeline = Pipeline(steps)
# Fit the pipeline
xgb_pipeline.fit(X.to_dict("records"), y)
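A short usage sketch (not in the original exercise): once fitted, the pipeline handles the dict conversion, encoding and prediction in a single call; here it simply predicts back on the training rows for illustration.
preds = xgb_pipeline.predict(X.to_dict("records"))
print(preds[:5])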
----
Cross-validating your XGBoost model (IMPORTANT)
----
# Import necessary modules
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
# Fill LotFrontage missing values with 0
X.LotFrontage = X.LotFrontage.fillna(0)
# Setup the pipeline steps: steps
steps = [("ohe_onestep", DictVectorizer(sparse=False)),
         ("xgb_model", xgb.XGBRegressor(max_depth=2, objective="reg:linear"))]
# Create the pipeline: xgb_pipeline
xgb_pipeline = Pipeline(steps)
# Cross-validate the model
cross_val_scores = cross_val_score(xgb_pipeline, X.to_dict("records"), y, cv=10, scoring="neg_mean_squared_error")
# Print the 10-fold RMSE
print("10-fold RMSE: ", np.mean(np.sqrt(np.abs(cross_val_scores))))
----
KIDNEY CASE STUDY - continued in the next gist
----
Note (from the gist comments): consider setting the "scale_pos_weight" parameter for imbalanced binary classification tasks, such as churn datasets.
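A minimal sketch of that suggestion (not part of the original exercises, assuming the churn X_train/y_train split from the first snippet): scale_pos_weight is commonly set to the ratio of negative to positive examples.
# Weight the positive (churn) class by the negative/positive ratio in the training labels
ratio = float(np.sum(y_train == 0)) / np.sum(y_train == 1)
xg_cl_weighted = xgb.XGBClassifier(objective='binary:logistic', n_estimators=10,
                                   scale_pos_weight=ratio, seed=123)
xg_cl_weighted.fit(X_train, y_train)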