vidit0210 · February 29, 2020 20:55
diff --git a/KIDNEY CASE STUDY b/KIDNEY CASE STUDY
 DataLink : https://archive.ics.uci.edu/ml/datasets/chronic_kidney_disease

 # Import necessary modules
 from sklearn_pandas import DataFrameMapper
 from sklearn_pandas import CategoricalImputer

 # Report how many missing values each feature column of X contains
 nulls_per_column = X.isnull().sum()
 print(nulls_per_column)

 # Object-dtype columns are the ones handled as categorical below
 categorical_feature_mask = X.dtypes == object

 # Partition the column names by that mask
 non_categorical_columns = X.columns[~categorical_feature_mask].tolist()
 categorical_columns = X.columns[categorical_feature_mask].tolist()

 # Median-impute every non-categorical column, one mapper entry per column.
 # The selector is a one-element list; per the sklearn-pandas docs a list
 # selector hands the transformer a 2-D, single-column input.
 # input_df=True / df_out=True ask the mapper to pass pandas objects in and
 # to return a DataFrame.
 numeric_imputation_mapper = DataFrameMapper(
     [
         ([column_name], Imputer(strategy="median"))
         for column_name in non_categorical_columns
     ],
     input_df=True,
     df_out=True,
 )

 # Impute every categorical column, one mapper entry per column.
 # Here the selector is the bare column name, which (per the sklearn-pandas
 # docs) hands the transformer the column as 1-D input.
 categorical_imputation_mapper = DataFrameMapper(
     [
         (column_name, CategoricalImputer())
         for column_name in categorical_columns
     ],
     input_df=True,
     df_out=True,
 )
 -----
 Kidney disease case study II: Feature Union
 -----
 # Import FeatureUnion
 from sklearn.pipeline import FeatureUnion

 # Apply both imputation mappers to the same input and join their outputs
 # into one feature set (numeric block first, categorical block second)
 numeric_categorical_union = FeatureUnion(
     transformer_list=[
         ("num_mapper", numeric_imputation_mapper),
         ("cat_mapper", categorical_imputation_mapper),
     ]
 )
                       
 ------
 Kidney disease case study III: Full pipeline
 ------
 # Assemble the end-to-end pipeline: imputation union, Dictifier,
 # DictVectorizer, then an XGBoost classifier limited to depth 3.
 # NOTE(review): Dictifier is not defined in this snippet; presumably it
 # turns the union output into per-row dicts for DictVectorizer - confirm.
 pipeline = Pipeline(
     steps=[
         ("featureunion", numeric_categorical_union),
         ("dictifier", Dictifier()),
         ("vectorizer", DictVectorizer(sort=False)),
         ("clf", xgb.XGBClassifier(max_depth=3)),
     ]
 )

 # Score the whole pipeline with 3-fold cross-validation, using ROC AUC.
 # NOTE(review): this section passes kidney_data while the search section
 # fits on X - confirm both are the intended inputs.
 cross_val_scores = cross_val_score(
     pipeline,
     kidney_data,
     y,
     scoring="roc_auc",
     cv=3,
 )

 # Report the mean AUC over the folds
 print("3-fold AUC: ", np.mean(cross_val_scores))
 -------
 Bringing it all together
 ---------
 # Candidate hyperparameter values to sample from. The "clf__" prefix
 # targets the pipeline step named "clf".
 gbm_param_grid = {
     "clf__learning_rate": np.arange(0.05, 1, 0.05),
     "clf__max_depth": np.arange(3, 10, 1),
     "clf__n_estimators": np.arange(50, 200, 50),
 }

 # Randomized search over those values: 2 sampled settings, 2-fold CV,
 # scored by ROC AUC, with progress output enabled
 randomized_roc_auc = RandomizedSearchCV(
     estimator=pipeline,
     param_distributions=gbm_param_grid,
     n_iter=2,
     scoring="roc_auc",
     cv=2,
     verbose=1,
 )

 # Run the search on X and y
 randomized_roc_auc.fit(X, y)

 # Show the best cross-validated score and the corresponding best estimator
 print(randomized_roc_auc.best_score_)
 print(randomized_roc_auc.best_estimator_)
 ------
	DataLink : https://archive.ics.uci.edu/ml/datasets/chronic_kidney_disease

	# Import necessary modules
	from sklearn_pandas import DataFrameMapper
	from sklearn_pandas import CategoricalImputer

	# Report how many missing values each feature column of X contains
	nulls_per_column = X.isnull().sum()
	print(nulls_per_column)

	# Object-dtype columns are the ones handled as categorical below
	categorical_feature_mask = X.dtypes == object

	# Partition the column names by that mask
	non_categorical_columns = X.columns[~categorical_feature_mask].tolist()
	categorical_columns = X.columns[categorical_feature_mask].tolist()

	# Median-impute every non-categorical column, one mapper entry per column.
	# The selector is a one-element list; per the sklearn-pandas docs a list
	# selector hands the transformer a 2-D, single-column input.
	# input_df=True / df_out=True ask the mapper to pass pandas objects in and
	# to return a DataFrame.
	numeric_imputation_mapper = DataFrameMapper(
	    [
	        ([column_name], Imputer(strategy="median"))
	        for column_name in non_categorical_columns
	    ],
	    input_df=True,
	    df_out=True,
	)

	# Impute every categorical column, one mapper entry per column.
	# Here the selector is the bare column name, which (per the sklearn-pandas
	# docs) hands the transformer the column as 1-D input.
	categorical_imputation_mapper = DataFrameMapper(
	    [
	        (column_name, CategoricalImputer())
	        for column_name in categorical_columns
	    ],
	    input_df=True,
	    df_out=True,
	)
	-----
	Kidney disease case study II: Feature Union
	-----
	# Import FeatureUnion
	from sklearn.pipeline import FeatureUnion

	# Apply both imputation mappers to the same input and join their outputs
	# into one feature set (numeric block first, categorical block second)
	numeric_categorical_union = FeatureUnion(
	    transformer_list=[
	        ("num_mapper", numeric_imputation_mapper),
	        ("cat_mapper", categorical_imputation_mapper),
	    ]
	)

	------
	Kidney disease case study III: Full pipeline
	------
	# Assemble the end-to-end pipeline: imputation union, Dictifier,
	# DictVectorizer, then an XGBoost classifier limited to depth 3.
	# NOTE(review): Dictifier is not defined in this snippet; presumably it
	# turns the union output into per-row dicts for DictVectorizer - confirm.
	pipeline = Pipeline(
	    steps=[
	        ("featureunion", numeric_categorical_union),
	        ("dictifier", Dictifier()),
	        ("vectorizer", DictVectorizer(sort=False)),
	        ("clf", xgb.XGBClassifier(max_depth=3)),
	    ]
	)

	# Score the whole pipeline with 3-fold cross-validation, using ROC AUC.
	# NOTE(review): this section passes kidney_data while the search section
	# fits on X - confirm both are the intended inputs.
	cross_val_scores = cross_val_score(
	    pipeline,
	    kidney_data,
	    y,
	    scoring="roc_auc",
	    cv=3,
	)

	# Report the mean AUC over the folds
	print("3-fold AUC: ", np.mean(cross_val_scores))
	-------
	Bringing it all together
	---------
	# Candidate hyperparameter values to sample from. The "clf__" prefix
	# targets the pipeline step named "clf".
	gbm_param_grid = {
	    "clf__learning_rate": np.arange(0.05, 1, 0.05),
	    "clf__max_depth": np.arange(3, 10, 1),
	    "clf__n_estimators": np.arange(50, 200, 50),
	}

	# Randomized search over those values: 2 sampled settings, 2-fold CV,
	# scored by ROC AUC, with progress output enabled
	randomized_roc_auc = RandomizedSearchCV(
	    estimator=pipeline,
	    param_distributions=gbm_param_grid,
	    n_iter=2,
	    scoring="roc_auc",
	    cv=2,
	    verbose=1,
	)

	# Run the search on X and y
	randomized_roc_auc.fit(X, y)

	# Show the best cross-validated score and the corresponding best estimator
	print(randomized_roc_auc.best_score_)
	print(randomized_roc_auc.best_estimator_)
	------