lakshay-arora · November 4, 2019 07:28
diff --git a/pipeline_4_pyspark.py b/pipeline_4_pyspark.py
 # Stage 1: convert feature_2 into a numeric index column.
 stage_1 = StringIndexer(
     inputCol='feature_2',
     outputCol='feature_2_index',
 )
 # Stage 2: convert feature_3 into a numeric index column.
 stage_2 = StringIndexer(
     inputCol='feature_3',
     outputCol='feature_3_index',
 )
 # Stage 3: one-hot encode the index columns produced by stages 1 and 2.
 # The input names are read back from those stages so the wiring stays in sync.
 stage_3 = OneHotEncoderEstimator(
     inputCols=[
         stage_1.getOutputCol(),
         stage_2.getOutputCol(),
     ],
     outputCols=[
         'feature_2_encoded',
         'feature_3_encoded',
     ],
 )
 # Stage 4: assemble the columns used for training into one 'features' vector.
 stage_4 = VectorAssembler(
     inputCols=[
         'feature_1',
         'feature_2_encoded',
         'feature_3_encoded',
         'feature_4',
     ],
     outputCol='features',
 )
 # Stage 5: logistic regression reading 'features' and the 'label' column.
 stage_5 = LogisticRegression(
     featuresCol='features',
     labelCol='label',
 )

 # Chain the five stages, in order, into a single pipeline.
 regression_pipeline = Pipeline(
     stages=[stage_1, stage_2, stage_3, stage_4, stage_5],
 )

 # Fit the pipeline on the training data.
 model = regression_pipeline.fit(sample_data_train)
 # Apply the fitted pipeline; this rebinds sample_data_train to the
 # transformed DataFrame.
 sample_data_train = model.transform(sample_data_train)

 # Display the assembled features and label next to the rawPrediction,
 # probability and prediction columns.
 sample_data_train.select(
     'features',
     'label',
     'rawPrediction',
     'probability',
     'prediction',
 ).show()
	# Stage 1: convert feature_2 into a numeric index column.
	stage_1 = StringIndexer(
		inputCol='feature_2',
		outputCol='feature_2_index',
	)
	# Stage 2: convert feature_3 into a numeric index column.
	stage_2 = StringIndexer(
		inputCol='feature_3',
		outputCol='feature_3_index',
	)
	# Stage 3: one-hot encode the index columns produced by stages 1 and 2.
	# The input names are read back from those stages so the wiring stays in sync.
	stage_3 = OneHotEncoderEstimator(
		inputCols=[
			stage_1.getOutputCol(),
			stage_2.getOutputCol(),
		],
		outputCols=[
			'feature_2_encoded',
			'feature_3_encoded',
		],
	)
	# Stage 4: assemble the columns used for training into one 'features' vector.
	stage_4 = VectorAssembler(
		inputCols=[
			'feature_1',
			'feature_2_encoded',
			'feature_3_encoded',
			'feature_4',
		],
		outputCol='features',
	)
	# Stage 5: logistic regression reading 'features' and the 'label' column.
	stage_5 = LogisticRegression(
		featuresCol='features',
		labelCol='label',
	)

	# Chain the five stages, in order, into a single pipeline.
	regression_pipeline = Pipeline(
		stages=[stage_1, stage_2, stage_3, stage_4, stage_5],
	)

	# Fit the pipeline on the training data.
	model = regression_pipeline.fit(sample_data_train)
	# Apply the fitted pipeline; this rebinds sample_data_train to the
	# transformed DataFrame.
	sample_data_train = model.transform(sample_data_train)

	# Display the assembled features and label next to the rawPrediction,
	# probability and prediction columns.
	sample_data_train.select(
		'features',
		'label',
		'rawPrediction',
		'probability',
		'prediction',
	).show()