HarshSingh16 · October 15, 2018 20:19
diff --git a/Surviving Titanic.R b/Surviving Titanic.R
 ### Start from the Titanic training data (`train1` must already exist in the session)
 TitanicTrain <- train1

 ### Count the missing values in every column of the training data
 vapply(TitanicTrain, function(col) sum(is.na(col)), integer(1))

 ### Take the Titanic test data (`test11` must already exist in the session)
 TitanicTest <- test11

 ### Count the missing values in every column of the test data
 vapply(TitanicTest, function(col) sum(is.na(col)), integer(1))

 ### Give the test data a Survived column filled with NA so that it has the
 ### same columns as the training data and the two can be stacked
 TitanicTest$Survived <- NA

 ### Stack the test rows under the training rows; the combined data keeps the
 ### name TitanicTrain from here on
 TitanicTrain <- rbind(TitanicTrain, TitanicTest)

 ### Encode passenger sex as a factor
 TitanicTrain[["Sex"]] <- as.factor(TitanicTrain[["Sex"]])

 ### Derive Title from Name: the pattern matches any text ending in ", " and
 ### any text starting at a "."; every match is deleted, leaving what is between
 TitanicTrain[["Title"]] <- gsub(
   pattern = '(.*, )|(\\..*)',
   replacement = '',
   x = TitanicTrain[["Name"]]
 )

 ### Impute missing Age values with the median of the observed ages.
 ### The column stays numeric throughout: the median is written straight into
 ### the NA positions instead of first coercing Age to character, so no
 ### implicit numeric-to-character coercion (and no later undo) is needed.
 ROWS <- which(is.na(TitanicTrain$Age))
 MedianAge <- median(TitanicTrain$Age, na.rm = TRUE)
 TitanicTrain$Age[ROWS] <- MedianAge


 ### Impute missing Embarked values with "S"
 ### NOTE(review): "S" is presumably the most frequent port -- confirm against the data
 ROWS2 <- which(is.na(TitanicTrain$Embarked))
 TitanicTrain$Embarked[ROWS2] <- "S"

 ### Impute missing Fare values with the fixed value 14.45
 ### NOTE(review): 14.45 looks like a rounded median fare -- confirm; it is hard-coded
 Rows3 <- which(is.na(TitanicTrain$Fare))
 TitanicTrain$Fare[Rows3] <- 14.45

 ### Recode the categorical columns as factors in a single pass
 factor_cols <- c("Survived", "PassengerId", "Pclass", "SibSp", "Parch",
                  "Embarked", "Title")
 TitanicTrain[factor_cols] <- lapply(TitanicTrain[factor_cols], as.factor)

 ### Age is stored as a number
 TitanicTrain[["Age"]] <- as.numeric(TitanicTrain[["Age"]])

 ### Drop Ticket and Cabin (per the original author's note: largely missing and
 ### not useful to the model), and Name, whose Title has already been extracted
 TitanicTrain[["Ticket"]] <- NULL
 TitanicTrain[["Cabin"]] <- NULL
 TitanicTrain[["Name"]] <- NULL


 ### Make sure the prepared data is a plain data.frame
 TitanicTrain <- data.frame(TitanicTrain)

 ### Restore PassengerId to its numeric values so the rows can be split by id.
 ### Going through as.character() matters: as.numeric() applied directly to a
 ### factor returns the internal level codes, not the original ids.
 TitanicTrain$PassengerId <- as.numeric(as.character(TitanicTrain$PassengerId))

 ### Rows with PassengerId below 892 are used for training; the remaining rows
 ### are the observations to predict
 ### NOTE(review): 892 is presumably the first id of the test set -- confirm
 Train <- subset(TitanicTrain, PassengerId < 892)
 Test <- subset(TitanicTrain, PassengerId >= 892)

 ### Response: integer codes of Survived shifted down by one
 ### (assumes Survived is a factor with levels "0" and "1", so codes 1/2
 ### become 0/1 -- TODO confirm)
 y <- as.numeric(Train$Survived) - 1

 ### Predictors: every column except the id and the response, selected by name
 ### rather than by position so that x and xtest always carry the same columns
 predictor_cols <- setdiff(names(Train), c("PassengerId", "Survived"))
 x <- data.frame(Train[, predictor_cols, drop = FALSE])
 Test$Survived <- NULL
 xtest <- data.frame(Test)
 xtest$PassengerId <- NULL

 ### Inspect the structure of the response and of both predictor sets
 str(y)
 str(x)
 str(xtest)


 ### Attach the SuperLearner package
 library(SuperLearner)

 ### Fit the ensemble: outcome y on predictors x with a binomial family,
 ### combining the six base learners listed in SL.library
 single.model2 <- SuperLearner(
   Y = y,
   X = x,
   family = binomial(),
   SL.library = list(
     "SL.ranger", "SL.ksvm", "SL.gbm",
     "SL.xgboost", "SL.glmnet", "SL.randomForest"
   )
 )

 ### Print the fitted ensemble to check the risk estimates
 print(single.model2)


 ### Score the rows of xtest with the fitted ensemble
 predictions3 <- predict(single.model2, newdata = xtest)
 ### Plot the frequency distribution of the predicted values
 hist(predictions3$pred)

 ### Turn the predicted values into 0/1 labels: 1 when the value is at least 0.73
 predictions4 <- ifelse(predictions3$pred >= 0.73, 1, 0)


 ### Save the labels to a CSV file
 ### NOTE(review): the file holds only the labels, no PassengerId column --
 ### confirm that this is what the consumer of the file expects
 write.csv(predictions4, file = "Predictions.csv")
	### Start from the Titanic training data (`train1` must already exist in the session)
	TitanicTrain <- train1

	### Count the missing values in every column of the training data
	vapply(TitanicTrain, function(col) sum(is.na(col)), integer(1))

	### Take the Titanic test data (`test11` must already exist in the session)
	TitanicTest <- test11

	### Count the missing values in every column of the test data
	vapply(TitanicTest, function(col) sum(is.na(col)), integer(1))

	### Give the test data a Survived column filled with NA so that it has the
	### same columns as the training data and the two can be stacked
	TitanicTest$Survived <- NA

	### Stack the test rows under the training rows; the combined data keeps the
	### name TitanicTrain from here on
	TitanicTrain <- rbind(TitanicTrain, TitanicTest)

	### Encode passenger sex as a factor
	TitanicTrain$Sex <- as.factor(TitanicTrain$Sex)

	### Derive Title from Name: the pattern matches any text ending in ", " and
	### any text starting at a "."; every match is deleted, leaving what is between.
	### The alternation bar must not be backslash-escaped ("\|" is not a valid
	### escape in an R string) and both groups need the "*" quantifier so that
	### they can span more than one character.
	TitanicTrain$Title <- gsub('(.*, )|(\\..*)', '', TitanicTrain$Name)

	### Impute missing Age values with the median of the observed ages.
	### The column stays numeric throughout: the median is written straight into
	### the NA positions instead of first coercing Age to character, so no
	### implicit numeric-to-character coercion (and no later undo) is needed.
	ROWS <- which(is.na(TitanicTrain$Age))
	MedianAge <- median(TitanicTrain$Age, na.rm = TRUE)
	TitanicTrain$Age[ROWS] <- MedianAge


	### Impute missing Embarked values with "S"
	### NOTE(review): "S" is presumably the most frequent port -- confirm against the data
	ROWS2 <- which(is.na(TitanicTrain$Embarked))
	TitanicTrain$Embarked[ROWS2] <- "S"

	### Impute missing Fare values with the fixed value 14.45
	### NOTE(review): 14.45 looks like a rounded median fare -- confirm; it is hard-coded
	Rows3 <- which(is.na(TitanicTrain$Fare))
	TitanicTrain$Fare[Rows3] <- 14.45

	### Recode the categorical columns as factors in a single pass
	factor_cols <- c("Survived", "PassengerId", "Pclass", "SibSp", "Parch",
	                 "Embarked", "Title")
	TitanicTrain[factor_cols] <- lapply(TitanicTrain[factor_cols], as.factor)

	### Age is stored as a number
	TitanicTrain[["Age"]] <- as.numeric(TitanicTrain[["Age"]])

	### Drop Ticket and Cabin (per the original author's note: largely missing and
	### not useful to the model), and Name, whose Title has already been extracted
	TitanicTrain[["Ticket"]] <- NULL
	TitanicTrain[["Cabin"]] <- NULL
	TitanicTrain[["Name"]] <- NULL


	### Make sure the prepared data is a plain data.frame
	TitanicTrain <- data.frame(TitanicTrain)

	### Restore PassengerId to its numeric values so the rows can be split by id.
	### Going through as.character() matters: as.numeric() applied directly to a
	### factor returns the internal level codes, not the original ids.
	TitanicTrain$PassengerId <- as.numeric(as.character(TitanicTrain$PassengerId))

	### Rows with PassengerId below 892 are used for training; the remaining rows
	### are the observations to predict
	### NOTE(review): 892 is presumably the first id of the test set -- confirm
	Train <- subset(TitanicTrain, PassengerId < 892)
	Test <- subset(TitanicTrain, PassengerId >= 892)

	### Response: integer codes of Survived shifted down by one
	### (assumes Survived is a factor with levels "0" and "1", so codes 1/2
	### become 0/1 -- TODO confirm)
	y <- as.numeric(Train$Survived) - 1

	### Predictors: every column except the id and the response, selected by name
	### rather than by position so that x and xtest always carry the same columns
	predictor_cols <- setdiff(names(Train), c("PassengerId", "Survived"))
	x <- data.frame(Train[, predictor_cols, drop = FALSE])
	Test$Survived <- NULL
	xtest <- data.frame(Test)
	xtest$PassengerId <- NULL

	### Inspect the structure of the response and of both predictor sets
	str(y)
	str(x)
	str(xtest)


	### Attach the SuperLearner package
	library(SuperLearner)

	### Fit the ensemble: outcome y on predictors x with a binomial family,
	### combining the six base learners listed in SL.library
	single.model2 <- SuperLearner(
	  Y = y,
	  X = x,
	  family = binomial(),
	  SL.library = list(
	    "SL.ranger", "SL.ksvm", "SL.gbm",
	    "SL.xgboost", "SL.glmnet", "SL.randomForest"
	  )
	)

	### Print the fitted ensemble to check the risk estimates
	print(single.model2)


	### Score the rows of xtest with the fitted ensemble
	predictions3 <- predict(single.model2, newdata = xtest)
	### Plot the frequency distribution of the predicted values
	hist(predictions3$pred)

	### Turn the predicted values into 0/1 labels: 1 when the value is at least 0.73
	predictions4 <- ifelse(predictions3$pred >= 0.73, 1, 0)


	### Save the labels to a CSV file
	### NOTE(review): the file holds only the labels, no PassengerId column --
	### confirm that this is what the consumer of the file expects
	write.csv(predictions4, file = "Predictions.csv")