Created
October 15, 2018 20:19
-
-
Save HarshSingh16/6568b0bfd046ceec238b16eff9003873 to your computer and use it in GitHub Desktop.
Building a predictive model to predict survival on the Titanic data set
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
######## Loading the Titanic train data set
# `train1` is not created in this script; it must already exist in the
# workspace before this line runs.
TitanicTrain <- train1

######## Checking missing values in the train data set
# Number of NA entries per column (vapply keeps the result type fixed).
vapply(TitanicTrain, function(col) sum(is.na(col)), integer(1))

######## Loading the Titanic test data set
# `test11` must likewise already exist in the workspace.
TitanicTest <- test11

######## Checking missing values in the test data set
vapply(TitanicTest, function(col) sum(is.na(col)), integer(1))

######## Adding the Survived variable to the test data set
# Filled with NA so that train and test share the same set of columns and can
# be stacked with rbind().
TitanicTest$Survived <- NA

######## Merging the two data sets
# From here on `TitanicTrain` holds train AND test rows, so every cleaning
# step below is applied to both. rbind() on data frames matches columns by
# name, so it does not matter that `Survived` was appended last above.
TitanicTrain <- rbind(TitanicTrain, TitanicTest)
######## Making Sex a factor variable
TitanicTrain$Sex <- as.factor(TitanicTrain$Sex)

######## Extracting the titles from the Name column
# The pattern deletes the text in front of the title (through ", ") and the
# text behind it (from the "." onwards), leaving only the title itself.
TitanicTrain$Title <- gsub('(.*, )|(\\..*)', '', TitanicTrain$Name)

######## Fixing the missing values in "Age" with the median
# The column is left in its original type while imputing; assigning the
# numeric median directly avoids a character round trip.
ROWS <- which(is.na(TitanicTrain$Age))
MedianAge <- median(TitanicTrain$Age, na.rm = TRUE)
TitanicTrain$Age[ROWS] <- MedianAge

######## Fixing the missing values in "Embarked"
# Treat both NA and the empty string as missing: which of the two appears
# depends on how the CSV was read. Missing entries are set to "S"
# (hard-coded; presumably the most frequent port -- verify against the data).
ROWS2 <- which(is.na(TitanicTrain$Embarked) | TitanicTrain$Embarked == "")
TitanicTrain$Embarked[ROWS2] <- "S"

######## Fixing the missing values in "Fare"
# Impute with the median of the observed fares, mirroring the Age handling
# (the script previously hard-coded the value 14.45 here).
Rows3 <- which(is.na(TitanicTrain$Fare))
TitanicTrain$Fare[Rows3] <- median(TitanicTrain$Fare, na.rm = TRUE)
######## Converting the variables to the types used for modelling
# Categorical columns become factors. This is done on the combined train+test
# rows, so both parts end up with the same factor levels.
factor_cols <- c("Survived", "Pclass", "SibSp", "Parch", "Embarked", "Title")
TitanicTrain[factor_cols] <- lapply(TitanicTrain[factor_cols], as.factor)

# Make sure Age is numeric (a no-op if it already is).
TitanicTrain$Age <- as.numeric(TitanicTrain$Age)

######## Removing "Ticket" and "Cabin"
# Per the original author's note, these are dropped because of their share of
# missing values and because they are not expected to help the model.
TitanicTrain$Ticket <- NULL
TitanicTrain$Cabin <- NULL
# Name can go as well: the title has already been extracted from it.
TitanicTrain$Name <- NULL

######## Converting the prepared data to a plain data frame
TitanicTrain <- data.frame(TitanicTrain)

######## Making PassengerId numeric so the rows can be split by ID
# Going through as.character() yields the actual ID values even if the column
# happens to be a factor; as.numeric() applied directly to a factor would
# return the internal level codes instead of the IDs.
TitanicTrain$PassengerId <- as.numeric(as.character(TitanicTrain$PassengerId))

# IDs below 892 are the labelled training rows; IDs from 892 upwards are the
# rows whose outcome has to be predicted.
Train <- subset(TitanicTrain, PassengerId < 892)
Test <- subset(TitanicTrain, PassengerId >= 892)
######## Separating the independent (x) and the dependent ("Survived") variables
# Predictors are every column except the row identifier and the outcome.
# Selecting by name (rather than by position) keeps x and xtest aligned even
# if the column layout changes.
predictor_cols <- setdiff(names(Train), c("PassengerId", "Survived"))

# Survived is a factor here; convert via its labels so y holds the recorded
# 0/1 values rather than the factor's internal level codes.
y <- as.numeric(as.character(Train$Survived))
x <- data.frame(Train[, predictor_cols, drop = FALSE])

# The outcome is unknown (NA) for the rows to predict, so drop it from Test.
Test$Survived <- NULL
xtest <- data.frame(Test[, predictor_cols, drop = FALSE])

######## Finally checking the structure of all the created objects
str(y)
str(x)
str(xtest)
######## Loading the SuperLearner library
library(SuperLearner)

######## Training the model using SuperLearner (ensemble modelling)
# Fix the RNG seed so that repeated runs are comparable, as far as the
# learners draw their randomness from R's RNG. The cutoff used further down
# was picked by looking at one run's histogram, so repeatability matters.
set.seed(1)
single.model2 <- SuperLearner(
  Y = y,
  X = x,
  family = binomial(),
  SL.library = c(
    "SL.ranger", "SL.ksvm", "SL.gbm", "SL.xgboost",
    "SL.glmnet", "SL.randomForest"
  )
)

######## Printing the model to check the risk estimates / error
print(single.model2)

######## Making predictions on the test data
# Dispatches to the SuperLearner predict method; `$pred` holds the ensemble
# predictions used below.
predictions3 <- predict(single.model2, newdata = xtest)

######## Observing the frequency distribution of the predictions
hist(predictions3$pred)

######## Converting the predictions to binaries
# 0.73 is a hand-picked cutoff: predictions at or above it become 1, all
# others 0.
predictions4 <- ifelse(predictions3$pred >= 0.73, 1, 0)

######## Creating a CSV file with the predictions
# NOTE(review): the file contains only the 0/1 predictions in xtest row
# order, with no PassengerId column -- confirm this is the format the
# consumer of "Predictions.csv" expects.
write.csv(predictions4, "Predictions.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment