The Titanic Dataset

Datasets

Several versions of the Titanic dataset circulate online, with slightly different row counts.

The following analysis uses the Titanic training and test datasets, together with some of the R scripts presented in DataCamp's free Titanic tutorial.

I have edited some of the scripts to clarify certain points and to add figures and sub-analyses.

# List the objects currently in the workspace
ls()
# Clear the workspace before starting
rm(list = ls())
# Import train and test (or load them as usual if already saved locally)
train_url <- "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
train <- read.csv(train_url) 
test_url <- "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
test <- read.csv(test_url)
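As a quick sanity check (the standard Kaggle split has 891 training rows and 418 test rows; other mirrors may differ slightly):
dim(train)   #expected: 891 rows, 12 columns
dim(test)    #expected: 418 rows, 11 columns (no Survived)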

#explore the data
# Print train and test to the console
train
test
str(train)
str(test)
colnames(train)
summary(train)
summary(test)
#table of survived
table(train$Survived)
0   1 
549 342
# proportions of survived
prop.table(table(train$Survived))
        0         1 
0.6161616 0.3838384 
#compare survivors depending on sex (absolute counts)
table(train$Sex, train$Survived)
           0   1
  female  81 233
  male   468 109
#row-wise proportions (margin = 1: each row sums to 1)
prop.table(table(train$Sex, train$Survived),1)
              0         1
  female 0.2579618 0.7420382
  male   0.8110919 0.1889081
These are the proportions within each sex: about 74% of female passengers survived, against about 19% of male passengers.
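For contrast, margin 2 makes each column sum to 1 and answers a different question (what share of the survivors were female?):
#column-wise proportions: composition of the survivor and non-survivor groups
prop.table(table(train$Sex, train$Survived), 2)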
#make a grouped bar chart with direct labels with Plotly
#load plotly if not already loaded (it also provides the %>% pipe)
library(plotly)
sex <- c('females', 'males')
survived <- c(233,109)
not_survived <- c(81, 468)
text <- c('females', 'males')
data <- data.frame(sex, survived, not_survived, text)
data

fig04 <- data %>% plot_ly()
fig04 <- fig04 %>% add_trace(x = ~sex, y = ~survived, name = "survived", type = 'bar',
                         text = survived, textposition = 'auto',
                         marker = list(color = 'rgb(158,202,225)',
                                       line = list(color = 'rgb(8,48,107)', width = 1.5)))
fig04 <- fig04 %>% add_trace(x = ~sex, y = ~not_survived, name = "not_survived", type = 'bar',
                         text = not_survived, textposition = 'auto',
                         marker = list(color = 'rgb(58,200,225)',
                                       line = list(color = 'rgb(8,48,107)', width = 1.5)))
fig04 <- fig04 %>% layout(title = "Compare survivors depending on sex",
                      barmode = 'group',
                      xaxis = list(title = ""),
                      yaxis = list(title = ""))

fig04
#check whether age was relevant for survivors
#create a new logical column called "lucky"
train$lucky <- NA
# and we assign the lucky variable to survivors
train$lucky[train$Survived == 1] <- TRUE
train$lucky[train$Survived == 0] <- FALSE
# Check whether this new variable works
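A minimal check (the FALSE/TRUE counts should mirror the 549/342 split in the Survived table above):
table(train$lucky)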
#exercise
# Your train and test set are still loaded in
str(train)
str(test)
# Create the column Child: 1 for passengers under 18 years of age, 0 otherwise
train$Child <- NA
train$Child[train$Age < 18] <- 1
train$Child[train$Age >= 18] <- 0

#recheck the new column
colnames(train)
# Two-way comparison between children and survivors
prop.table(table(train$Child, train$Survived), 1)
         0         1
  0 0.6189684 0.3810316
  1 0.4601770 0.5398230
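Note that Age is missing for many passengers, so Child is NA there and table() silently drops those rows. A quick way to see the missing cases:
#count the Child values, including NAs for passengers with unknown Age
table(train$Child, useNA = "ifany")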
#code for grouped Bar Chart with Direct Labels with Plotly
Children <-  c("Children", "Not_children")
survived <- c(0.54, 0.38)
not_survived <- c(0.46, 0.62)
data <- data.frame(Children, survived, not_survived) 
data

fig05 <- data %>% plot_ly()
fig05 <- fig05 %>% add_trace(x = ~Children, y = ~survived, name = "survived", type = 'bar',
                         text = survived, textposition = 'auto',
                         marker = list(color = 'rgb(158,202,225)',
                                       line = list(color = 'rgb(8,48,107)', width = 1.5)))
fig05 <- fig05 %>% add_trace(x = ~Children, y = ~not_survived, name = "not_survived", type = 'bar',
                         text = not_survived, textposition = 'auto',
                         marker = list(color = 'rgb(58,200,225)',
                                       line = list(color = 'rgb(8,48,107)', width = 1.5)))
fig05 <- fig05 %>% layout(title = "Two-way comparison between children and overall survivors - proportion",
                      barmode = 'group',
                      xaxis = list(title = ""),
                      yaxis = list(title = ""))

fig05
#check connections between the class and survivors
colnames(train)
 [1] "PassengerId" "Survived"    "Pclass"      "Name"        "Sex"         "Age"         "SibSp"      
 [8] "Parch"       "Ticket"      "Fare"        "Cabin"       "Embarked"    "lucky"       "Child"      
#table of passengers divided per class
table(train$Pclass)
  1   2   3 
216 184 491 
#check how many passengers survived depending on the class, with 1=survived, 0=did not survive
table(train$Pclass, train$Survived)
      0   1
  1  80 136
  2  97  87
  3 372 119

#now check, within each class, the proportion of passengers who survived (row-wise proportions)

prop.table(table(train$Pclass, train$Survived),1)
            0         1
  1 0.3703704 0.6296296
  2 0.5271739 0.4728261
  3 0.7576375 0.2423625

#this means that 63% of first-class passengers survived, 47% of second-class passengers, and only 24% of third-class passengers.

#we check where they embarked (Southampton, Cherbourg, and Queenstown)

table(train$Embarked)

      C   Q   S 
  2 168  77 644 
#the unlabeled first count corresponds to two passengers with a blank Embarked value
table(train$Embarked, train$Survived)
      0   1
      0   2
  C  75  93
  Q  47  30
  S 427 217


#check the proportion of survived among those embarked in Southampton, Cherbourg, and Queenstown

prop.table(table(train$Embarked, train$Survived),1)
   
            0         1
    0.0000000 1.0000000
  C 0.4464286 0.5535714
  Q 0.6103896 0.3896104
  S 0.6630435 0.3369565

#55% of passengers embarked in Cherbourg survived. Note that third-class passengers were the first to board at Southampton Terminus station; the remaining passengers boarded at Cherbourg and then at Queenstown (https://en.wikipedia.org/wiki/Titanic).

#make a first prediction: create a new dataset called "test_one", a copy of test
test_one <- test
test_one
#create a new column called "Survived", initialized to 0: by default every passenger is predicted not to have survived
test_one$Survived <- 0
test_one
# If Sex is female, set Survived to 1
test_one$Survived[test$Sex == "female"] <- 1
test_one
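We cannot score this rule against test (it has no Survived column), but as a rough sketch we can check how the same "every female survives" rule fares on the training labels:
#training-set accuracy of the gender-only rule
mean((train$Sex == "female") == (train$Survived == 1))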
#Intro to decision trees, install packages
install.packages("rpart")

install.packages("rattle")
install.packages("rpart.plot")
install.packages("RColorBrewer")
#load packages
library(rpart)
library(rattle)
library(rpart.plot)
library(RColorBrewer)
#Create the first decision tree
my_tree <- rpart(Survived ~ Sex + Age,
                 data = train,
                 method ="class")

plot(my_tree)
text(my_tree)
#This is indeed a basic plot.

# Build the decision tree called my_tree_two; use rpart to specify the parameters of your model. The variable on the left of the ~ in the formula is the one you want to predict, here "Survived". For details about rpart, see ?rpart.

# Build the decision tree
my_tree_two <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked, data = train, method = "class")
my_tree_two
# Visualize the decision tree using plot() and text()
plot(my_tree_two)
text(my_tree_two)
#you get another basic decision tree like the one above
#If you have not previously done so, now you load the required packages
library(rattle)
library(rpart.plot)
library(RColorBrewer)
#they will help you in creating a nicer tree
fancyRpartPlot(my_tree_two)

#We have a decision tree and we can make use of the predict() function to "generate" our answer:
predict(my_tree_two, test, type = "class")
#make predictions using my_tree_two
my_prediction <- predict(my_tree_two, newdata = test, type = "class")
my_prediction
#pair each prediction with its PassengerId to see which test passengers are predicted to have survived
my_solution <- data.frame(PassengerId = test$PassengerId, Survived = my_prediction)
nrow(my_solution)
#if you inspect "my_solution" you get a list of PassengerId/Survived pairs, one per test passenger
#create a csv file with your solution
write.csv(my_solution, file = "my_solution.csv", row.names = FALSE) 
#we now grow a more complex decision tree by loosening the split constraints
#first we attempt a "super model" and check what we obtain as follows
super_model <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
                     data = train, method = "class", control = rpart.control(minsplit = 2, cp = 0))
fancyRpartPlot(super_model)
#as we can see, this model is very complex
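A quick way to quantify the overfitting (a sketch): score super_model on the very data it was grown on. A near-perfect training accuracy here says nothing about unseen passengers.
#training accuracy of the unconstrained tree; expected to be very high
mean(predict(super_model, train, type = "class") == train$Survived)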

#we change minsplit to 50 (minsplit is the minimum number of observations a node must contain for a split to be attempted; the default is 20, so a node with fewer than 20 records becomes a terminal node). cp = 0 means splitting is never stopped by the complexity penalty.

my_tree_three <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
                       data = train, method = "class", control = rpart.control(minsplit = 50, cp = 0))

# Visualize your new decision tree "my_tree_three"
fancyRpartPlot(my_tree_three)

#predict depending on the family size using SibSp and Parch. Create the new variable "family_size", the sum of SibSp and Parch plus one (the passenger themselves), on a copy of the train set; the sketch after the tree shows the matching step for test.

# Create train_two
train_two <- train
train_two$family_size <- train_two$SibSp + train_two$Parch + 1

# Finish the command
my_tree_four <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + family_size,
                      data = train_two, method = "class")

# Visualize your new decision tree
fancyRpartPlot(my_tree_four)
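Note that my_tree_four can only score passengers that also have a family_size column, so before predicting on test you would mirror the feature there (a sketch; test_two is a hypothetical name):
#mirror the family_size feature on a copy of the test set
test_two <- test
test_two$family_size <- test_two$SibSp + test_two$Parch + 1
predict(my_tree_four, test_two, type = "class")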
#make a prediction tree based on passengers' Title.
#First create the new variable on both datasets from the Name column, which contains the title (Miss, Mr, etc.)
#grab title from passenger name
train$Title<-gsub('(.*, )|(\\..*)', '', train$Name)
# Show title counts by sex
table(train$Sex, train$Title)
# Titles with very low cell counts to be combined to "rare" level
rare_title <- c('Dona', 'Lady', 'the Countess','Capt', 'Col', 'Don', 
                'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer')
# Also reassign mlle, ms, and mme accordingly
train$Title[train$Title == 'Mlle']        <- 'Miss' 
train$Title[train$Title == 'Ms']          <- 'Miss'
train$Title[train$Title == 'Mme']         <- 'Mrs' 
train$Title[train$Title %in% rare_title]  <- 'Rare Title'
# Show title counts by sex again
table(train$Sex, train$Title)

#NOW I DO IT AGAIN WITH TEST
#grab title from passenger name
test$Title<-gsub('(.*, )|(\\..*)', '', test$Name)
# Show title counts by sex
table(test$Sex, test$Title)
# Titles with very low cell counts to be combined to "rare" level
rare_title <- c('Dona', 'Lady', 'the Countess','Capt', 'Col', 'Don', 
                'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer')
# Also reassign mlle, ms, and mme accordingly
test$Title[test$Title == 'Mlle']        <- 'Miss' 
test$Title[test$Title == 'Ms']          <- 'Miss'
test$Title[test$Title == 'Mme']         <- 'Mrs' 
test$Title[test$Title %in% rare_title]  <- 'Rare Title'
# Show title counts by sex again
table(test$Sex, test$Title)
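Since the test block repeats the train block verbatim, the cleanup could be wrapped in a small helper to avoid the duplication (a sketch; clean_titles is a hypothetical name, not part of the tutorial):
#hypothetical helper: extract and consolidate titles for any data frame with a Name column
clean_titles <- function(df) {
  df$Title <- gsub('(.*, )|(\\..*)', '', df$Name)
  df$Title[df$Title == 'Mlle'] <- 'Miss'
  df$Title[df$Title == 'Ms']   <- 'Miss'
  df$Title[df$Title == 'Mme']  <- 'Mrs'
  df$Title[df$Title %in% rare_title] <- 'Rare Title'
  df
}
#train <- clean_titles(train)
#test  <- clean_titles(test)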
#the Title column already exists on train after the gsub() above;
#check that the new column "Title" is present
colnames(train)
#and do the same for test
colnames(test)

#create my_tree_five and the new prediction using Title
my_tree_five <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + Title,
                      data = train, method = "class")
# Visualize my_tree_five
fancyRpartPlot(my_tree_five)
# Make prediction
test_new <- test
colnames(test_new) 
colnames(test)
train_new <- train
colnames(train)
my_prediction <- predict(my_tree_five, test_new, type = "class")
my_prediction
#prediction based on my_tree_five, then print
my_solution <- data.frame(PassengerId = test$PassengerId, Survived = my_prediction)
write.csv(my_solution, file = "my_solution.csv", row.names = FALSE)
my_solution
#Random Forest
library(readr)
library(dplyr)
#join train and test
all_data <- full_join(train, test)
View(all_data)
#you have created the all_data dataset (891 train rows + 418 test rows = 1309 rows)

# there are some missing values: passengers on rows 62 and 830 have no value for embarkment. Since most passengers embarked at Southampton, we give them the value S.
all_data$Embarked[c(62, 830)] <- "S"
# Factorize embarkment codes.
all_data$Embarked <- factor(all_data$Embarked)

# Passenger on row 1044 has an NA Fare value. Let's replace it with the median fare value.
all_data$Fare[1044] <- median(all_data$Fare, na.rm = TRUE)
# How to fill in missing Age values?
# We predict a passenger's Age from the other variables with a decision tree model.
# This time you give method = "anova" since you are predicting a continuous variable.
library(rpart)
all_data$family_size <- all_data$SibSp + all_data$Parch + 1

predicted_age <- rpart(Age ~ Pclass + Sex + SibSp + Parch + Fare + Embarked + Title + family_size,
                       data = all_data[!is.na(all_data$Age),], method = "anova")
all_data$Age[is.na(all_data$Age)] <- predict(predicted_age, all_data[is.na(all_data$Age),])
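A quick check that the imputation filled every gap (the NA count in the summary should now be gone):
summary(all_data$Age)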
# Split the data back into a train set and a test set
train <- all_data[1:891,]
test <- all_data[892:1309,]
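This positional split relies on full_join() keeping the 891 training rows first. Since only the original test rows lack a Survived value, a more defensive equivalent (a sketch) is:
#split by presence of the label rather than by row position
train <- all_data[!is.na(all_data$Survived), ]
test  <- all_data[is.na(all_data$Survived), ]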
#A Random Forest analysis in R
install.packages("randomForest")
library(randomForest)
# Train set and test set
str(train)
str(test)
class(train)
class(test)
# Set seed for reproducibility
set.seed(111)

# Apply the Random Forest Algorithm
my_forest <- randomForest(as.factor(Survived) ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked, data=train, importance=TRUE,ntree=1000)

# Make your prediction using the test set
my_prediction <- predict(my_forest, test)

# Create a data frame with two columns: PassengerId & Survived. Survived contains your predictions
my_solution <- data.frame(PassengerId = test$PassengerId, Survived = my_prediction)

# Write your solution away to a csv file with the name my_solution.csv
write.csv(my_solution, file = "my_solution.csv", row.names = FALSE)
#Important variables. Your Random Forest object my_forest is still loaded.
#Remember you set importance = TRUE? Now you can see which variables are important using
varImpPlot(my_forest)
#When running the function, two plots appear: the accuracy plot shows how much worse the model would perform without each variable, so a high decrease (a high value on the x-axis) marks a highly predictive variable.
#The second plot shows the mean decrease in the Gini coefficient: the higher a variable scores here, the more important it is for the model.
#Based on the two plots, what variable has the highest impact on the model?
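If you prefer exact numbers to the plots, the same importance scores are available as a table:
#numeric importance scores: MeanDecreaseAccuracy and MeanDecreaseGini per variable
importance(my_forest)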
