Logistic regression and SVM classification on the famous Titanic dataset from Kaggle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
Data
I acquired the data from Kaggle; it was collected about the passengers of the famous Titanic. Analyzing it turns up some pretty interesting results.
train = pd.read_csv("train.csv")
train.head(n=10)
PassengerId | Survived | Pclass | Name | Sex | Age
---|---|---|---|---|---
1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0
2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0
3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0
4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0
(remaining rows and columns are left out on purpose for better visualization)
sns.heatmap(train.isna(), cbar=False), train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
Cabin number and age are not present for everyone. I set missing cabin numbers to 'Unknown' instead of NaN, and where age was missing I filled in the mean of the dataset.
# Fill missing ages with the mean, since only a minority (177 of 891) are missing
train['Age'] = train['Age'].fillna(train['Age'].dropna().mean())
# Set NaN cabins to 'Unknown': dropping those rows would lose ~77% of the data
train['Cabin'] = train['Cabin'].fillna('Unknown')
# Drop the few remaining records with NaN (the two missing Embarked values)
train = train.dropna()
sns.heatmap(train.isna(), cbar=False), train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 889 non-null int64
Survived 889 non-null int64
Pclass 889 non-null int64
Name 889 non-null object
Sex 889 non-null object
Age 889 non-null float64
SibSp 889 non-null int64
Parch 889 non-null int64
Ticket 889 non-null object
Fare 889 non-null float64
Cabin 889 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 90.3+ KB
Where data was still missing (the two rows without an Embarked value) I dropped the records that contained NaN.
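Since age varies noticeably with passenger class (see the first boxplot below), a slightly finer strategy would impute each missing age with the mean of that passenger's class instead of the global mean. A sketch of that alternative (not what I did above):
# Hypothetical alternative: per-class mean imputation for Age
train['Age'] = train.groupby('Pclass')['Age'].transform(lambda s: s.fillna(s.mean()))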
fig = plt.figure(figsize=(7, 6))
sns.set(style="ticks")
sns.boxplot(x="Pclass", y="Age", data=train,
whis="range", palette="vlag")
plt.title("Age - Class, mean and standard deviation")
plt.show()
Older passengers mostly bought first-class tickets.
fig = plt.figure(figsize=(7, 6))
sns.set(style="ticks")
sns.boxplot(x="Sex", y="Pclass", data=train,
whis="range", palette="winter")
plt.title("Gender - Class, mean and standard deviation")
plt.show()
Males bought fewer first-class tickets.
fig = plt.figure(figsize=(7, 6))
sns.set(style="ticks")
sns.boxplot(x="Sex", y="Age", data=train,
whis="range", palette="vlag")
plt.title("Age - Gender, mean and standard deviation")
plt.show()
fig = plt.figure(figsize=(7, 6))
sns.set(style="ticks")
sns.boxplot(x="Survived", y="Age", data=train,
whis="range", palette="vlag")
plt.title("Age - Survived, mean and standard deviation")
plt.show()
fig = plt.figure(figsize=(7, 6))
sns.set(style="ticks")
sns.boxplot(x="Sex", y="Fare", data=train,
whis="range", palette="vlag")
plt.title("Fare - Sex, mean and standard deviation")
plt.show()
Females bought more expensive tickets.
# Joking aside: what really stands out is that survivors had paid higher fares.
fig = plt.figure(figsize=(7, 6))
sns.set(style="ticks")
sns.boxplot(x="Survived", y="Fare", data=train,
whis="range", palette="vlag")
plt.title("Fare - Survived, mean and standard deviation")
plt.show()
# Build a deterministic label -> integer mapping (sorted for reproducibility,
# since set iteration order can vary between runs)
embarked = sorted(set(train['Embarked'].values))
embarked = dict(zip(embarked, range(len(embarked))))
embarked
{'C': 0, 'Q': 1, 'S': 2}
train['Embarked'] = train['Embarked'].map(embarked)  # Series.map accepts a dict directly
fig = plt.figure(figsize=(7, 6))
sns.set(style="ticks")
sns.boxplot(x="Embarked", y="Age", data=train,
whis="range", palette="vlag")
plt.title("Age - Embarked location, mean and standard deviation")
plt.show()
# Same deterministic mapping for Sex
gender = sorted(set(train['Sex'].values))
gender = dict(zip(gender, range(len(gender))))
gender
{'female': 0, 'male': 1}
train['Sex'] = train['Sex'].map(gender)
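One caveat with these integer codes: they impose an artificial ordering (C < Q < S) on what is really a nominal category. That is harmless for the two-valued Sex column, but for Embarked a one-hot encoding is the more common choice. A minimal sketch of that alternative (illustrative only, not what this notebook does):
# One-hot encoding avoids implying an order between embarkation ports
raw = pd.Series(['S', 'C', 'Q', 'S'], name='Embarked')
pd.get_dummies(raw, prefix='Emb')  # -> 0/1 indicator columns Emb_C, Emb_Q, Emb_S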
numerical_data = train[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
survived = train['Survived']
from sklearn.preprocessing import normalize
# Note: normalize rescales each sample (row) to unit L2 norm, not each column
numerical_data = normalize(numerical_data)
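Worth noting: sklearn's normalize works row-wise, so a passenger's features are scaled relative to each other rather than to the rest of the dataset. If column-wise scaling is the intent, StandardScaler is the usual tool. A self-contained sketch of the difference:
import numpy as np
from sklearn.preprocessing import normalize, StandardScaler

X = np.array([[1., 10.],
              [2., 40.]])
print(normalize(X))                           # each row rescaled to L2 norm 1
print(np.linalg.norm(normalize(X), axis=1))   # -> [1. 1.]
print(StandardScaler().fit_transform(X))      # each column: zero mean, unit variance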
fig = plt.figure(figsize=(7, 6))
sns.set(style="ticks")
sns.boxplot(x="Embarked", y="Fare", data=train,
            whis="range", palette="vlag")
plt.title("Fare - Embarked location, median and quartiles")
plt.show()
Logistic regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
X_train, X_test, y_train, y_test = train_test_split(numerical_data, survived, test_size=0.2,
random_state=42, shuffle=True)
clf = LogisticRegression().fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_score = clf.decision_function(X_test)  # signed distance to the decision boundary, not a probability
from sklearn.metrics import accuracy_score, auc, roc_curve, classification_report
print("Accuracy score: %.2f%%" % (100.*accuracy_score(y_test, y_pred)))
Accuracy score: 70.79%
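For reference, LogisticRegression's decision_function returns signed distances to the decision boundary, while predict_proba returns the sigmoid of those distances; they rank samples identically, so either works as input to roc_curve. A self-contained sketch:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

Xd, yd = make_classification(n_samples=200, random_state=0)
m = LogisticRegression().fit(Xd, yd)
scores = m.decision_function(Xd)    # real-valued signed distances
probs = m.predict_proba(Xd)[:, 1]   # sigmoid(scores), values in [0, 1]
# Identical ordering of samples -> identical ROC curve and AUC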
fpr, tpr, threshold = roc_curve(y_test, y_score)
plt.figure(figsize=(5.,5.))
lw = 2
plt.plot(fpr, tpr, color='darkorange',lw=lw, label="AUC = %.3f " % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()
from matplotlib import colors as mpl_colors
def cross_validate(model, data, labels, k_fold=5, shuffle=True):
    # Note: this draws k independent random train/test splits, not true k-fold CV
    plt.figure(figsize=(7., 7.))
    lw = 2
    colors = list(mpl_colors.BASE_COLORS.keys())
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    for i in range(k_fold):
        X_train, X_test, y_train, y_test = train_test_split(
            data, labels, test_size=0.2, random_state=42*i+k_fold, shuffle=shuffle)
        clf = model.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        y_score = clf.decision_function(X_test)
        print("Fold %d - accuracy score: %.2f%%" % ((i+1), 100.*accuracy_score(y_test, y_pred)))
        fpr, tpr, threshold = roc_curve(y_test, y_score)
        plt.plot(fpr, tpr, color=colors[i], lw=lw, label="AUC = %.3f " % auc(fpr, tpr))
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")
    plt.show()
    print("*******************************************************")
    print("Classification report for the last fold:")
    print(classification_report(y_test, y_pred, target_names=['not survived', 'survived']))
    print("*******************************************************\n\n\n")
cross_validate(LogisticRegression(), numerical_data, survived) # with shuffle
Fold 1 - accuracy score: 68.54%
Fold 2 - accuracy score: 74.72%
Fold 3 - accuracy score: 68.54%
Fold 4 - accuracy score: 71.91%
Fold 5 - accuracy score: 63.48%
*******************************************************
Classification report for the last fold:
precision recall f1-score support
not survived 0.63 0.81 0.71 99
survived 0.63 0.42 0.50 79
avg / total 0.63 0.63 0.62 178
*******************************************************
cross_validate(LogisticRegression(), numerical_data, survived, shuffle=False) # without shuffle, train_test_split returns the same split every time, so all five "folds" are identical
Fold 1 - accuracy score: 72.47%
Fold 2 - accuracy score: 72.47%
Fold 3 - accuracy score: 72.47%
Fold 4 - accuracy score: 72.47%
Fold 5 - accuracy score: 72.47%
*******************************************************
Classification report for the last fold:
precision recall f1-score support
not survived 0.76 0.84 0.80 115
survived 0.64 0.51 0.57 63
avg / total 0.72 0.72 0.72 178
*******************************************************
SVM
I used several SVM classifiers with different kernels to predict survival.
from sklearn.svm import LinearSVC, SVC, NuSVC
cross_validate(LinearSVC(C=1), numerical_data, survived)
Fold 1 - accuracy score: 68.54%
Fold 2 - accuracy score: 76.97%
Fold 3 - accuracy score: 69.10%
Fold 4 - accuracy score: 70.79%
Fold 5 - accuracy score: 65.73%
*******************************************************
Classification report for the last fold:
precision recall f1-score support
not survived 0.65 0.84 0.73 99
survived 0.68 0.43 0.53 79
avg / total 0.66 0.66 0.64 178
*******************************************************
cross_validate(SVC(kernel='linear', C=1), numerical_data, survived)
Fold 1 - accuracy score: 65.17%
Fold 2 - accuracy score: 73.03%
Fold 3 - accuracy score: 65.73%
Fold 4 - accuracy score: 71.35%
Fold 5 - accuracy score: 61.80%
*******************************************************
Classification report for the last fold:
precision recall f1-score support
not survived 0.62 0.81 0.70 99
survived 0.61 0.38 0.47 79
avg / total 0.62 0.62 0.60 178
*******************************************************
cross_validate(NuSVC(), numerical_data, survived)
Fold 1 - accuracy score: 80.34%
Fold 2 - accuracy score: 78.65%
Fold 3 - accuracy score: 75.28%
Fold 4 - accuracy score: 73.03%
Fold 5 - accuracy score: 75.84%
*******************************************************
Classification report for the last fold:
precision recall f1-score support
not survived 0.77 0.80 0.79 99
survived 0.74 0.71 0.72 79
avg / total 0.76 0.76 0.76 178
*******************************************************
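NuSVC does best of the bunch. It is essentially SVC with an RBF kernel, reparametrized: nu (0.5 by default, as used above) is an upper bound on the fraction of margin errors and a lower bound on the fraction of support vectors, which can be easier to reason about than C. A short sketch with a hypothetical nu value, reusing the X_train/y_train split from earlier:
from sklearn.svm import NuSVC

# nu in (0, 1] bounds margin errors from above, support vectors from below
model = NuSVC(nu=0.3, kernel='rbf', gamma='auto').fit(X_train, y_train)
print("Support vectors: %d of %d training samples"
      % (model.support_vectors_.shape[0], X_train.shape[0]))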
cross_validate(SVC(kernel='poly', degree=3), numerical_data, survived)
Fold 1 - accuracy score: 66.85%
Fold 2 - accuracy score: 66.85%
Fold 3 - accuracy score: 54.49%
Fold 4 - accuracy score: 65.17%
Fold 5 - accuracy score: 55.62%
*******************************************************
Classification report for the last fold:
precision recall f1-score support
not survived 0.56 1.00 0.71 99
survived 0.00 0.00 0.00 79
avg / total 0.31 0.56 0.40 178
*******************************************************
/opt/conda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
'precision', 'predicted', average, warn_for)
The warning above appears because the degree-3 polynomial model predicted "not survived" for every sample in the last fold (recall 1.00 vs. 0.00), which leaves the precision of the "survived" class undefined.
cross_validate(SVC(C=1), numerical_data, survived) # the RBF kernel is SVC's default
Fold 1 - accuracy score: 65.73%
Fold 2 - accuracy score: 74.16%
Fold 3 - accuracy score: 65.73%
Fold 4 - accuracy score: 71.35%
Fold 5 - accuracy score: 61.80%
*******************************************************
Classification report for the last fold:
precision recall f1-score support
not survived 0.62 0.81 0.70 99
survived 0.61 0.38 0.47 79
avg / total 0.62 0.62 0.60 178
*******************************************************
Most important features
from sklearn.feature_selection import RFECV
X_train, X_test, y_train, y_test = train_test_split(
numerical_data, survived, test_size=0.2,
random_state=42, shuffle=True)
svc = SVC(kernel="linear")
# The "accuracy" scoring is proportional to the number of correct
# classifications
rfecv = RFECV(estimator=svc, step=1, cv=5,
scoring='accuracy')
rfecv.fit(X_train, y_train)
print("Optimal number of features : %d" % rfecv.n_features_)
# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
Optimal number of features : 4
This means RFECV keeps only four of the seven features; the eliminated ones, such as where passengers embarked the ship, apparently do not matter and only lower the accuracy.
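Rather than guessing which features were eliminated, the fitted selector reports it directly through support_ and ranking_; a quick check on the rfecv object above:
# support_ is True for kept features; ranking_ is 1 for every kept feature
feature_names = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
for name, kept, rank in zip(feature_names, rfecv.support_, rfecv.ranking_):
    print("%-8s kept=%-5s rank=%d" % (name, kept, rank))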
Validation
def svm_behaviour_check(data, labels, C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                        decision_function_shape='ovr', degree=3, gamma='auto',
                        max_iter=-1, probability=False, random_state=None, shrinking=True,
                        tol=0.001, verbose=False):
    lw = 2
    # Split 70% train, 15% test, 15% validation
    X_train, X_test, y_train, y_test = train_test_split(
        data, labels, test_size=0.3, random_state=42, shuffle=True)
    X_test, X_valid, y_test, y_valid = train_test_split(
        X_test, y_test, test_size=0.5, random_state=137, shuffle=True)
    clf = SVC(C=C, cache_size=cache_size, class_weight=class_weight, coef0=coef0,
              decision_function_shape=decision_function_shape, degree=degree, gamma=gamma,
              kernel="linear", max_iter=max_iter, probability=probability,
              random_state=random_state, shrinking=shrinking,
              tol=tol, verbose=verbose).fit(X_train, y_train)
    test_pred = clf.predict(X_test)
    valid_pred = clf.predict(X_valid)
    test_score = clf.decision_function(X_test)
    valid_score = clf.decision_function(X_valid)
    plt.figure(figsize=(8, 8))
    # Test-set ROC curve
    plt.subplot(221)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.title("Accuracy score: %.2f%%" % (100.*accuracy_score(y_test, test_pred)))
    fpr, tpr, threshold = roc_curve(y_test, test_score)
    plt.plot(fpr, tpr, color='g', lw=lw, label="AUC = %.3f " % auc(fpr, tpr))
    # Validation-set ROC curve
    plt.subplot(222)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.title("Accuracy score: %.2f%%" % (100.*accuracy_score(y_valid, valid_pred)))
    fpr, tpr, threshold = roc_curve(y_valid, valid_score)
    plt.plot(fpr, tpr, color='r', lw=lw, label="AUC = %.3f " % auc(fpr, tpr))
    # Feature-selection scores on the test set
    rfecv = RFECV(estimator=clf, step=1, cv=5, scoring='accuracy')
    rfecv.fit(X_test, y_test)
    plt.subplot(223)
    plt.title("Test - Optimal number of features : %d" % rfecv.n_features_)
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    # Feature-selection scores on the validation set
    rfecv = RFECV(estimator=clf, step=1, cv=5, scoring='accuracy')
    rfecv.fit(X_valid, y_valid)
    plt.subplot(224)
    plt.title("Validation - Optimal number of features : %d" % rfecv.n_features_)
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
svm_behaviour_check(numerical_data, survived, C=25.6, gamma=1.6, shrinking=False, coef0=42)
svm_behaviour_check(numerical_data, survived, C=25.6, gamma=1.6, shrinking=False, coef0=1378)
svm_behaviour_check(numerical_data, survived, C=61166.6, gamma=1.6, shrinking=False, coef0=42)
svm_behaviour_check(numerical_data, survived, C=25.6, gamma=1379, shrinking=False, coef0=42)
svm_behaviour_check(numerical_data, survived, C=25.6, gamma=1.6, shrinking=True, coef0=42)
Some parameter settings change the results considerably, but most runs look the same. Part of the reason is that the function above hardcodes kernel="linear", and the linear kernel ignores gamma and coef0 entirely, so only C really matters here. I only tried a few parameter combinations, so I can't say much more than that. The cross-validated feature-selection scores, however, are quite poor compared to the classification results.
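Sweeping parameters by hand like this quickly becomes tedious; sklearn's GridSearchCV automates the search with proper cross-validation. A closing sketch over the same numerical_data and survived arrays (the grid values are illustrative, not tuned):
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [0.01, 0.1, 1, 10]}
search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5, scoring='accuracy')
search.fit(numerical_data, survived)
print("Best parameters:", search.best_params_)
print("Best CV accuracy: %.2f%%" % (100.*search.best_score_))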