타이타닉 생존자 예측 데이터 분석

Kaggle의 타이타닉 데이터를 받아 주피터 노트북에서
파이썬을 이용하여 생존자를 예측하였습니다.

필요한 데이터만 추출하여 전처리를 하였고, 사이킷런의 의사결정 트리를 이용하여 모델을 만들었습니다.

import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Load the Kaggle Titanic test and train splits from the local download folder.
# NOTE(review): these absolute Windows paths only resolve on the author's
# machine — adjust them before re-running elsewhere.
test=pd.read_csv("C:/Users/User/Downloads/titanic/test.csv")
train=pd.read_csv("C:/Users/User/Downloads/titanic/train.csv")
# Preview the first rows of the test split.
test.head()
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S
train.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
train.shape
(891, 21)
test.shape
(418, 11)
test.describe()
PassengerId Pclass Age SibSp Parch Fare
count 418.000000 418.000000 332.000000 418.000000 418.000000 417.000000
mean 1100.500000 2.265550 30.272590 0.447368 0.392344 35.627188
std 120.810458 0.841838 14.181209 0.896760 0.981429 55.907576
min 892.000000 1.000000 0.170000 0.000000 0.000000 0.000000
25% 996.250000 1.000000 21.000000 0.000000 0.000000 7.895800
50% 1100.500000 3.000000 27.000000 0.000000 0.000000 14.454200
75% 1204.750000 3.000000 39.000000 1.000000 0.000000 31.500000
max 1309.000000 3.000000 76.000000 8.000000 9.000000 512.329200
train.describe()
PassengerId Survived Pclass Age SibSp Parch Fare Sex_encode Fare_fillin fares
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208 0.647587 32.843148 0.702581
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429 0.477990 49.657185 0.784817
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000 0.000000 4.012500 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400 0.000000 7.925000 0.000000
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200 1.000000 14.500000 0.000000
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000 1.000000 31.275000 1.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200 1.000000 512.329200 2.000000

data preprocessing

test.isnull().sum()
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
title            0
young&women      0
man              0
Embarked_C       0
Embarked_S       0
Embarked_Q       0
Sex_encode       0
Fare_fillin      1
FamilySize       0
FamilyType       0
Single           0
Nuclear          0
Big              0
fares           63
dtype: int64
def get_title(name):
    """Return the honorific embedded in a passenger name.

    The name is split on ``', '``; the second piece (the text right after the
    surname) is cut at its first ``'.'`` and the part before it is returned.
    Raises ``IndexError`` when the name contains no ``', '`` separator.
    """
    after_surname = name.split(', ')[1]
    title, _, _ = after_surname.partition('.')
    return title

# Derive a "title" column on both splits from the passenger names,
# then list the distinct titles found in each split.
for frame in (train, test):
    frame["title"] = frame["Name"].apply(get_title)
print(train["title"].unique(), test["title"].unique())
['Mr' 'Mrs' 'Miss' 'Master' 'Don' 'Rev' 'Dr' 'Mme' 'Ms' 'Major' 'Lady'
 'Sir' 'Mlle' 'Col' 'Capt' 'the Countess' 'Jonkheer'] ['Mr' 'Mrs' 'Miss' 'Master' 'Ms' 'Col' 'Rev' 'Dr' 'Dona']
# 2x2 grid of Embarked count plots: overall, then split by Sex, Survived and
# Pclass. The column is passed as x= (keyword), like the other countplot calls
# in this notebook; newer seaborn releases no longer accept it positionally.
f, ax = plt.subplots(2, 2, figsize=(20,15))
embarked_panels = (
    (ax[0,0], None, 'No. Of Passengers Boarded'),
    (ax[0,1], 'Sex', 'Male-Female Split for Embarked'),
    (ax[1,0], 'Survived', 'Embarked vs Survived'),
    (ax[1,1], 'Pclass', 'Embarked vs Pclass'),
)
for panel, hue_column, panel_title in embarked_panels:
    # hue=None is seaborn's default, i.e. no split for the first panel.
    sns.countplot(x='Embarked', hue=hue_column, data=train, ax=panel)
    panel.set_title(panel_title)
plt.show()

png

sns.countplot(x="title", hue="Survived",data=train)
<matplotlib.axes._subplots.AxesSubplot at 0x27093396088>

png

  • train 데이터에서 Survived와 각 변수 사이의 상관관계를 먼저 확인한 뒤 test 데이터로 넘어간다
# Titles grouped into the two boolean features used later by the model.
# Both splits are encoded from the same title sets so that train and test
# features are built identically. Per the unique-title listing printed earlier,
# "Dona" only occurs in test, while "Don", "Lady", "the Countess", "Mme" and
# "Mlle" only occur in train, so sharing the sets changes no existing row.
young_or_women_titles = ["Mrs", "Miss", "Master", "Ms", "Lady", "the Countess",
                         "Mme", "Mlle", "Dr", "Dona"]
man_titles = ["Mr", "Col", "Rev", "Don"]

# Each mask is computed from the frame it is assigned to. The original test
# mask OR-ed in train["title"] == "Dr", which aligned train rows onto test by
# index label instead of looking at the test passengers' own titles.
for frame in (train, test):
    frame["young&women"] = frame["title"].isin(young_or_women_titles)
    frame["man"] = frame["title"].isin(man_titles)
print(train.loc[886])
PassengerId                      887
Survived                           0
Pclass                             2
Name           Montvila, Rev. Juozas
Sex                             male
Age                               27
SibSp                              0
Parch                              0
Ticket                        211536
Fare                              13
Cabin                            NaN
Embarked                           S
title                            Rev
young&women                    False
man                             True
Name: 886, dtype: object
train.isnull().sum()
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
title            0
young&women      0
man              0
dtype: int64
sns.countplot(data=train, x="Sex", hue="Survived")
<matplotlib.axes._subplots.AxesSubplot at 0x270951e5d08>

png

pd.pivot_table(train, index="title", values="Survived")
Survived
title
Capt 0.000000
Col 0.500000
Don 0.000000
Dr 0.428571
Jonkheer 0.000000
Lady 1.000000
Major 0.500000
Master 0.575000
Miss 0.697802
Mlle 1.000000
Mme 1.000000
Mr 0.156673
Mrs 0.792000
Ms 1.000000
Rev 0.000000
Sir 1.000000
the Countess 1.000000
  • 여성 승객이 남성 승객보다 생존율이 높다
  • 성별은 생존율에 유의미한 변수이다
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
title          891 non-null object
young&women    891 non-null bool
man            891 non-null bool
dtypes: bool(2), float64(2), int64(5), object(6)
memory usage: 92.4+ KB

EDA 탐색적 자료 분석

sns.countplot(data=train, x="Pclass", hue="Survived")
<matplotlib.axes._subplots.AxesSubplot at 0x270963cd1c8>

png

pd.pivot_table(train, index="Pclass", values="Survived")
Survived
Pclass
1 0.629630
2 0.472826
3 0.242363
  • 3등 객실에 비해 1등 객실의 생존율이 높다
  • 객실 등급은 생존율에 유의미한 변수이다
sns.countplot(data=train, x="Embarked", hue="Survived")
<matplotlib.axes._subplots.AxesSubplot at 0x27096426cc8>

png

pd.pivot_table(train, index="Embarked", values="Survived")
Survived
Embarked
C 0.553571
Q 0.389610
S 0.336957
# Fill missing embarkation ports with "S", then expand Embarked into one
# boolean indicator column per port (C, S, Q) and preview the result.
train["Embarked"] = train["Embarked"].fillna("S")
train_port = train["Embarked"]
train["Embarked_C"] = train_port.eq("C")
train["Embarked_S"] = train_port.eq("S")
train["Embarked_Q"] = train_port.eq("Q")
train[["Embarked", "Embarked_C", "Embarked_S", "Embarked_Q"]].head()
Embarked Embarked_C Embarked_S Embarked_Q
0 S False True False
1 C True False False
2 S False True False
3 S False True False
4 S False True False
train.shape
(891, 21)
train.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked title young&women man Embarked_C Embarked_S Embarked_Q
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S Mr False True False True False
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C Mrs True False True False False
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S Miss True False False True False
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S Mrs True False False True False
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S Mr False True False True False
# Expand the test split's Embarked column into one boolean indicator per
# port (C, S, Q) and preview the result.
test_port = test["Embarked"]
test["Embarked_C"] = test_port.eq("C")
test["Embarked_S"] = test_port.eq("S")
test["Embarked_Q"] = test_port.eq("Q")
test[["Embarked", "Embarked_C", "Embarked_S", "Embarked_Q"]].head()
Embarked Embarked_C Embarked_S Embarked_Q
0 Q False False True
1 S False True False
2 Q False False True
3 S False True False
4 S False True False
sns.lmplot(data=train, x="Age", y="Fare",hue="Survived", fit_reg=False)
<seaborn.axisgrid.FacetGrid at 0x2709668d8c8>

png

# Split train at a fare of 500; high_fare is displayed to inspect the rows
# at or above that threshold.
train_fare = train["Fare"]
low_fare = train[train_fare.lt(500)]
high_fare = train[train_fare.ge(500)]
high_fare
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked title young&women man Embarked_C Embarked_S Embarked_Q
258 259 1 1 Ward, Miss. Anna female 35.0 0 0 PC 17755 512.3292 NaN C Miss True False True False False
679 680 1 1 Cardeza, Mr. Thomas Drake Martinez male 36.0 0 1 PC 17755 512.3292 B51 B53 B55 C Mr False True True False False
737 738 1 1 Lesurer, Mr. Gustave J male 35.0 0 0 PC 17755 512.3292 B101 C Mr False True True False False
sns.lmplot(data=low_fare, x="Age", y="Fare", hue="Survived", fit_reg=False)
<seaborn.axisgrid.FacetGrid at 0x2709653d748>

png

# Encode Sex numerically on both splits: "male" -> 1, "female" -> 0.
# The codes are floats so the new column has the same float dtype as before;
# any other value is left missing.
sex_codes = {"male": 1.0, "female": 0.0}
test["Sex_encode"] = test["Sex"].map(sex_codes)
train["Sex_encode"] = train["Sex"].map(sex_codes)
train[["Sex", "Sex_encode"]].head()
Sex Sex_encode
0 male 1.0
1 female 0.0
2 female 0.0
3 female 0.0
4 male 1.0
test[["Sex", "Sex_encode"]].head()
Sex Sex_encode
0 male 1.0
1 female 0.0
2 male 1.0
3 male 1.0
4 female 0.0
# Start Fare_fillin as a copy of Fare on both splits, then collect the
# third-class fares of each split and print their medians.
for frame in (train, test):
    frame["Fare_fillin"] = frame["Fare"]
third_fare_train = train["Fare"][train["Pclass"].eq(3)]
third_fare_test = test["Fare"][test["Pclass"].eq(3)]
print(third_fare_train.median(), third_fare_test.median())
8.05 7.8958
# Second-class fares of each split, with their medians printed side by side.
is_second_train = train["Pclass"].eq(2)
is_second_test = test["Pclass"].eq(2)
second_fare_train = train.loc[is_second_train, "Fare"]
second_fare_test = test.loc[is_second_test, "Fare"]
print(second_fare_train.median(), second_fare_test.median())
14.25 15.75
# First-class fares of each split, restricted to fares below 300 before the
# medians are printed.
# NOTE(review): the < 300 cut presumably keeps the 512.3292 fares shown in
# high_fare out of the median — confirm that this is the intent.
first_fare_train=train.loc[(train["Pclass"]==1) & (train["Fare"]<300), "Fare"]
first_fare_test=test.loc[(test["Pclass"]==1)&(test["Fare"]<300),  "Fare"]
print(first_fare_train.median(),first_fare_test.median())
57.9792 60.0
# Replace unusable fares (0.0 or missing) in Fare_fillin with the median fare
# of the passenger's own class, using the per-split fare series built above.
# The row filter is "unusable fare AND this class". The original train lines
# used "fare is 0 OR this class", which overwrote every train passenger's fare
# with a class median and left Fare_fillin as little more than a copy of Pclass.
class_fares = (
    (1, first_fare_train, first_fare_test),
    (2, second_fare_train, second_fare_test),
    (3, third_fare_train, third_fare_test),
)
for pclass, class_fare_train, class_fare_test in class_fares:
    for frame, class_fare in ((test, class_fare_test), (train, class_fare_train)):
        unusable = frame["Fare"].eq(0.0) | frame["Fare"].isnull()
        in_class = frame["Pclass"].eq(pclass)
        frame.loc[unusable & in_class, "Fare_fillin"] = class_fare.median()
# Check what is still missing in train after the fill.
train.isnull().sum()
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
title            0
young&women      0
man              0
Embarked_C       0
Embarked_S       0
Embarked_Q       0
Sex_encode       0
Fare_fillin      0
fares            0
FamilySize       0
FamilyType       0
Single           0
Nuclear          0
Big              0
dtype: int64
# Density plot over the Survived and Sex_encode columns of train.
# NOTE(review): both series are passed positionally; newer seaborn releases
# expect x=/y= keywords here — confirm against the installed seaborn version.
sns.kdeplot(train["Survived"], train["Sex_encode"])
<matplotlib.axes._subplots.AxesSubplot at 0x27095236708>

png

# Bucket Fare_fillin into three ordinal bands on both splits:
# 0 for fares below 10, 1 for 10 through 35 inclusive, 2 for fares above 35.
# Rows whose Fare_fillin is missing match none of the bands.
for frame in (train, test):
    filled_fare = frame["Fare_fillin"]
    frame.loc[filled_fare.lt(10), "fares"] = 0
    frame.loc[filled_fare.between(10, 35), "fares"] = 1
    frame.loc[filled_fare.gt(35), "fares"] = 2
# Mean survival per fare band in train.
pd.pivot_table(train, index="fares", values="Survived")
Survived
fares
0.0 0.237052
1.0 0.488764
2.0 0.644550
# Correlation matrix over the numeric and boolean columns only. Text columns
# such as Name or Ticket cannot be correlated, and newer pandas raises on them
# instead of silently dropping them.
train.select_dtypes(include=["number", "bool"]).corr()
PassengerId Survived Pclass Age SibSp Parch Fare young&women man Embarked_C Embarked_S Embarked_Q Sex_encode Fare_fillin fares FamilySize Single Nuclear Big mysex
PassengerId 1.000000 -0.005007 -0.035144 0.036847 -0.057527 -0.001652 0.012658 -0.049744 0.041419 -0.001205 0.022204 -0.033606 0.042939 0.023265 0.021634 -0.040143 0.057462 -0.028976 -0.057055 -0.041419
Survived -0.005007 1.000000 -0.338481 -0.077221 -0.035322 0.081629 0.257307 0.563511 -0.562671 0.168240 -0.149683 0.003650 -0.543351 0.319530 0.356638 0.016639 -0.203367 0.279855 -0.125147 0.562671
Pclass -0.035144 -0.338481 1.000000 -0.369226 0.083081 0.018443 -0.549500 -0.109791 0.127425 -0.243292 0.074053 0.221009 0.131900 -0.917824 -0.979271 0.065997 0.135207 -0.223551 0.152366 -0.127425
Age 0.036847 -0.077221 -0.369226 1.000000 -0.308247 -0.189119 0.096067 -0.256904 0.235802 0.036261 -0.023233 -0.022405 0.093254 0.357485 0.363850 -0.301914 0.198270 -0.083059 -0.226521 -0.235802
SibSp -0.057527 -0.035322 0.083081 -0.308247 1.000000 0.414838 0.159651 0.260895 -0.259057 -0.059528 0.068734 -0.026354 -0.114631 -0.055891 -0.073128 0.890712 -0.584471 0.213225 0.730691 0.259057
Parch -0.001652 0.081629 0.018443 -0.189119 0.414838 1.000000 0.216225 0.345308 -0.341989 -0.011069 0.060814 -0.081228 -0.245489 -0.010963 -0.008291 0.783111 -0.583398 0.265863 0.631523 0.341989
Fare 0.012658 0.257307 -0.549500 0.096067 0.159651 0.216225 1.000000 0.187869 -0.188564 0.269335 -0.162184 -0.117216 -0.182333 0.608117 0.565329 0.217138 -0.271832 0.205527 0.143636 0.188564
young&women -0.049744 0.563511 -0.109791 -0.256904 0.260895 0.345308 0.187869 1.000000 -0.988440 0.070876 -0.112746 0.080495 -0.895928 0.097330 0.127915 0.350823 -0.406498 0.297336 0.233257 0.988440
man 0.041419 -0.562671 0.127425 0.235802 -0.259057 -0.341989 -0.188564 -0.988440 1.000000 -0.071057 0.110586 -0.076811 0.885571 -0.111956 -0.140334 -0.347908 0.405681 -0.298450 -0.229632 -1.000000
Embarked_C -0.001205 0.168240 -0.243292 0.036261 -0.059528 -0.011069 0.269335 0.070876 -0.071057 1.000000 -0.782742 -0.148258 -0.082853 0.299075 0.254504 -0.046215 -0.095298 0.158586 -0.109274 0.071057
Embarked_S 0.022204 -0.149683 0.074053 -0.023233 0.068734 0.060814 -0.162184 -0.112746 0.110586 -0.782742 1.000000 -0.499421 0.119224 -0.154113 -0.087581 0.077359 0.029074 -0.084120 0.099265 -0.110586
Embarked_Q -0.033606 0.003650 0.221009 -0.022405 -0.026354 -0.081228 -0.117216 0.080495 -0.076811 -0.148258 -0.499421 1.000000 -0.074115 -0.171442 -0.215120 -0.058592 0.086464 -0.087093 -0.005620 0.076811
Sex_encode 0.042939 -0.543351 0.131900 0.093254 -0.114631 -0.245489 -0.182333 -0.895928 0.885571 -0.082853 0.119224 -0.074115 1.000000 -0.120790 -0.148251 -0.200988 0.303646 -0.260747 -0.102954 -0.885571
Fare_fillin 0.023265 0.319530 -0.917824 0.357485 -0.055891 -0.010963 0.608117 0.097330 -0.111956 0.299075 -0.154113 -0.171442 -0.120790 1.000000 0.934535 -0.043677 -0.134716 0.198030 -0.106230 0.111956
fares 0.021634 0.356638 -0.979271 0.363850 -0.073128 -0.008291 0.565329 0.127915 -0.140334 0.254504 -0.087581 -0.215120 -0.148251 0.934535 1.000000 -0.054123 -0.153169 0.239358 -0.146981 0.140334
FamilySize -0.040143 0.016639 0.065997 -0.301914 0.890712 0.783111 0.217138 0.350823 -0.347908 -0.046215 0.077359 -0.058592 -0.200988 -0.043677 -0.054123 1.000000 -0.690922 0.278553 0.814901 0.347908
Single 0.057462 -0.203367 0.135207 0.198270 -0.584471 -0.583398 -0.271832 -0.406498 0.405681 -0.095298 0.029074 0.086464 0.303646 -0.134716 -0.153169 -0.690922 1.000000 -0.859931 -0.336825 -0.405681
Nuclear -0.028976 0.279855 -0.223551 -0.083059 0.213225 0.265863 0.205527 0.297336 -0.298450 0.158586 -0.084120 -0.087093 -0.260747 0.198030 0.239358 0.278553 -0.859931 1.000000 -0.190940 0.298450
Big -0.057055 -0.125147 0.152366 -0.226521 0.730691 0.631523 0.143636 0.233257 -0.229632 -0.109274 0.099265 -0.005620 -0.102954 -0.106230 -0.146981 0.814901 -0.336825 -0.190940 1.000000 0.229632
mysex -0.041419 0.562671 -0.127425 -0.235802 0.259057 0.341989 0.188564 0.988440 -1.000000 0.071057 -0.110586 0.076811 -0.885571 0.111956 0.140334 0.347908 -0.405681 0.298450 0.229632 1.000000
# FamilySize is SibSp + Parch + 1 for each train row.
# NOTE(review): the +1 presumably counts the passenger themselves — confirm.
train["FamilySize"] = train["SibSp"] + train["Parch"] + 1
print(train.shape)
(891, 26)
# Same FamilySize definition on the test split: SibSp + Parch + 1.
test["FamilySize"] = test["SibSp"] + test["Parch"] + 1
print(test.shape)
(418, 20)
sns.countplot(data=train, x="FamilySize", hue="Survived")
<matplotlib.axes._subplots.AxesSubplot at 0x270967078c8>

png

# Label each train passenger's family as "Single" (size 1), "Nuclear"
# (larger than 1 and smaller than 5) or "Big" (5 and up), then preview.
train_family_size = train["FamilySize"]
train.loc[train_family_size.eq(1), "FamilyType"] = "Single"
train.loc[train_family_size.gt(1) & train_family_size.lt(5), "FamilyType"] = "Nuclear"
train.loc[train_family_size.ge(5), "FamilyType"] = "Big"
print(train.shape)
train[["FamilySize", "FamilyType"]].head(10)
(891, 23)
FamilySize FamilyType
0 2 Nuclear
1 2 Nuclear
2 1 Single
3 2 Nuclear
4 1 Single
5 1 Single
6 1 Single
7 5 Big
8 3 Nuclear
9 2 Nuclear
# Label each test passenger's family type from the test split's own
# FamilySize. The original built these masks from train["FamilySize"], so a
# test row was labelled after whichever train passenger shared its index label.
test_family_size = test["FamilySize"]
test.loc[test_family_size == 1, "FamilyType"] = "Single"
test.loc[(test_family_size > 1) & (test_family_size < 5), "FamilyType"] = "Nuclear"
test.loc[test_family_size >= 5, "FamilyType"] = "Big"
sns.countplot(data=train, x="FamilyType", hue="Survived")
<matplotlib.axes._subplots.AxesSubplot at 0x270967ac508>

png

pd.pivot_table(data=train, index="FamilyType", values="Survived")
Survived
FamilyType
Big 0.161290
Nuclear 0.578767
Single 0.303538
# One boolean indicator per family type on the train split, using the same
# size thresholds as FamilyType, followed by a preview.
train_family_size = train["FamilySize"]
train["Single"] = train_family_size.eq(1)
train["Nuclear"] = train_family_size.gt(1) & train_family_size.lt(5)
train["Big"] = train_family_size.ge(5)
print(train.shape)
train[["FamilySize", "Single", "Nuclear", "Big"]].head(10)
(891, 26)
FamilySize Single Nuclear Big
0 2 False True False
1 2 False True False
2 1 True False False
3 2 False True False
4 1 True False False
5 1 True False False
6 1 True False False
7 5 False False True
8 3 False True False
9 2 False True False
# One boolean indicator per family type on the test split, using the same
# size thresholds as on train, followed by a preview.
test_family_size = test["FamilySize"]
test["Single"] = test_family_size.eq(1)
test["Nuclear"] = test_family_size.gt(1) & test_family_size.lt(5)
test["Big"] = test_family_size.ge(5)
print(test.shape)
test[["FamilySize", "Single", "Nuclear", "Big"]].head(10)
(418, 24)
FamilySize Single Nuclear Big
0 1 True False False
1 2 False True False
2 1 True False False
3 1 True False False
4 3 False True False
5 1 True False False
6 1 True False False
7 3 False True False
8 1 True False False
9 3 False True False
# Correlation matrix over the numeric and boolean columns only. Text columns
# such as Name or Ticket cannot be correlated, and newer pandas raises on them
# instead of silently dropping them.
train.select_dtypes(include=["number", "bool"]).corr()
PassengerId Survived Pclass Age SibSp Parch Fare young&women man Embarked_C Embarked_S Embarked_Q Sex_encode Fare_fillin fares FamilySize Single Nuclear Big mysex
PassengerId 1.000000 -0.005007 -0.035144 0.036847 -0.057527 -0.001652 0.012658 -0.049744 0.041419 -0.001205 0.022204 -0.033606 0.042939 0.023267 0.021634 -0.040143 0.057462 -0.028976 -0.057055 0.041419
Survived -0.005007 1.000000 -0.338481 -0.077221 -0.035322 0.081629 0.257307 0.563511 -0.562671 0.168240 -0.149683 0.003650 -0.543351 0.317052 0.356638 0.016639 -0.203367 0.279855 -0.125147 -0.562671
Pclass -0.035144 -0.338481 1.000000 -0.369226 0.083081 0.018443 -0.549500 -0.109791 0.127425 -0.243292 0.074053 0.221009 0.131900 -0.912606 -0.979271 0.065997 0.135207 -0.223551 0.152366 0.127425
Age 0.036847 -0.077221 -0.369226 1.000000 -0.308247 -0.189119 0.096067 -0.256904 0.235802 0.036261 -0.023233 -0.022405 0.093254 0.356091 0.363850 -0.301914 0.198270 -0.083059 -0.226521 0.235802
SibSp -0.057527 -0.035322 0.083081 -0.308247 1.000000 0.414838 0.159651 0.260895 -0.259057 -0.059528 0.068734 -0.026354 -0.114631 -0.055006 -0.073128 0.890712 -0.584471 0.213225 0.730691 -0.259057
Parch -0.001652 0.081629 0.018443 -0.189119 0.414838 1.000000 0.216225 0.345308 -0.341989 -0.011069 0.060814 -0.081228 -0.245489 -0.011044 -0.008291 0.783111 -0.583398 0.265863 0.631523 -0.341989
Fare 0.012658 0.257307 -0.549500 0.096067 0.159651 0.216225 1.000000 0.187869 -0.188564 0.269335 -0.162184 -0.117216 -0.182333 0.608175 0.565329 0.217138 -0.271832 0.205527 0.143636 -0.188564
young&women -0.049744 0.563511 -0.109791 -0.256904 0.260895 0.345308 0.187869 1.000000 -0.988440 0.070876 -0.112746 0.080495 -0.895928 0.095763 0.127915 0.350823 -0.406498 0.297336 0.233257 -0.988440
man 0.041419 -0.562671 0.127425 0.235802 -0.259057 -0.341989 -0.188564 -0.988440 1.000000 -0.071057 0.110586 -0.076811 0.885571 -0.110441 -0.140334 -0.347908 0.405681 -0.298450 -0.229632 1.000000
Embarked_C -0.001205 0.168240 -0.243292 0.036261 -0.059528 -0.011069 0.269335 0.070876 -0.071057 1.000000 -0.782742 -0.148258 -0.082853 0.300093 0.254504 -0.046215 -0.095298 0.158586 -0.109274 -0.071057
Embarked_S 0.022204 -0.149683 0.074053 -0.023233 0.068734 0.060814 -0.162184 -0.112746 0.110586 -0.782742 1.000000 -0.499421 0.119224 -0.156471 -0.087581 0.077359 0.029074 -0.084120 0.099265 0.110586
Embarked_Q -0.033606 0.003650 0.221009 -0.022405 -0.026354 -0.081228 -0.117216 0.080495 -0.076811 -0.148258 -0.499421 1.000000 -0.074115 -0.169112 -0.215120 -0.058592 0.086464 -0.087093 -0.005620 -0.076811
Sex_encode 0.042939 -0.543351 0.131900 0.093254 -0.114631 -0.245489 -0.182333 -0.895928 0.885571 -0.082853 0.119224 -0.074115 1.000000 -0.119288 -0.148251 -0.200988 0.303646 -0.260747 -0.102954 0.885571
Fare_fillin 0.023267 0.317052 -0.912606 0.356091 -0.055006 -0.011044 0.608175 0.095763 -0.110441 0.300093 -0.156471 -0.169112 -0.119288 1.000000 0.929100 -0.043112 -0.133553 0.195723 -0.104210 -0.110441
fares 0.021634 0.356638 -0.979271 0.363850 -0.073128 -0.008291 0.565329 0.127915 -0.140334 0.254504 -0.087581 -0.215120 -0.148251 0.929100 1.000000 -0.054123 -0.153169 0.239358 -0.146981 -0.140334
FamilySize -0.040143 0.016639 0.065997 -0.301914 0.890712 0.783111 0.217138 0.350823 -0.347908 -0.046215 0.077359 -0.058592 -0.200988 -0.043112 -0.054123 1.000000 -0.690922 0.278553 0.814901 -0.347908
Single 0.057462 -0.203367 0.135207 0.198270 -0.584471 -0.583398 -0.271832 -0.406498 0.405681 -0.095298 0.029074 0.086464 0.303646 -0.133553 -0.153169 -0.690922 1.000000 -0.859931 -0.336825 0.405681
Nuclear -0.028976 0.279855 -0.223551 -0.083059 0.213225 0.265863 0.205527 0.297336 -0.298450 0.158586 -0.084120 -0.087093 -0.260747 0.195723 0.239358 0.278553 -0.859931 1.000000 -0.190940 -0.298450
Big -0.057055 -0.125147 0.152366 -0.226521 0.730691 0.631523 0.143636 0.233257 -0.229632 -0.109274 0.099265 -0.005620 -0.102954 -0.104210 -0.146981 0.814901 -0.336825 -0.190940 1.000000 -0.229632
mysex 0.041419 -0.562671 0.127425 0.235802 -0.259057 -0.341989 -0.188564 -0.988440 1.000000 -0.071057 0.110586 -0.076811 0.885571 -0.110441 -0.140334 -0.347908 0.405681 -0.298450 -0.229632 1.000000
# Tried a `mysex` feature (1 for young&women, 0 for man), but the score came out lower.

Fitting the model

from sklearn.model_selection import train_test_split
from sklearn import metrics
# Columns of train/test that are fed to the model as input features.
features = ["Pclass", "man","young&women", "fares",
                 "Single","Nuclear"]
features
['Pclass', 'man', 'young&women', 'fares', 'Single', 'Nuclear']
label="Survived"
label
'Survived'
x_train = train[features]
x_train.head()
Pclass man young&women fares Single Nuclear
0 3 True False 0.0 False True
1 1 False True 2.0 False True
2 3 False True 0.0 True False
3 1 False True 2.0 False True
4 3 True False 0.0 True False
x_train.shape
(891, 6)
x_test=test[features]
x_test.head()
Pclass man young&women fares Single Nuclear
0 3 True False 0.0 True False
1 3 False True 0.0 False True
2 2 True False 0.0 True False
3 3 True False 0.0 True False
4 3 False True 1.0 False True
x_test.isnull().sum()
Pclass         0
man            0
young&women    0
fares          0
Single         0
Nuclear        0
dtype: int64
x_test.shape
(418, 6)
y_train=train[label]
y_train.head()
0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64
from sklearn.tree import DecisionTreeClassifier
# Depth-limited decision tree. random_state is pinned so that refitting
# yields the same tree (and therefore the same submission) on every run;
# without it, ties between equally good splits are broken at random.
model = DecisionTreeClassifier(max_depth=5, random_state=0)
model
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')
model.fit(x_train, y_train)
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')
predictions = model.predict(x_test)
predictions.shape
(418,)
predictions
array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1],
      dtype=int64)
submission = pd.read_csv("C:/Users/User/Downloads/titanic/gender_submission.csv", index_col="PassengerId")
submission.head()
Survived
PassengerId
892 0
893 1
894 0
895 0
896 1
submission.shape
(418, 1)
# Store the test-set predictions in the submission frame.
submission["Survived"] = predictions

# Score the model against the real training labels. The original replaced
# y_train with the test predictions and compared them with themselves, which
# always prints 1.0 and destroys the labels for any later cell.
from sklearn.model_selection import cross_val_score

train_predictions = model.predict(x_train)
print(metrics.accuracy_score(y_train, train_predictions))
# Training accuracy is optimistic; 5-fold cross-validation (run on clones of
# the model, so the fitted model is left untouched) gives a held-out estimate.
print(cross_val_score(model, x_train, y_train, cv=5).mean())
1.0
submission.head()
Survived
PassengerId
892 0
893 1
894 0
895 0
896 1
submission.to_csv("C:/Users/User/Downloads/titanic/finish.csv")
import pandas_profiling
submission.profile_report()
Tab(children=(HTML(value='<div id="overview-content" class="row variable spacing">\n    <div class="row">\n   …

Report generated with pandas-profiling.

train.profile_report()
Tab(children=(HTML(value='<div id="overview-content" class="row variable spacing">\n    <div class="row">\n   …

Report generated with pandas-profiling.


Categories:

Updated:

Comments