1.Introduction

### Import Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from collections import Counter

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve

### seaborn plot ๊ธฐ๋ณธ ์„ค์ •
sns.set(style = 'white', context = 'notebook', palette = 'deep')

2. ๋ฐ์ดํ„ฐ ๋กœ๋”ฉ/ ํ™•์ธ

2-1. ๋ฐ์ดํ„ฐ ๋ถˆ๋Ÿฌ์˜ค๊ธฐ

train = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/ECC 48แ„€แ…ต แ„ƒแ…ฆแ„€แ…ชB/1แ„Œแ…ฎแ„Žแ…ก/data/train.csv")
test = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/ECC 48แ„€แ…ต แ„ƒแ…ฆแ„€แ…ชB/1แ„Œแ…ฎแ„Žแ…ก/data/test.csv")
IDtest = test["PassengerId"]
train.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
test.head()
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S

2-2. ์ด์ƒ์น˜(outlier) ํƒ์ง€

  • IQR(์‚ฌ๋ถ„์œ„ ๋ฒ”์œ„) ๋ฐฉ๋ฒ•

  • Tukey Method

  • IQR = Q3(75%) - Q1(25%)

  • [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] ๋ฒ”์œ„๋ฅผ ๋ฒ—์–ด๋‚˜๋Š” ๋ฐ์ดํ„ฐ ํฌ์ธํŠธ๋ฅผ ์ด์ƒ์น˜๋กœ ๊ฐ„์ฃผ

### ์ด์ƒ์น˜ ํƒ์ง€๋ฅผ ์œ„ํ•œ ํ•จ์ˆ˜

def detect_outliers(df, n, features):
    """Return the indices of rows that contain more than n outliers.

    An outlier in a column is any value outside the Tukey fences
    [Q1 - 1.5*IQR, Q3 + 1.5*IQR].

    Parameters
    ----------
    df : pandas.DataFrame
        Data to scan.
    n : int
        A row is reported only when it is flagged as an outlier in
        strictly more than n of the given columns.
    features : iterable of str
        Numerical column names to examine.

    Returns
    -------
    list
        Row indices flagged in more than n columns.
    """
    flagged = []  # every flagged row index, with one entry per column hit
    for feature in features:
        # np.percentile (not Series.quantile) so NaN-containing columns
        # propagate NaN and yield no outliers, as in the original method.
        first_q = np.percentile(df[feature], 25)
        third_q = np.percentile(df[feature], 75)
        fence = 1.5 * (third_q - first_q)  # Tukey step = 1.5 * IQR

        # Rows whose value falls outside the fences for this column.
        outside = (df[feature] < first_q - fence) | (df[feature] > third_q + fence)
        flagged.extend(df[outside].index)

    # Keep only the rows flagged in more than n columns.
    hit_counts = Counter(flagged)
    return [idx for idx, hits in hit_counts.items() if hits > n]
# ์ˆ˜์น˜ํ˜• ๋ณ€์ˆ˜๋“ค(Age, SibSp, Parch, Fare)์—์„œ ์ด์ƒ์น˜ ํƒ์ง€

Outliers_to_drop = detect_outliers(train,2,["Age","SibSp","Parch","Fare"])
  • ์ด์ƒ์น˜๋Š” ์˜ˆ์ธก์— ๊ทน์ ์ธ ์˜ํ–ฅ์„ ๋ฏธ์น  ์ˆ˜ ์žˆ์Œ (ํŠนํžˆ ํšŒ๊ท€ ๋ฌธ์ œ)

    • ์ด์— ๋Œ€ํ•œ ์ „์ฒ˜๋ฆฌ
  • Tukey ๋ฐฉ๋ฒ•์„ ํ™œ์šฉํ•˜์—ฌ ์ด์ƒ์น˜ ํƒ์ง€

  • ์ˆ˜์น˜ํ˜• ๋ณ€์ˆ˜(Age, SibSp, Sarch and Fare)์—์„œ ์ด์ƒ์น˜๋ฅผ ํƒ์ง€

    • ์ตœ์†Œ ๋‘ ๊ฐœ์˜ ๋Œ์ถœ๋œ ์ˆซ์ž ๊ฐ’์„ ์ด์ƒ์น˜๋ฅผ ๊ฐ€์ง„ ํ–‰์œผ๋กœ ๊ฐ„์ฃผ
# ์ด์ƒ์น˜๊ฐ€ ์žˆ๋Š” ํ–‰ ํ‘œ์‹œ

train.loc[Outliers_to_drop]
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
27 28 0 1 Fortune, Mr. Charles Alexander male 19.0 3 2 19950 263.00 C23 C25 C27 S
88 89 1 1 Fortune, Miss. Mabel Helen female 23.0 3 2 19950 263.00 C23 C25 C27 S
159 160 0 3 Sage, Master. Thomas Henry male NaN 8 2 CA. 2343 69.55 NaN S
180 181 0 3 Sage, Miss. Constance Gladys female NaN 8 2 CA. 2343 69.55 NaN S
201 202 0 3 Sage, Mr. Frederick male NaN 8 2 CA. 2343 69.55 NaN S
324 325 0 3 Sage, Mr. George John Jr male NaN 8 2 CA. 2343 69.55 NaN S
341 342 1 1 Fortune, Miss. Alice Elizabeth female 24.0 3 2 19950 263.00 C23 C25 C27 S
792 793 0 3 Sage, Miss. Stella Anna female NaN 8 2 CA. 2343 69.55 NaN S
846 847 0 3 Sage, Mr. Douglas Bullen male NaN 8 2 CA. 2343 69.55 NaN S
863 864 0 3 Sage, Miss. Dorothy Edith "Dolly" female NaN 8 2 CA. 2343 69.55 NaN S
  • 10๊ฐœ์˜ ์ด์ƒ์น˜๋ฅผ ํƒ์ง€

    • 28, 89, 342๋ฒˆ ์Šน๊ฐ์˜ ๊ฒฝ์šฐ ๋งค์šฐ ๋†’์€ Ticket,Fare๋ฅผ ๊ฐ€์ง€๊ณ  ์žˆ์Œ

    • ๋‚˜๋จธ์ง€ 7๋ช…์˜ ๊ฒฝ์šฐ SibSp ๊ฐ’์ด ๋งค์šฐ ๋†’์Œ

### ์ด์ƒ์น˜ ์ œ๊ฑฐ

train = train.drop(Outliers_to_drop, axis = 0).reset_index(drop = True) 
# ์ด์ƒ์น˜๊ฐ€ ์žˆ๋Š” ํ–‰์„ ๋‚ ๋ฆฐ ํ›„ ์ธ๋ฑ์Šค๋ฅผ ๋‹ค์‹œ ๋ถ€์—ฌ

2-3. Train + Test set

### train, test set ๊ฒฐํ•ฉ
# ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ ๋ณ€ํ™˜ ์‹œ ๋™์ผํ•œ ์ˆ˜์˜ feature๋ฅผ ์–ป๊ธฐ ์œ„ํ•ด์„œ

train_len = len(train)
dataset =  pd.concat(objs = [train, test], axis = 0).reset_index(drop = True) # ํ–‰์„ ๊ธฐ์ค€์œผ๋กœ ๊ฒฐํ•ฉ

2-4. Null๊ฐ’, ๊ฒฐ์ธก์น˜(Missing value) ํ™•์ธ

# ๋น„์–ด์žˆ๊ฑฐ๋‚˜ NaN์ธ ๊ฐ’๋“ค์„ NaN์œผ๋กœ ์ฑ„์›€
dataset = dataset.fillna(np.nan)

# ๋‚จ์€ null๊ฐ’ ํ™•์ธ
dataset.isnull().sum()
PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             256
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1007
Embarked          2
dtype: int64
  • Age์™€ Cabin์— ๋งค์šฐ ๋งŽ์€ ๊ฒฐ์ธก์น˜ ์กด์žฌ

  • ํ…Œ์ŠคํŠธ ๋ฐ์ดํ„ฐ ๊ฒฐํ•ฉ ์‹œ์˜ Survived์˜ ๊ฒฐ์ธก์น˜๋Š” test set์— ์กด์žฌ x

    • concat ์‹œ NaN ๊ฐ’์„ ๋Œ€์ฒดํ•ด์„œ
### ์ •๋ณด
train.info()
print()
train.isnull().sum()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 881 entries, 0 to 880
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  881 non-null    int64  
 1   Survived     881 non-null    int64  
 2   Pclass       881 non-null    int64  
 3   Name         881 non-null    object 
 4   Sex          881 non-null    object 
 5   Age          711 non-null    float64
 6   SibSp        881 non-null    int64  
 7   Parch        881 non-null    int64  
 8   Ticket       881 non-null    object 
 9   Fare         881 non-null    float64
 10  Cabin        201 non-null    object 
 11  Embarked     879 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 82.7+ KB

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            170
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          680
Embarked         2
dtype: int64
### ๋ฐ์ดํ„ฐ ํ™•์ธ

train.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
### ๋ฐ์ดํ„ฐํ˜• ํ™•์ธ
train.dtypes
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
### ๋ฐ์ดํ„ฐ ์š”์•ฝ
# ํ†ต๊ณ„๋Ÿ‰ ์ œ๊ณต

train.describe()
PassengerId Survived Pclass Age SibSp Parch Fare
count 881.000000 881.000000 881.000000 711.000000 881.000000 881.000000 881.000000
mean 446.713961 0.385925 2.307605 29.731603 0.455165 0.363224 31.121566
std 256.617021 0.487090 0.835055 14.547835 0.871571 0.791839 47.996249
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 226.000000 0.000000 2.000000 20.250000 0.000000 0.000000 7.895800
50% 448.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.000000 1.000000 3.000000 38.000000 1.000000 0.000000 30.500000
max 891.000000 1.000000 3.000000 80.000000 5.000000 6.000000 512.329200

3. ๋ณ€์ˆ˜(feature) ๋ถ„์„

3-1. ์ˆ˜์น˜ํ˜•(numerical) ๋ณ€์ˆ˜

### ์ƒ๊ด€๊ณ„์ˆ˜ ํ–‰๋ ฌ
# ์ˆ˜์น˜ํ˜• ๋ณ€์ˆ˜๋“ค๊ณผ Survived ๊ฐ„์˜ ์ƒ๊ด€๊ณ„์ˆ˜

g = sns.heatmap(train[["Survived","SibSp","Parch",
                       "Age","Fare"]].corr(),annot = True, fmt = ".2f", cmap = "coolwarm")

  • ์˜ค์ง Fare ๋ณ€์ˆ˜๋งŒ์ด Survived ๋ณ€์ˆ˜์™€ ์œ ์˜ํ•œ ์ƒ๊ด€๊ด€๊ณ„๋ฅผ ๋ณด์ž„

    • ๋‹ค๋ฅธ feature๋“ค์ด ์ค‘์š”ํ•˜์ง€ ์•Š๋‹ค๋Š” ์˜๋ฏธ๋Š” x

    • ํ•ด๋‹น feature๋“ค์˜ ํŒŒ์ƒ ๋ณ€์ˆ˜๋“ค์ด Survived์™€ ์ƒ๊ด€์„ฑ์„ ๊ฐ€์งˆ ์ˆ˜ ์žˆ์Œ

  • seaborn version issue๋กœ ์ธํ•ด ์ผ๋ถ€ ํ•จ์ˆ˜ ๋ณ€๊ฒฝ

(factorplot -> pointplot, catplot)

(distplot -> histplot)

โˆŽ SibSp

g = sns.catplot(x = "SibSp",y = "Survived",data = train,
                   kind = "bar", palette = "muted")
g.despine(left = True)
g = g.set_ylabels("survival probability")

  • ํ˜•์ œ/๋ฐฐ์šฐ์ž๊ฐ€ ๋งŽ์„์ˆ˜๋ก ์ƒ์กด๋ฅ ์ด ๋‚ฎ์€ ๊ฒƒ์œผ๋กœ ํŒ๋‹จ๋จ

    • ํ˜ผ์ž์ธ ์Šน๊ฐ(SibSp = 0)์ด๋‚˜ ํ•œ, ๋‘๋ช…์˜ ์‚ฌ๋žŒ์ด ์žˆ๋Š” ๊ฒฝ์šฐ(SibSp = 1 or 2)๊ฐ€ ์ƒ์กด๋ฅ ์ด ๋” ๋†’์€ ๊ฒฝํ–ฅ์„ ๋ณด์ž„

โˆŽ Parch

g = sns.catplot(x = "Parch",y = "Survived",data = train,
                   kind = "bar", palette = "muted")
g.despine(left = True)
g = g.set_ylabels("survival probability")

  • ์†Œ๊ฐ€์กฑ์ด ๋…์‹  ๊ฐ€์กฑ(Parch = 0)์ด๋‚˜ ์ค‘์†Œ ๊ฐ€์กฑ(Parch = 3,4), ๋Œ€๊ฐ€์กฑ(Parch = 5,6)๋ณด๋‹ค ์ƒ์กด๋ฅ ์ด ๋†’์Œ

  • ๋ถ€๋ชจ/์ž๋…€๊ฐ€ 3๋ช…์ธ ์Šน๊ฐ์˜ ์ƒ์กด๋ฅ ์— ์ค‘์š”ํ•œ ํ‘œ์ค€ ํŽธ์ฐจ๊ฐ€ ์žˆ์Œ

โˆŽ Age

g = sns.FacetGrid(train, col = 'Survived')
g = g.map(sns.histplot, "Age", kde = True)

  • ๊ฐ€์šฐ์Šค ๋ถ„ํฌ์ผ ์ˆ˜๋„ ์žˆ๋Š” ์•ฝ๊ฐ„์€ ์™œ๊ณก๋œ(tailed) ๋ถ„ํฌ๋ฅผ ๋ณด์ž„

  • ์ƒ์กดํ•œ ์ง‘๋‹จ๊ณผ ์ƒ์กดํ•˜์ง€ ๋ชปํ•œ ์ง‘๋‹จ์—์„œ์˜ ์—ฐ๋ น๋Œ€ ๋ถ„ํฌ ์–‘์ƒ์ด ๋‹ค๋ฆ„

    • ์ƒ์กดํ•œ ์ง‘๋‹จ ์ค‘ ์ Š์€ ์‚ฌ๋žŒ๋“ค์˜ ๋น„์œจ์ด ๋†’์Œ

    • 60 ~ 80์„ธ ์‚ฌ์ด์˜ ์Šน๊ฐ๋“ค์˜ ์ƒ์กด๋ฅ ์ด ๋‚ฎ์Œ

  • Age์™€ Survived์˜ ์ƒ๊ด€๊ณ„์ˆ˜๊ฐ€ ๋‚ฎ๋”๋ผ๋„, ์ƒ์กด ๊ฐ€๋Šฅ์„ฑ์ด ๋†’์€ ์—ฐ๋ น๋Œ€๊ฐ€ ์กด์žฌํ•จ์„ ์ง์ž‘ํ•  ์ˆ˜ ์žˆ์Œ

โœ” ์—ฌ๋‹ด์œผ๋กœ, ์ƒ๊ด€๊ณ„์ˆ˜ ํ–‰๋ ฌ์—์„œ์˜ ์ƒ๊ด€ ๊ณ„์ˆ˜(default = Pearson ์ƒ๊ด€๊ณ„์ˆ˜)๋Š” ๋‘ ๋ณ€์ˆ˜ ๊ฐ„์˜ ์„ ํ˜•์  ๊ด€๊ณ„๋ฅผ ๋‚˜ํƒ€๋‚ด๋Š” ์ธก๋„์ด๊ธฐ์—, ์šฐ๋ฆฌ๊ฐ€ ๋ชจ๋ฅด๋Š” ๋‹ค๋ฅธ ๊ด€๊ณ„๊ฐ€ ์กด์žฌํ•ญ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค..!

### ์—ฐ๋ น๋Œ€ ๋ถ„ํฌ ์‹œ๊ฐํ™”

g = sns.kdeplot(train["Age"][(train["Survived"] == 0) & (train["Age"].notnull())], 
                color = "Red", shade = True)
g = sns.kdeplot(train["Age"][(train["Survived"] == 1) & (train["Age"].notnull())], 
                ax = g, color = "Blue", shade = True)
g.set_xlabel("Age")
g.set_ylabel("Frequency")
g = g.legend(["Not Survived","Survived"])

image.png

  • ๋‘ ๋ฐ€๋„ํ•จ์ˆ˜๋ฅผ ์ค‘์ฒฉํ•˜์—ฌ ํŒŒ์•…ํ•˜๋ฉด, ์•„๊ธฐ๋“ค๊ณผ ๋งค์šฐ ์–ด๋ฆฐ ์•„์ด๋“ค์— ๋Œ€์‘ํ•˜๋Š” peak๋ฅผ ๋ณผ ์ˆ˜ ์žˆ์Œ(Peak 0 ~ 5)

โˆŽ Fare

### ๊ฒฐ์ธก์น˜ ํ™•์ธ

dataset["Fare"].isnull().sum()
1
### ๊ฒฐ์ธก์น˜ -> ์ค‘๊ฐ„๊ฐ’
# ์˜ˆ์ธก์— ์ค‘์š”ํ•œ ์˜ํ–ฅ์„ ๋ฏธ์น˜์ง€ ์•Š์„ ๊ฒƒ์ด๋ผ๊ณ  ํŒ๋‹จ๋˜๋Š” ์ค‘์•™๊ฐ’ ์ฑ„ํƒ

dataset["Fare"] = dataset["Fare"].fillna(dataset["Fare"].median())
g = sns.histplot(dataset["Fare"], color="m", kde = True,
                 label = "Skewness : %.2f"%(dataset["Fare"].skew()))
g = g.legend(loc = "best")

  • Fare ๋ณ€์ˆ˜๋Š” ๋งค์šฐ ์™œ๊ณก๋œ ๋ถ„ํฌ๋ฅผ ๋„๊ณ  ์žˆ์Œ

    • Scale์ด ์กฐ์ •๋œ๋‹ค ํ•ด๋„ ๋ชจํ˜•์˜ ๊ฐ€์ค‘์น˜๊ฐ€ ๋งค์šฐ ๋†’์•„์ง€๋Š” ๋ฌธ์ œ๊ฐ€ ๋ฐœ์ƒํ•  ์ˆ˜ ์žˆ์Œ

    • ๋กœ๊ทธ ๋ณ€ํ™˜(log transformation)์„ ํ†ตํ•ด ์™œ๊ณก์„ ์ค„์ด๋Š” ๊ฒƒ์ด ๊ถŒ์žฅ๋จ

### ๋กœ๊ทธ ๋ณ€ํ™˜

dataset["Fare"] = dataset["Fare"].map(lambda i: np.log(i) if i > 0 else 0)
### ๋กœ๊ทธ ๋ณ€ํ™˜ ํ›„ ์‹œ๊ฐํ™”

g = sns.histplot(dataset["Fare"], color="b", kde = True,
                 label="Skewness : %.2f"%(dataset["Fare"].skew()))
g = g.legend(loc="best")

  • ๋กœ๊ทธ ๋ณ€ํ™˜์ด ์™œ๊ณก ์ •๋„๋ฅผ ๊ต‰์žฅํžˆ ๋งŽ์ด ๊ฐ์†Œ์‹œํ‚ด

3-2. ๋ฒ”์ฃผํ˜•(categorical) ๋ณ€์ˆ˜

โˆŽ Sex

g = sns.barplot(x = "Sex",y = "Survived",data = train)
g = g.set_ylabel("Survival Probability")

train[["Sex","Survived"]].groupby('Sex').mean()
Survived
Sex
female 0.747573
male 0.190559
  • ๋‚จ์„ฑ์ด ์—ฌ์„ฑ๋ณด๋‹ค ์ƒ์กด๋ฅ ์ด ํ˜„์ €ํ•˜๊ฒŒ ๋‚ฎ์Œ

    • Sex๋Š” ์ƒ์กด ์—ฌ๋ถ€๋ฅผ ์˜ˆ์ธกํ•˜๋Š” ๋ฐ ์ค‘์š”ํ•œ ์˜ํ–ฅ์„ ํ•  ์ˆ˜ ์žˆ์Œ

โˆŽ PClass

### ์‹œ๊ฐํ™”

g = sns.catplot(x = "Pclass",y = "Survived",data = train, 
                kind = "bar", palette = "muted")
g.despine(left = True)
g = g.set_ylabels("survival probability")

g = sns.catplot(x = "Pclass", y = "Survived", hue = "Sex", 
                   data = train, kind = "bar", palette = "muted")
g.despine(left = True)
g = g.set_ylabels("survival probability")

  • 3๊ฐœ์˜ ํด๋ž˜์Šค์— ๋Œ€ํ•ด ๊ฐ๊ฐ์˜ ํด๋ž˜์Šค์—์„œ ์ƒ์กด๋ฅ ์ด ๋™์ผํ•˜์ง€๋Š” x

    • PClass = 1์ธ ์Šน๊ฐ๋“ค์€ PClass = 2, 3์ธ ์Šน๊ฐ๋“ค์— ๋น„ํ•ด ์ƒ์กด๋ฅ ์ด ๋†’์Œ

    • ๋‚จ๋…€ ์ƒ๊ด€ x

โˆŽ Embarked

### ๊ฒฐ์ธก์น˜ ํ™•์ธ

dataset["Embarked"].isnull().sum()
2
### ๊ฒฐ์ธก์น˜ ์ฒ˜๋ฆฌ
# ๊ฐ€์žฅ ๋งŽ์€ 'S'๋กœ ๋Œ€์ฒด

dataset["Embarked"] = dataset["Embarked"].fillna("S")
### ์‹œ๊ฐํ™”

g = sns.catplot(x = "Embarked", y = "Survived", data = train,
                 kind = "bar", palette = "muted")
g.despine(left = True)
g = g.set_ylabels("survival probability")

  • Cherbourg(C)์—์„œ ์˜ค๋Š” ์Šน๊ฐ๋“ค์˜ ์ƒ์กด๋ฅ ์ด ๋” ๋†’์€ ๊ฒฝํ–ฅ์„ ๋ณด์ž„

    • ์ผ๋“ฑ์„ ์Šน๊ฐ์˜ ๋น„์œจ์ด C์—์„œ ๋†’์„๊นŒ?
### PClass vs Embarked

g = sns.catplot(x = "Pclass", col = "Embarked", data = train,
                kind = "count", palette = "muted")
g.despine(left = True)
g = g.set_ylabels("Count")

  • ์‹ค์ œ๋กœ PClass = 3์€ ์‚ฌ์šฐ์ƒ˜ํ”„ํ„ด(S)๊ณผ ํ€ธ์Šคํƒ€์šด(Q)์—์„œ ์˜ค๋Š” ์Šน๊ฐ๋“ค์—๊ฒŒ ๊ฐ€์žฅ ๋นˆ๋ฒˆํ•œ ๋ฐ˜๋ฉด, ์…ฐ๋ฅด๋ถ€๋ฅด(C)์˜ ์Šน๊ฐ๋“ค์€ ๋Œ€๋ถ€๋ถ„ ์ƒ์กด์œจ์ด ๊ฐ€์žฅ ๋†’์€ PClass = 1์ž„

4. ๊ฒฐ์ธก์น˜ ์ฒ˜๋ฆฌ

4-1. Age

  • Age column์—๋Š” 256๊ฐœ์˜ ๊ฒฐ์ธก๊ฐ’์ด ํฌํ•จ๋˜์–ด ์žˆ์Œ

  • ์ƒ์กด ๊ธฐํšŒ๊ฐ€ ๋” ๋งŽ์€ ํ•˜์œ„ ์ง‘๋‹จ(ex> ์–ด๋ฆฐ์ด)์ด ์žˆ์Œ

    • Age feature๋ฅผ ์œ ์ง€ํ•˜๊ณ  ๋ˆ„๋ฝ๋œ ๊ฐ’์„ ์ฒ˜๋ฆฌํ•˜๋Š” ๊ฒƒ์ด ๋ฐ”๋žŒ์ง

    • Age์™€ ์ƒ๊ด€๊ด€๊ณ„๊ฐ€ ์žˆ๋Š” ๋ณ€์ˆ˜๋“ค(Sex, Parch, Pclass, SibSP) ๊ด€์ฐฐ

g = sns.catplot(y = "Age",x = "Sex",data = dataset, kind = "box")
g = sns.catplot(y = "Age", x = "Sex",hue="Pclass", data=dataset,kind="box")
g = sns.catplot(y="Age",x="Parch", data=dataset,kind="box")
g = sns.catplot(y="Age",x="SibSp", data=dataset,kind="box")

  • ์—ฐ๋ น ๋ถ„ํฌ๋Š” ๋‚จ์„ฑ๊ณผ ์—ฌ์„ฑ ํ•˜์œ„ ๋ชจ์ง‘๋‹จ์—์„œ ๋™์ผํ•œ ๊ฒƒ์œผ๋กœ ๋ณด์ž„

    • ์„ฑ๋ณ„์€ ์—ฐ๋ น์„ ์˜ˆ์ธกํ•˜๋Š” ๋ฐ ์œ ์šฉํ•˜์ง€ x
  • PClass = 1์— ๋‚˜์ด๊ฐ€ ๋งŽ์€ ์Šน๊ฐ๋“ค์ด ๋งŽ์Œ

    • ๋ถ€๋ชจ/์ž๋…€ ์ˆ˜๊ฐ€ ๋งŽ์„์ˆ˜๋ก ๋‚˜์ด๊ฐ€ ๋งŽ๊ณ , ํ˜•์ œ/๋ฐฐ์šฐ์ž๊ฐ€ ๋งŽ์„์ˆ˜๋ก ๋‚˜์ด๊ฐ€ ์–ด๋ ค์ง€๋Š” ๊ฒฝํ–ฅ์ด ์žˆ์Œ
### Sex(๋ฒ”์ฃผํ˜•) -> ์ˆ˜์น˜ํ˜•

dataset["Sex"] = dataset["Sex"].map({"male": 0, "female":1})
### ์ƒ๊ด€๊ณ„์ˆ˜ heatmap

g = sns.heatmap(dataset[["Age","Sex","SibSp","Parch","Pclass"]].corr(),
                cmap = "BrBG",annot = True)

  • Parch๋ฅผ ์ œ์™ธํ•œ feature๋“ค ๊ฐ„์˜ ์ƒ๊ด€์„ฑ์„ ์‹œ๊ฐ์ ์œผ๋กœ ์ œ์‹œ

  • ๋‚˜์ด๋Š” Sex์™€๋Š” ์ƒ๊ด€๊ด€๊ณ„๊ฐ€ x

    • PClass, Parch, SibSp์™€๋Š” ์Œ์˜ ์ƒ๊ด€๊ด€๊ณ„๋ฅผ ๊ฐ€์ง
  • Age - Parch ๊ทธ๋ž˜ํ”„์—์„œ ๋‚˜์ด๋Š” ๋ถ€๋ชจ/์ž๋…€์˜ ์ˆ˜์— ๋”ฐ๋ผ ์ฆ๊ฐ€ํ•จ

    • ํ•˜์ง€๋งŒ, ์ผ๋ฐ˜์ ์ธ ์ƒ๊ด€๊ด€๊ณ„๋Š” ์Œ์˜ ์ƒ๊ด€๊ด€๊ณ„
  • ๋‚˜์ด ๊ฒฐ์ธก์น˜ ์ฒ˜๋ฆฌ์— SibSp, Parch, PClass ํ™œ์šฉ

    • PClass, Parch ๋ฐ SibSp์— ๋”ฐ๋ผ ์œ ์‚ฌํ•œ ํ–‰์˜ median ๊ฐ’์œผ๋กœ ๋Œ€์ฒด
### ๊ฒฐ์ธก์น˜ ์ฒ˜๋ฆฌ

index_NaN_age = list(dataset["Age"][dataset["Age"].isnull()].index)  # rows with missing Age

# Impute each missing Age with the median age of passengers sharing the same
# SibSp, Parch and Pclass; fall back to the overall median when no similar
# passenger has a known age.
age_med = dataset["Age"].median()  # overall fallback — loop-invariant, computed once

for i in index_NaN_age:
    row = dataset.iloc[i]
    age_pred = dataset["Age"][(dataset['SibSp'] == row["SibSp"])
                              & (dataset['Parch'] == row["Parch"])
                              & (dataset['Pclass'] == row["Pclass"])].median()

    # Write via .loc: chained assignment (dataset['Age'].iloc[i] = ...) is
    # deprecated and may silently modify a copy under pandas copy-on-write.
    # dataset's index was reset after concat, so positional i == label i.
    dataset.loc[i, 'Age'] = age_pred if not np.isnan(age_pred) else age_med
g = sns.catplot(x = "Survived", y = "Age",data = train, kind = "box")
g = sns.catplot(x = "Survived", y = "Age",data = train, kind = "violin")

  • ์ƒ์กดํ•œ ๊ทธ๋ฃน์˜ ์—ฐ๋ น์˜ ์ค‘๊ฐ„๊ฐ’๊ณผ ์ƒ์กดํ•˜์ง€ ๋ชปํ•œ ๊ทธ๋ฃน์˜ ์—ฐ๋ น์˜ ์ค‘๊ฐ„๊ฐ’ ์‚ฌ์ด์—๋Š” ์ƒ๋‹นํ•œ ์ฐจ์ด๊ฐ€ ์กด์žฌ

  • violin plot์„ ํ†ตํ•ด ์—ฌ์ „ํžˆ ๋งค์šฐ ์–ด๋ฆฐ ์Šน๊ฐ๋“ค์˜ ์ƒ์กด๋ฅ ์ด ๋” ๋†’์Œ์„ ํ™•์ธํ•  ์ˆ˜ ์žˆ์Œ

5. ํŠน์„ฑ ๊ณตํ•™(Feature Engineering)

5-1. Name/ Title

dataset["Name"].head()
0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
Name: Name, dtype: object
  • Name feature์—๋Š” ์Šน๊ฐ์˜ ์ œ๋ชฉ์— ๋Œ€ํ•œ ์ •๋ณด๊ฐ€ ํฌํ•จ๋˜์–ด ์žˆ์Œ

  • ๋Œ€ํ”ผํ•˜๋Š” ๋™์•ˆ ๊ตฌ๋ณ„๋˜๋Š” ํ˜ธ์นญ์„ ๊ฐ€์ง„ ์ผ๋ถ€ ์Šน๊ฐ์ด ์„ ํ˜ธ๋  ์ˆ˜ ์žˆ๊ธฐ ๋•Œ๋ฌธ์—, ์ด๋“ค์„ ์ƒˆ๋กœ์šด ๋ณ€์ˆ˜๋กœ ํ™œ์šฉ

dataset_title = [i.split(",")[1].split(".")[0].strip() for i in dataset["Name"]]
dataset["Title"] = pd.Series(dataset_title)
dataset["Title"].head()
0      Mr
1     Mrs
2    Miss
3     Mrs
4      Mr
Name: Title, dtype: object
### ์‹œ๊ฐํ™”

g = sns.countplot(x="Title",data=dataset)
g = plt.setp(g.get_xticklabels(), rotation = 45)

  • 17๊ฐœ์˜ title์ด ์กด์žฌ

    • ๋Œ€๋ถ€๋ถ„ ๊ฑฐ์˜ ์—†๊ณ  ์•ฝ 4๊ฐœ๋กœ ๊ฑฐ์˜ ๋‹ค ๊ตฌ๋ถ„ ๊ฐ€๋Šฅ
### Title(๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜) -> ์ˆ˜์น˜ํ˜• ๋ณ€์ˆ˜

# 4๊ฐœ์˜ ๋ฒ”์ฃผ๋กœ ์žฌ๋ถ„๋ฅ˜
dataset["Title"] = dataset["Title"].replace(['Lady', 'the Countess','Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
dataset["Title"] = dataset["Title"].map({"Master":0, "Miss":1, "Ms" : 1 , "Mme":1, "Mlle":1, "Mrs":1, "Mr":2, "Rare":3})
dataset["Title"] = dataset["Title"].astype(int)
### ์‹œ๊ฐํ™”

g = sns.countplot(dataset["Title"])
g = g.set_xticklabels(["Master","Miss/Ms/Mme/Mlle/Mrs","Mr","Rare"])
  • ์—๋Ÿฌ ๋ฐœ์ƒ
g = sns.catplot(x="Title",y="Survived",data=dataset,kind="bar")
g = g.set_xticklabels(["Master","Miss-Mrs","Mr","Rare"])
g = g.set_ylabels("survival probability")

  • ์—ฌ์ž์™€ ์•„์ด๋“ค์˜ ์ƒ์กด๋ฅ ์ด ๋” ๋†’์Œ์„ ํ™•์ธํ•  ์ˆ˜ ์žˆ๋‹ค.

  • ํŠน์ดํ•œ title์„ ๊ฐ€์ง„ ์Šน๊ฐ๋“ค์˜ ์ƒ์กด๋ฅ ์ด ๋” ๋†’๋‹ค.

### ๊ธฐ์กด์˜ ์ด๋ฆ„ ๋ณ€์ˆ˜๋ฅผ drop

dataset.drop(labels = ["Name"], axis = 1, inplace = True)

5-2. Family Size

  • SibSp, Parch์™€ ์—ฌ๊ธฐ์— 1(์ž๊ธฐ ์ž์‹ )์„ ๋”ํ•ด FSize๋ผ๋Š” ๋ณ€์ˆ˜ ์ƒ์„ฑ
dataset["Fsize"] = dataset["SibSp"] + dataset["Parch"] + 1
### ์‹œ๊ฐํ™”

g = sns.pointplot(x = "Fsize",y = "Survived",data = dataset, )
g = g.set_ylabel("Survival Probability")

image.png

  • ๊ฐ€์กฑ ๊ทœ๋ชจ๊ฐ€ ์ค‘์š”ํ•œ ์—ญํ• ์„ ํ•˜๋Š” ๊ฒƒ์ฒ˜๋Ÿผ ๋ณด์ž„

    • ๋Œ€๊ฐ€์กฑ์˜ ์ƒ์กด ํ™•๋ฅ ์€ ์ตœ์•…์ž„
  • ์ถ”๊ฐ€์ ์œผ๋กœ 4๊ฐœ์˜ ๊ฐ€์กฑ ํฌ๊ธฐ category๋ฅผ ์ƒ์„ฑํ•˜์ž

### ์ƒˆ๋กœ์šด ์นดํ…Œ๊ณ ๋ฆฌ ์ƒ์„ฑ
dataset['Single'] = dataset['Fsize'].map(lambda s: 1 if s == 1 else 0)
dataset['SmallF'] = dataset['Fsize'].map(lambda s: 1 if  s == 2  else 0)
dataset['MedF'] = dataset['Fsize'].map(lambda s: 1 if 3 <= s <= 4 else 0)
dataset['LargeF'] = dataset['Fsize'].map(lambda s: 1 if s >= 5 else 0)
### ์‹œ๊ฐํ™”

g = sns.catplot(x="Single",y="Survived",data=dataset,kind="bar")
g = g.set_ylabels("Survival Probability")

g = sns.catplot(x="SmallF",y="Survived",data=dataset,kind="bar")
g = g.set_ylabels("Survival Probability")

g = sns.catplot(x="MedF",y="Survived",data=dataset,kind="bar")
g = g.set_ylabels("Survival Probability")

g = sns.catplot(x="LargeF",y="Survived",data=dataset,kind="bar")
g = g.set_ylabels("Survival Probability")

  • ์†Œ๊ฐ€์กฑ์ด๋‚˜ ์ค‘๊ฐ„ ์ •๋„ ํฌ๊ธฐ์˜ ๊ฐ€์กฑ๋“ค์ด ํ˜ผ์ž ํƒ‘์Šนํ•œ ์Šน๊ฐ์ด๋‚˜ ๋Œ€๊ฐ€์กฑ๋“ค๋ณด๋‹ค ์ƒ์กด๋ฅ ์ด ๋†’์Œ
### ์ˆ˜์น˜ํ˜• ๋ณ€์ˆ˜๋กœ ๋ณ€ํ™˜
# One-hot Encoding ์ง„ํ–‰

dataset = pd.get_dummies(dataset, columns = ["Title"])
dataset = pd.get_dummies(dataset, columns = ["Embarked"], prefix="Em")
dataset.head()
PassengerId Survived Pclass Sex Age SibSp Parch Ticket Fare Cabin ... SmallF MedF LargeF Title_0 Title_1 Title_2 Title_3 Em_C Em_Q Em_S
0 1 0.0 3 0 22.0 1 0 A/5 21171 1.981001 NaN ... 1 0 0 0 0 1 0 0 0 1
1 2 1.0 1 1 38.0 1 0 PC 17599 4.266662 C85 ... 1 0 0 0 1 0 0 1 0 0
2 3 1.0 3 1 26.0 0 0 STON/O2. 3101282 2.070022 NaN ... 0 0 0 0 1 0 0 0 0 1
3 4 1.0 1 1 35.0 1 0 113803 3.972177 C123 ... 1 0 0 0 1 0 0 0 0 1
4 5 0.0 3 0 35.0 0 0 373450 2.085672 NaN ... 0 0 0 0 0 1 0 0 0 1

5 rows ร— 22 columns

  • ํ˜„ ์‹œ์ ์—์„œ 22๊ฐœ์˜ feature๋ฅผ ๊ฐ€์ง

5-3. Cabin

dataset["Cabin"].head()
0     NaN
1     C85
2     NaN
3    C123
4     NaN
Name: Cabin, dtype: object
dataset["Cabin"].describe()
count     292
unique    186
top        G6
freq        5
Name: Cabin, dtype: object
dataset["Cabin"].isnull().sum()
1007
  • 292๊ฐœ์˜ ๊ฐ’๊ณผ 1007๊ฐœ์˜ ๊ฒฐ์ธก์น˜๊ฐ€ ์กด์žฌ

    • ๊ฐ์‹ค์ด ์—†๋Š” ์Šน๊ฐ์€ ๊ฐ์‹ค ๋ฒˆํ˜ธ ๋Œ€์‹  ๋ˆ„๋ฝ๋œ ๊ฐ’์ด ํ‘œ์‹œ๋˜๋Š” ๊ฒƒ์œผ๋กœ ์ƒ๊ฐ
dataset["Cabin"][dataset["Cabin"].notnull()].head()
1      C85
3     C123
6      E46
10      G6
11    C103
Name: Cabin, dtype: object
### ๊ฒฐ์ธก์น˜์˜ ๊ฒฝ์šฐ 'X'๋กœ ํ‘œ๊ธฐ

dataset["Cabin"] = pd.Series([i[0] if not pd.isnull(i) else 'X' for i in dataset['Cabin'] ])
  • Cabin์˜ ์ฒซ ๊ธ€์ž๋Š” Desk๋ฅผ ํฌํ•จ

    • Titanicํ˜ธ์—์„œ ์Šน๊ฐ์ด ์žˆ์„ ๋งŒํ•œ ์œ„์น˜๋ฅผ ํฌํ•จ
g = sns.countplot(dataset["Cabin"],
                  order=['A','B','C','D','E','F','G','T','X'])
g = sns.catplot(y="Survived",x="Cabin",data=dataset,kind="bar",
                order=['A','B','C','D','E','F','G','T','X'])
g = g.set_ylabels("Survival Probability")

  • ๊ฐ์‹ค์ด ์žˆ๋Š” ์Šน๊ฐ์˜ ์ˆ˜๊ฐ€ ์ ์Œ

    • ์ƒ์กด ํ™•๋ฅ ์€ ์ค‘์š”ํ•œ ํ‘œ์ค€ ํŽธ์ฐจ๋ฅผ ๊ฐ€์ง€๋ฉฐ ๋‹ค๋ฅธ desk์— ์žˆ๋Š” ์Šน๊ฐ์˜ ์ƒ์กด ํ™•๋ฅ ์„ ๊ตฌ๋ณ„ํ•  ์ˆ˜ ์—†์Œ
  • ํ•˜์ง€๋งŒ ์ผ๋ฐ˜์ ์œผ๋กœ ๊ฐ์‹ค์„ ๊ฐ€์ง„ ์Šน๊ฐ์ด ๊ฐ์‹ค์ด ์—†๋Š” ์Šน๊ฐ๋ณด๋‹ค ์ƒ์กดํ•  ์ˆ˜ ์žˆ๋Š” ๊ธฐํšŒ๊ฐ€ ๋” ๋งŽ๋‹ค๋Š” ๊ฒƒ์„ ์•Œ ์ˆ˜ ์žˆ์Œ

    • ํŠนํžˆ ๊ฐ์‹ค B, C, D, E, F์— ํ•ด๋‹น
### ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ -> ์ˆ˜์น˜ํ˜• ๋ณ€์ˆ˜

dataset = pd.get_dummies(dataset, columns = ["Cabin"],prefix="Cabin")

5-4. Ticket

dataset["Ticket"].head()
0           A/5 21171
1            PC 17599
2    STON/O2. 3101282
3              113803
4              373450
Name: Ticket, dtype: object
  • ๊ฐ™์€ ์ ‘๋‘์‚ฌ๋ฅผ ๊ณต์œ ํ•˜๋Š” ํ‹ฐ์ผ“์ด ํ•จ๊ป˜ ๋ฐฐ์น˜๋œ ๊ฐ์‹ค์— ์˜ˆ์•ฝ๋  ์ˆ˜ ์žˆ๋‹ค๋Š” ๊ฒƒ์„ ์˜๋ฏธํ•˜๋Š” ๊ฒƒ์ผ ์ˆ˜ ์žˆ์Œ -> ์‹ค์ œ ์„ ์‹ค ๋ฐฐ์น˜์™€ ์—ฐ๊ฒฐ๋  ์ˆ˜ ์žˆ์Œ

  • ๋™์ผํ•œ ์ ‘๋‘์‚ฌ๋ฅผ ๊ฐ€์ง„ ํ‹ฐ์ผ“์€ PClass์™€ Survived๊ฐ€ ์œ ์‚ฌํ•  ์ˆ˜ ์žˆ์Œ

    • ์ ‘๋‘์–ด๋งŒ์„ ์ถ”์ถœ
### ์ ‘๋‘์–ด๋งŒ์„ ์ถ”์ถœ
# ๋งŒ์•ฝ ์—†๋‹ค๋ฉด 'X'๋กœ ํ‘œ๊ธฐ

# Reduce each ticket to its alphabetic prefix; purely numeric tickets
# (no prefix) are represented by the placeholder "X".
Ticket = [
    "X" if raw.isdigit()
    else raw.replace(".", "").replace("/", "").strip().split(' ')[0]
    for raw in list(dataset.Ticket)
]

dataset["Ticket"] = Ticket
dataset["Ticket"].head()
0        A5
1        PC
2    STONO2
3         X
4         X
Name: Ticket, dtype: object
### One-hot Encoding

dataset = pd.get_dummies(dataset, columns = ["Ticket"], prefix = "T")
### PClass์— ๋Œ€ํ•œ ๋ฒ”์ฃผํ˜• ๋ณ€์ˆ˜ ์ƒ์„ฑ

dataset["Pclass"] = dataset["Pclass"].astype("category")
dataset = pd.get_dummies(dataset, columns = ["Pclass"],prefix = "Pc")
### ๋ถˆํ•„์š”ํ•œ ๋ณ€์ˆ˜ ๋‚ ๋ฆฌ๊ธฐ
dataset.drop(labels = ["PassengerId"], axis = 1, inplace = True)
dataset.head()
Survived Sex Age SibSp Parch Fare Fsize Single SmallF MedF ... T_STONO T_STONO2 T_STONOQ T_SWPP T_WC T_WEP T_X Pc_1 Pc_2 Pc_3
0 0.0 0 22.0 1 0 1.981001 2 0 1 0 ... 0 0 0 0 0 0 0 0 0 1
1 1.0 1 38.0 1 0 4.266662 2 0 1 0 ... 0 0 0 0 0 0 0 1 0 0
2 1.0 1 26.0 0 0 2.070022 1 1 0 0 ... 0 1 0 0 0 0 0 0 0 1
3 1.0 1 35.0 1 0 3.972177 2 0 1 0 ... 0 0 0 0 0 0 1 1 0 0
4 0.0 0 35.0 0 0 2.085672 1 1 0 0 ... 0 0 0 0 0 0 1 0 0 1

5 rows ร— 67 columns

6. ๋ชจ๋ธ๋ง(Modeling)

### Train, Test๋ฅผ ๋‹ค์‹œ ๋‚˜๋ˆ„๊ธฐ

# Split the combined frame back into the original train / test rows.
# .copy() makes each slice own its data: without it, the in-place drop and
# the Survived cast below operate on views of `dataset`, which raises
# SettingWithCopyWarning and may not persist under pandas copy-on-write.
train = dataset[:train_len].copy()
test = dataset[train_len:].copy()
test.drop(labels = ["Survived"], axis = 1, inplace = True)  # test set has no label
### feature(X) vs label(y)

train["Survived"] = train["Survived"].astype(int)  # was float after concat (NaNs in test rows)

Y_train = train["Survived"]
X_train = train.drop(labels = ["Survived"], axis = 1)

6-1. ๋‹จ์ˆœํ•œ ๋ชจ๋ธ๋ง

a. ๊ต์ฐจ ๊ฒ€์ฆ(Cross Validate) ๋ชจ๋ธ

  • 10๊ฐœ์˜ ์ธ๊ธฐ ์žˆ๋Š” ๋ถ„๋ฅ˜๊ธฐ๋ฅผ ๋น„๊ต

  • stratified k-fold ๊ต์ฐจ ๊ฒ€์ฆ ๋ฐฉ์‹ -> ๊ฐ ๋ถ„๋ฅ˜๊ธฐ์˜ ํ‰๊ท  ์ •ํ™•๋„๋ฅผ ํ‰๊ฐ€

  • ํ™œ์šฉ ๋ชจ๋ธ

    • SVC

    • Decision Tree

    • AdaBoost

    • Random Forest

    • Extra Trees

    • Gradient Boosting

    • Multiple layer perceprton (neural network)

    • KNN

    • Logistic regression

    • Linear Discriminant Analysis

kfold = StratifiedKFold(n_splits = 10)
### modeling

random_state = 2

# ๋ชจ๋ธ ๊ฐ์ฒด ์ƒ์„ฑ
classifiers = []
classifiers.append(SVC(random_state=random_state))
classifiers.append(DecisionTreeClassifier(random_state=random_state))
classifiers.append(AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state),
                                      random_state=random_state,learning_rate=0.1))
classifiers.append(RandomForestClassifier(random_state=random_state))
classifiers.append(ExtraTreesClassifier(random_state=random_state))
classifiers.append(GradientBoostingClassifier(random_state=random_state))
classifiers.append(MLPClassifier(random_state=random_state))
classifiers.append(KNeighborsClassifier())
classifiers.append(LogisticRegression(random_state = random_state))
classifiers.append(LinearDiscriminantAnalysis())

# ๊ต์ฐจ ๊ฒ€์ฆ
cv_results = []
for classifier in classifiers:
    cv_results.append(cross_val_score(classifier, X_train, y = Y_train, 
                                      scoring = "accuracy", cv = kfold, n_jobs=4))
# ํ‰๊ฐ€
cv_means = []
cv_std = []
for cv_result in cv_results:
    cv_means.append(cv_result.mean())
    cv_std.append(cv_result.std())

cv_res = pd.DataFrame({"CrossValMeans":cv_means,"CrossValerrors": cv_std,
                       "Algorithm":["SVC","DecisionTree","AdaBoost","RandomForest",
                                    "ExtraTrees","GradientBoosting",
                                    "MultipleLayerPerceptron","KNeighboors",
                                    "LogisticRegression","LinearDiscriminantAnalysis"]})
### ์‹œ๊ฐํ™”

g = sns.barplot(x = "CrossValMeans",y = "Algorithm",data = cv_res, 
                palette = "Set3",orient = "h",**{'xerr':cv_std})
g.set_xlabel("Mean Accuracy")
g = g.set_title("Cross validation scores")

  • ์•™์ƒ๋ธ” ๋ชจ๋ธ๋ง์„ ์œ„ํ•ด SVC, AdaBoost, RandomForest , ExtraTrees, GradientBoosting ๋ชจ๋ธ ์„ ํƒ

b. ์ตœ์  ๋ชจ๋ธ์„ ์œ„ํ•œ ํ•˜์ดํผ ํŒŒ๋ผ๋ฏธํ„ฐ ํŠœ๋‹(hyper parameter tuning)

  • Grid Search ์ตœ์ ํ™”๋ฅผ ์ˆ˜ํ–‰
### ๋ฉ”ํƒ€ ๋ชจ๋ธ๋ง(Meta Modeling)

# Adaboost
DTC = DecisionTreeClassifier()
adaDTC = AdaBoostClassifier(DTC, random_state=7)
ada_param_grid = {"base_estimator__criterion" : ["gini", "entropy"],
              "base_estimator__splitter" :   ["best", "random"],
              "algorithm" : ["SAMME","SAMME.R"],
              "n_estimators" :[1,2],
              "learning_rate":  [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3,1.5]}

gsadaDTC = GridSearchCV(adaDTC,param_grid = ada_param_grid, cv=kfold, 
                        scoring="accuracy", n_jobs= 4, verbose = 1)
gsadaDTC.fit(X_train,Y_train)
ada_best = gsadaDTC.best_estimator_
gsadaDTC.best_score_
0.8275536261491316
# ExtraTrees 
ExtC = ExtraTreesClassifier()

ex_param_grid = {"max_depth": [None],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [False],
              "n_estimators" :[100,300],
              "criterion": ["gini"]}

gsExtC = GridSearchCV(ExtC,param_grid = ex_param_grid, cv=kfold, 
                      scoring="accuracy", n_jobs= 4, verbose = 1)

gsExtC.fit(X_train,Y_train)
ExtC_best = gsExtC.best_estimator_

# Best score
gsExtC.best_score_
Fitting 10 folds for each of 54 candidates, totalling 540 fits
0.8286133810010214
# RFC Parameters tunning 
RFC = RandomForestClassifier()

rf_param_grid = {"max_depth": [None],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [False],
              "n_estimators" :[100,300],
              "criterion": ["gini"]}

gsRFC = GridSearchCV(RFC,param_grid = rf_param_grid, cv=kfold, 
                     scoring="accuracy", n_jobs= 4, verbose = 1)
gsRFC.fit(X_train,Y_train)
RFC_best = gsRFC.best_estimator_

# Best score
gsRFC.best_score_
Fitting 10 folds for each of 54 candidates, totalling 540 fits
0.8320224719101124
# Gradient boosting tunning

GBC = GradientBoostingClassifier()
gb_param_grid = {'loss' : ["deviance"],
              'n_estimators' : [100,200,300],
              'learning_rate': [0.1, 0.05, 0.01],
              'max_depth': [4, 8],
              'min_samples_leaf': [100,150],
              'max_features': [0.3, 0.1] 
              }

gsGBC = GridSearchCV(GBC,param_grid = gb_param_grid, cv=kfold, 
                     scoring="accuracy", n_jobs= 4, verbose = 1)
gsGBC.fit(X_train,Y_train)
GBC_best = gsGBC.best_estimator_

# Best score
gsGBC.best_score_
Fitting 10 folds for each of 72 candidates, totalling 720 fits
/usr/local/lib/python3.9/dist-packages/sklearn/ensemble/_gb.py:280: FutureWarning: The loss parameter name 'deviance' was deprecated in v1.1 and will be removed in version 1.3. Use the new parameter name 'log_loss' which is equivalent.
  warnings.warn(
0.8376915219611849
# SVC classifier
SVMC = SVC(probability=True)
svc_param_grid = {'kernel': ['rbf'], 
                  'gamma': [ 0.001, 0.01, 0.1, 1],
                  'C': [1, 10, 50, 100,200,300, 1000]}

gsSVMC = GridSearchCV(SVMC,param_grid = svc_param_grid, cv=kfold, 
                      scoring="accuracy", n_jobs= 4, verbose = 1)

gsSVMC.fit(X_train,Y_train)
SVMC_best = gsSVMC.best_estimator_

# Best score
gsSVMC.best_score_
Fitting 10 folds for each of 28 candidates, totalling 280 fits
0.8331332992849847

c. ํ•™์Šต ๊ณก์„ (learning curve) ์‹œ๊ฐํ™”

  • training set์—์„œ์˜ overfitting

  • ์ •ํ™•๋„์— training size๊ฐ€ ๋ฏธ์น˜๋Š” ์˜ํ–ฅ ํŒŒ์•…

### ์‹œ๊ฐํ™”๋ฅผ ์œ„ํ•œ ํ•จ์ˆ˜ ์ •์˜

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot mean train/CV scores (with +-1 std bands) vs. training-set size.

    Returns the ``matplotlib.pyplot`` module so callers can chain further
    customization onto the current figure.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, tr_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # Aggregate the per-fold scores for each training size.
    tr_mean = np.mean(tr_scores, axis=1)
    tr_std = np.std(tr_scores, axis=1)
    cv_mean = np.mean(cv_scores, axis=1)
    cv_std = np.std(cv_scores, axis=1)

    plt.grid()

    # Shaded +-1 standard-deviation band around each mean curve.
    plt.fill_between(sizes, tr_mean - tr_std, tr_mean + tr_std,
                     alpha=0.1, color="r")
    plt.fill_between(sizes, cv_mean - cv_std, cv_mean + cv_std,
                     alpha=0.1, color="g")

    plt.plot(sizes, tr_mean, 'o-', color="r", label="Training score")
    plt.plot(sizes, cv_mean, 'o-', color="g", label="Cross-validation score")

    plt.legend(loc="best")

    return plt
### ์‹œ๊ฐํ™”

# Learning curves for each tuned classifier: training vs. cross-validation
# accuracy as a function of the training-set size.
# (Fixed typo in the RF plot title: "mearning" -> "learning".)
g = plot_learning_curve(gsRFC.best_estimator_,"RF learning curves",
                        X_train,Y_train,cv=kfold)
g = plot_learning_curve(gsExtC.best_estimator_,"ExtraTrees learning curves",
                        X_train,Y_train,cv=kfold)
g = plot_learning_curve(gsSVMC.best_estimator_,"SVC learning curves",
                        X_train,Y_train,cv=kfold)
g = plot_learning_curve(gsadaDTC.best_estimator_,"AdaBoost learning curves",
                        X_train,Y_train,cv=kfold)
g = plot_learning_curve(gsGBC.best_estimator_,"GradientBoosting learning curves",
                        X_train,Y_train,cv=kfold)

  • GradientBoosting ๋ฐ AdaBoost ๋ถ„๋ฅ˜๊ธฐ๋Š” ํ›ˆ๋ จ ์„ธํŠธ์— overfitting๋œ ๊ฒฝํ–ฅ์ด ์žˆ์Œ

    • ์ฆ๊ฐ€ํ•˜๋Š” ๊ต์ฐจ ๊ฒ€์ฆ ๊ณก์„ ์— ๋”ฐ๋ผ GradientBoosting๊ณผ AdaBoost๋Š” ๋” ๋งŽ์€ ํ›ˆ๋ จ ์˜ˆ์ œ๋ฅผ ํ†ตํ•ด ๋” ๋‚˜์€ ์„ฑ๋Šฅ์„ ๋ฐœํœ˜ํ•  ์ˆ˜ ์žˆ์Œ
  • SVC์™€ ExtraTree ๋ถ„๋ฅ˜๊ธฐ๋Š” ํ›ˆ๋ จ๊ณผ ๊ต์ฐจ ๊ฒ€์ฆ ๊ณก์„ ์ด ์„œ๋กœ ๋น„์Šทํ•จ

    • ์˜ˆ์ธก์„ ๋” ์ž˜ ์ผ๋ฐ˜ํ™”ํ•˜๋Š” ๊ฒƒ์œผ๋กœ ๋ณด์ž„

d. ํ”ผ์ณ ์ค‘์š”๋„ & ํŠธ๋ฆฌ ๊ธฐ๋ฐ˜ ๋ถ„๋ฅ˜

  • ์Šน๊ฐ ์ƒ์กด ์˜ˆ์ธก์„ ์œ„ํ•œ ๊ฐ€์žฅ ์œ ์šฉํ•œ ํŠน์ง• ํŒŒ์•…
# 2x2 grid: one feature-importance bar chart per tree-based classifier.
nrows = ncols = 2
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, sharex="all", figsize=(15, 15))

names_classifiers = [("AdaBoosting", ada_best), ("ExtraTrees", ExtC_best),
                     ("RandomForest", RFC_best), ("GradientBoosting", GBC_best)]

# axes.flatten() yields the subplots in row-major order, matching the
# original nested row/col loop.
for (name, classifier), ax in zip(names_classifiers, axes.flatten()):
    # Indices of the top-40 features, in decreasing order of importance.
    order = np.argsort(classifier.feature_importances_)[::-1][:40]

    g = sns.barplot(y=X_train.columns[order],
                    x=classifier.feature_importances_[order],
                    orient='h', ax=ax)
    g.set_xlabel("Relative importance", fontsize=12)
    g.set_ylabel("Features", fontsize=12)
    g.tick_params(labelsize=9)
    g.set_title(name + " feature importance")

  • 4๊ฐœ์˜ ํŠธ๋ฆฌ ๊ธฐ๋ฐ˜ ๋ถ„๋ฅ˜๊ธฐ(Adaboost, ExtraTree, RandomForest ๋ฐ GradientBoost)์— ๋Œ€ํ•œ feature ์ค‘์š”๋„๋ฅผ ํ‘œ์‹œ

  • ๋„ค ๊ฐœ์˜ ๋ถ„๋ฅ˜๊ธฐ๊ฐ€ ์ƒ๋Œ€์  ์ค‘์š”์„ฑ์— ๋”ฐ๋ผ ๋‹ค๋ฅธ ์ตœ์ƒ์œ„ feature ๊ฐ€์ง€๊ณ  ์žˆ์Œ

    • ์˜ˆ์ธก์ด ๊ฐ™์€ feature์— ๊ทผ๊ฑฐํ•˜์ง€ ์•Š๋Š”๋‹ค๋Š” ๊ฒƒ์„ ์˜๋ฏธ
  • Title_2(Mrs/Mlle/Mme/Miss/Ms) ๋ฒ”์ฃผ๋Š” Sex์™€ ๋†’์€ ์ƒ๊ด€๊ด€๊ณ„๊ฐ€ ์žˆ์Œ

    • Discussion:

      • Pc_1, Pc_2, Pc_3 ๋ฐ ์š”๊ธˆ์€ ์Šน๊ฐ์˜ ์ผ๋ฐ˜์ ์ธ ์‚ฌํšŒ์  ์ง€์œ„๋ฅผ ๋‚˜ํƒ€๋ƒ„

      • Sex์™€ Title_2(Mrs/Mlle/Mme/Miss/Ms) ๋ฐ Title_3(Mr)์€ ์„ฑ๋ณ„์„ ์˜๋ฏธ

      • ์—ฐ๋ น๊ณผ Title_1(Master)๋Š” ์Šน๊ฐ์˜ ์—ฐ๋ น

      • Fsize, LargeF, MedF, Single์€ ์Šน๊ฐ์˜ ๊ฐ€์กฑ ์ˆ˜ ํฌ๊ธฐ๋ฅผ ์˜๋ฏธ

์ƒ์กด ์˜ˆ์ธก์€ ๋ณดํŠธ ์•ˆ์˜ ์œ„์น˜๋ณด๋‹ค๋Š” ์Šน๊ฐ์˜ ๋‚˜์ด, ์„ฑ๋ณ„, ๊ฐ€์กฑ ๊ทœ๋ชจ, ์‚ฌํšŒ์  ์ง€์œ„์™€ ๋” ๊ด€๋ จ์ด ์žˆ์Œ

### ๊ฐ๊ฐ์˜ ๋ชจ๋ธ์— ๋Œ€ํ•ด ์˜ˆ์ธก๊ฐ’ ๋„์ถœ
# One named prediction Series per tuned model; the names become the
# axis labels of the correlation heatmap below.
test_Survived_RFC = pd.Series(data=RFC_best.predict(test), name="RFC")
test_Survived_ExtC = pd.Series(data=ExtC_best.predict(test), name="ExtC")
test_Survived_SVMC = pd.Series(data=SVMC_best.predict(test), name="SVC")
test_Survived_AdaC = pd.Series(data=ada_best.predict(test), name="Ada")
test_Survived_GBC = pd.Series(data=GBC_best.predict(test), name="GBC")

# Combine every model's predictions column-wise.
ensemble_results = pd.concat(
    [test_Survived_RFC, test_Survived_ExtC, test_Survived_AdaC,
     test_Survived_GBC, test_Survived_SVMC],
    axis=1,
)

# Pairwise correlation between the classifiers' predictions.
g = sns.heatmap(ensemble_results.corr(), annot=True)

  • AdaBoost๊ฐ€ ๋‹ค๋ฅธ ๋ถ„๋ฅ˜๊ธฐ์™€ ๋น„๊ต๋˜๋Š” ๊ฒฝ์šฐ๋ฅผ ์ œ์™ธํ•˜๊ณ ๋Š” 5๊ฐœ ๋ถ„๋ฅ˜๊ธฐ์— ๋Œ€ํ•ด ์ƒ๋‹นํžˆ ์œ ์‚ฌํ•œ ๊ฒƒ์œผ๋กœ ๋ณด์ž„

  • 5๊ฐœ์˜ ๋ถ„๋ฅ˜๊ธฐ๋Š” ๊ฑฐ์˜ ๋™์ผํ•œ ์˜ˆ์ธก์„ ์ œ๊ณตํ•˜์ง€๋งŒ ์•ฝ๊ฐ„์˜ ์ฐจ์ด๊ฐ€ ์žˆ์Œ

    • ensembling voting ํ™œ์šฉํ•˜๊ธฐ์— ์ถฉ๋ถ„

6-2. ๋ชจ๋ธ ์•™์ƒ๋ธ”(Ensemble)

###a. ๋ชจ๋ธ ๊ฒฐํ•ฉํ•˜๊ธฐ

  • VotingClassifier๋ฅผ ์„ ํƒ

  • soft ์ง€์ •: ๊ฐ vote์—์„œ์˜ ํ™•๋ฅ  ๊ณ ๋ ค

# Soft-voting ensemble over the five tuned classifiers: the predicted class
# probabilities are averaged instead of counting hard votes.
estimators = [
    ('rfc', RFC_best),
    ('extc', ExtC_best),
    ('svc', SVMC_best),
    ('adac', ada_best),
    ('gbc', GBC_best),
]
votingC = VotingClassifier(estimators=estimators, voting='soft', n_jobs=4)
votingC = votingC.fit(X_train, Y_train)

6-3. ์˜ˆ์ธก(Prediction)

a. ์˜ˆ์ธก & ๊ฒฐ๊ณผ ๋„์ถœ

# Final prediction with the soft-voting ensemble.
test_Survived = pd.Series(data=votingC.predict(test), name="Survived")

# Pair each PassengerId with its predicted label and write the submission CSV.
results = pd.concat([IDtest, test_Survived], axis=1)
results.to_csv("ensemble_python_voting.csv", index=False)

ํƒœ๊ทธ: , ,

์นดํ…Œ๊ณ ๋ฆฌ:

์—…๋ฐ์ดํŠธ: