[ECC DS 1์ฃผ์ฐจ] Titanic Top 4% with ensemble modeling
1.Introduction
### Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from collections import Counter
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
### seaborn plot ๊ธฐ๋ณธ ์ค์
sns.set(style = 'white', context = 'notebook', palette = 'deep')
2. ๋ฐ์ดํฐ ๋ก๋ฉ/ ํ์ธ
2-1. ๋ฐ์ดํฐ ๋ถ๋ฌ์ค๊ธฐ
train = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/ECC 48แแ
ต แแ
ฆแแ
ชB/1แแ
ฎแแ
ก/data/train.csv")
test = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/ECC 48แแ
ต แแ
ฆแแ
ชB/1แแ
ฎแแ
ก/data/test.csv")
IDtest = test["PassengerId"]
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
test.head()
PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 892 | 3 | Kelly, Mr. James | male | 34.5 | 0 | 0 | 330911 | 7.8292 | NaN | Q |
1 | 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | female | 47.0 | 1 | 0 | 363272 | 7.0000 | NaN | S |
2 | 894 | 2 | Myles, Mr. Thomas Francis | male | 62.0 | 0 | 0 | 240276 | 9.6875 | NaN | Q |
3 | 895 | 3 | Wirz, Mr. Albert | male | 27.0 | 0 | 0 | 315154 | 8.6625 | NaN | S |
4 | 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | female | 22.0 | 1 | 1 | 3101298 | 12.2875 | NaN | S |
2-2. ์ด์์น(outlier) ํ์ง
-
IQR(์ฌ๋ถ์ ๋ฒ์) ๋ฐฉ๋ฒ
-
Tukey Method
-
IQR = Q3(75%) - Q1(25%)
-
[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]
๋ฒ์๋ฅผ ๋ฒ์ด๋๋ ๋ฐ์ดํฐ ํฌ์ธํธ๋ฅผ ์ด์์น๋ก ๊ฐ์ฃผ
### ์ด์์น ํ์ง๋ฅผ ์ํ ํจ์
def detect_outliers(df, n, features):
    """
    Return the index labels of rows in *df* holding more than *n* outliers
    among *features*, using the Tukey (IQR) method.

    A value is an outlier for a column when it falls outside
    [Q1 - 1.5*IQR, Q3 + 1.5*IQR], where IQR = Q3 - Q1.

    Parameters
    ----------
    df : pandas.DataFrame
    n : int
        A row is reported only when it is an outlier in MORE than n columns.
    features : iterable of str
        Names of the numerical columns to inspect.

    Returns
    -------
    list
        Index labels of rows with more than n per-column outliers.
    """
    outlier_indices = []  # index labels of every per-column outlier (with repeats)
    for col in features:
        # nanpercentile ignores missing values; plain np.percentile would
        # return NaN for a column containing NaN (e.g. Age), making every
        # comparison below False and silently detecting no outliers at all.
        Q1 = np.nanpercentile(df[col], 25)
        Q3 = np.nanpercentile(df[col], 75)
        # Interquartile range (IQR)
        IQR = Q3 - Q1
        outlier_step = 1.5 * IQR
        # Rows whose value for this column lies outside the Tukey fences
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step)].index
        outlier_indices.extend(outlier_list_col)
    # Keep only rows flagged as an outlier in more than n columns
    outlier_indices = Counter(outlier_indices)
    multiple_outliers = list(k for k, v in outlier_indices.items() if v > n)
    return multiple_outliers
# ์์นํ ๋ณ์๋ค(Age, SibSp, Parch, Fare)์์ ์ด์์น ํ์ง
Outliers_to_drop = detect_outliers(train,2,["Age","SibSp","Parch","Fare"])
-
์ด์์น๋ ์์ธก์ ๊ทน์ ์ธ ์ํฅ์ ๋ฏธ์น ์ ์์ (ํนํ ํ๊ท ๋ฌธ์ )
- ์ด์ ๋ํ ์ ์ฒ๋ฆฌ
-
Tukey ๋ฐฉ๋ฒ์ ํ์ฉํ์ฌ ์ด์์น ํ์ง
-
์์นํ ๋ณ์(Age, SibSp, Parch and Fare)์์ ์ด์์น๋ฅผ ํ์ง
- ์ต์ ๋ ๊ฐ์ ๋์ถ๋ ์ซ์ ๊ฐ์ ์ด์์น๋ฅผ ๊ฐ์ง ํ์ผ๋ก ๊ฐ์ฃผ
# ์ด์์น๊ฐ ์๋ ํ ํ์
train.loc[Outliers_to_drop]
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
27 | 28 | 0 | 1 | Fortune, Mr. Charles Alexander | male | 19.0 | 3 | 2 | 19950 | 263.00 | C23 C25 C27 | S |
88 | 89 | 1 | 1 | Fortune, Miss. Mabel Helen | female | 23.0 | 3 | 2 | 19950 | 263.00 | C23 C25 C27 | S |
159 | 160 | 0 | 3 | Sage, Master. Thomas Henry | male | NaN | 8 | 2 | CA. 2343 | 69.55 | NaN | S |
180 | 181 | 0 | 3 | Sage, Miss. Constance Gladys | female | NaN | 8 | 2 | CA. 2343 | 69.55 | NaN | S |
201 | 202 | 0 | 3 | Sage, Mr. Frederick | male | NaN | 8 | 2 | CA. 2343 | 69.55 | NaN | S |
324 | 325 | 0 | 3 | Sage, Mr. George John Jr | male | NaN | 8 | 2 | CA. 2343 | 69.55 | NaN | S |
341 | 342 | 1 | 1 | Fortune, Miss. Alice Elizabeth | female | 24.0 | 3 | 2 | 19950 | 263.00 | C23 C25 C27 | S |
792 | 793 | 0 | 3 | Sage, Miss. Stella Anna | female | NaN | 8 | 2 | CA. 2343 | 69.55 | NaN | S |
846 | 847 | 0 | 3 | Sage, Mr. Douglas Bullen | male | NaN | 8 | 2 | CA. 2343 | 69.55 | NaN | S |
863 | 864 | 0 | 3 | Sage, Miss. Dorothy Edith "Dolly" | female | NaN | 8 | 2 | CA. 2343 | 69.55 | NaN | S |
-
10๊ฐ์ ์ด์์น๋ฅผ ํ์ง
-
28, 89, 342๋ฒ ์น๊ฐ์ ๊ฒฝ์ฐ ๋งค์ฐ ๋์ Ticket,Fare๋ฅผ ๊ฐ์ง๊ณ ์์
-
๋๋จธ์ง 7๋ช ์ ๊ฒฝ์ฐ SibSp ๊ฐ์ด ๋งค์ฐ ๋์
-
### Remove outliers
train = train.drop(Outliers_to_drop, axis = 0).reset_index(drop = True)
# Re-assign a fresh 0..n-1 index after dropping the outlier rows
2-3. Train + Test set
### Combine the train and test sets
# Done before categorical conversion so both splits get identical feature columns
# train_len remembers where train ends, so the sets can be split back later
train_len = len(train)
dataset = pd.concat(objs = [train, test], axis = 0).reset_index(drop = True) # row-wise concatenation
2-4. Null๊ฐ, ๊ฒฐ์ธก์น(Missing value) ํ์ธ
# Fill empty or NaN entries uniformly with NaN
dataset = dataset.fillna(np.nan)
# Count remaining nulls per column
dataset.isnull().sum()
PassengerId 0 Survived 418 Pclass 0 Name 0 Sex 0 Age 256 SibSp 0 Parch 0 Ticket 0 Fare 1 Cabin 1007 Embarked 2 dtype: int64
-
Age์ Cabin์ ๋งค์ฐ ๋ง์ ๊ฒฐ์ธก์น ์กด์ฌ
-
ํ ์คํธ ๋ฐ์ดํฐ ๊ฒฐํฉ ์์ Survived์ ๊ฒฐ์ธก์น๋ test set์ ์กด์ฌ x
- concat ์ NaN ๊ฐ์ ๋์ฒดํด์
### ์ ๋ณด
train.info()
print()
train.isnull().sum()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 881 entries, 0 to 880 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PassengerId 881 non-null int64 1 Survived 881 non-null int64 2 Pclass 881 non-null int64 3 Name 881 non-null object 4 Sex 881 non-null object 5 Age 711 non-null float64 6 SibSp 881 non-null int64 7 Parch 881 non-null int64 8 Ticket 881 non-null object 9 Fare 881 non-null float64 10 Cabin 201 non-null object 11 Embarked 879 non-null object dtypes: float64(2), int64(5), object(5) memory usage: 82.7+ KB
PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 Age 170 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 680 Embarked 2 dtype: int64
### ๋ฐ์ดํฐ ํ์ธ
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
### ๋ฐ์ดํฐํ ํ์ธ
train.dtypes
PassengerId int64 Survived int64 Pclass int64 Name object Sex object Age float64 SibSp int64 Parch int64 Ticket object Fare float64 Cabin object Embarked object dtype: object
### ๋ฐ์ดํฐ ์์ฝ
# ํต๊ณ๋ ์ ๊ณต
train.describe()
PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
---|---|---|---|---|---|---|---|
count | 881.000000 | 881.000000 | 881.000000 | 711.000000 | 881.000000 | 881.000000 | 881.000000 |
mean | 446.713961 | 0.385925 | 2.307605 | 29.731603 | 0.455165 | 0.363224 | 31.121566 |
std | 256.617021 | 0.487090 | 0.835055 | 14.547835 | 0.871571 | 0.791839 | 47.996249 |
min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
25% | 226.000000 | 0.000000 | 2.000000 | 20.250000 | 0.000000 | 0.000000 | 7.895800 |
50% | 448.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 668.000000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 30.500000 |
max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 5.000000 | 6.000000 | 512.329200 |
3. ๋ณ์(feature) ๋ถ์
3-1. ์์นํ(numerical) ๋ณ์
### ์๊ด๊ณ์ ํ๋ ฌ
# ์์นํ ๋ณ์๋ค๊ณผ Survived ๊ฐ์ ์๊ด๊ณ์
g = sns.heatmap(train[["Survived","SibSp","Parch",
"Age","Fare"]].corr(),annot = True, fmt = ".2f", cmap = "coolwarm")
-
์ค์ง Fare ๋ณ์๋ง์ด Survived ๋ณ์์ ์ ์ํ ์๊ด๊ด๊ณ๋ฅผ ๋ณด์
-
๋ค๋ฅธ feature๋ค์ด ์ค์ํ์ง ์๋ค๋ ์๋ฏธ๋ x
-
ํด๋น feature๋ค์ ํ์ ๋ณ์๋ค์ด Survived์ ์๊ด์ฑ์ ๊ฐ์ง ์ ์์
-
-
seaborn version issue๋ก ์ธํด ์ผ๋ถ ํจ์ ๋ณ๊ฒฝ
(factorplot -> pointplot, catplot)
(distplot -> histplot)
โ SibSp
g = sns.catplot(x = "SibSp",y = "Survived",data = train,
kind = "bar", palette = "muted")
g.despine(left = True)
g = g.set_ylabels("survival probability")
-
ํ์ /๋ฐฐ์ฐ์๊ฐ ๋ง์์๋ก ์์กด๋ฅ ์ด ๋ฎ์ ๊ฒ์ผ๋ก ํ๋จ๋จ
- ํผ์์ธ ์น๊ฐ(SibSp = 0)์ด๋ ํ, ๋๋ช ์ ์ฌ๋์ด ์๋ ๊ฒฝ์ฐ(SibSp = 1 or 2)๊ฐ ์์กด๋ฅ ์ด ๋ ๋์ ๊ฒฝํฅ์ ๋ณด์
โ Parch
g = sns.catplot(x = "Parch",y = "Survived",data = train,
kind = "bar", palette = "muted")
g.despine(left = True)
g = g.set_ylabels("survival probability")
-
์๊ฐ์กฑ์ด ๋ ์ ๊ฐ์กฑ(Parch = 0)์ด๋ ์ค์ ๊ฐ์กฑ(Parch = 3,4), ๋๊ฐ์กฑ(Parch = 5,6)๋ณด๋ค ์์กด๋ฅ ์ด ๋์
-
๋ถ๋ชจ/์๋ ๊ฐ 3๋ช ์ธ ์น๊ฐ์ ์์กด๋ฅ ์ ์ค์ํ ํ์ค ํธ์ฐจ๊ฐ ์์
โ Age
g = sns.FacetGrid(train, col = 'Survived')
g = g.map(sns.histplot, "Age", kde = True)
-
๊ฐ์ฐ์ค ๋ถํฌ์ผ ์๋ ์๋ ์ฝ๊ฐ์ ์๊ณก๋(tailed) ๋ถํฌ๋ฅผ ๋ณด์
-
์์กดํ ์ง๋จ๊ณผ ์์กดํ์ง ๋ชปํ ์ง๋จ์์์ ์ฐ๋ น๋ ๋ถํฌ ์์์ด ๋ค๋ฆ
-
์์กดํ ์ง๋จ ์ค ์ ์ ์ฌ๋๋ค์ ๋น์จ์ด ๋์
-
60 ~ 80์ธ ์ฌ์ด์ ์น๊ฐ๋ค์ ์์กด๋ฅ ์ด ๋ฎ์
-
-
Age์ Survived์ ์๊ด๊ณ์๊ฐ ๋ฎ๋๋ผ๋, ์์กด ๊ฐ๋ฅ์ฑ์ด ๋์ ์ฐ๋ น๋๊ฐ ์กด์ฌํจ์ ์ง์ํ ์ ์์
โ ์ฌ๋ด์ผ๋ก, ์๊ด๊ณ์ ํ๋ ฌ์์์ ์๊ด ๊ณ์(default = Pearson ์๊ด๊ณ์)๋ ๋ ๋ณ์ ๊ฐ์ ์ ํ์ ๊ด๊ณ๋ฅผ ๋ํ๋ด๋ ์ธก๋์ด๊ธฐ์, ์ฐ๋ฆฌ๊ฐ ๋ชจ๋ฅด๋ ๋ค๋ฅธ ๋น์ ํ์ ๊ด๊ณ๊ฐ ์กด์ฌํ ์ ์์ต๋๋ค..!
### Age distribution by survival, as overlaid density curves
# seaborn >= 0.11 renamed kdeplot's `shade` to `fill` (`shade` removed in 0.14),
# matching the factorplot/distplot modernisation applied elsewhere in this notebook.
g = sns.kdeplot(train["Age"][(train["Survived"] == 0) & (train["Age"].notnull())],
                color = "Red", fill = True)
g = sns.kdeplot(train["Age"][(train["Survived"] == 1) & (train["Age"].notnull())],
                ax = g, color = "Blue", fill = True)
g.set_xlabel("Age")
g.set_ylabel("Frequency")
g = g.legend(["Not Survived","Survived"])
- ๋ ๋ฐ๋ํจ์๋ฅผ ์ค์ฒฉํ์ฌ ํ์ ํ๋ฉด, ์๊ธฐ๋ค๊ณผ ๋งค์ฐ ์ด๋ฆฐ ์์ด๋ค์ ๋์ํ๋ peak๋ฅผ ๋ณผ ์ ์์(Peak 0 ~ 5)
โ Fare
### ๊ฒฐ์ธก์น ํ์ธ
dataset["Fare"].isnull().sum()
1
### ๊ฒฐ์ธก์น -> ์ค๊ฐ๊ฐ
# ์์ธก์ ์ค์ํ ์ํฅ์ ๋ฏธ์น์ง ์์ ๊ฒ์ด๋ผ๊ณ ํ๋จ๋๋ ์ค์๊ฐ ์ฑํ
dataset["Fare"] = dataset["Fare"].fillna(dataset["Fare"].median())
g = sns.histplot(dataset["Fare"], color="m", kde = True,
label = "Skewness : %.2f"%(dataset["Fare"].skew()))
g = g.legend(loc = "best")
-
Fare ๋ณ์๋ ๋งค์ฐ ์๊ณก๋ ๋ถํฌ๋ฅผ ๋๊ณ ์์
-
Scale์ด ์กฐ์ ๋๋ค ํด๋ ๋ชจํ์ ๊ฐ์ค์น๊ฐ ๋งค์ฐ ๋์์ง๋ ๋ฌธ์ ๊ฐ ๋ฐ์ํ ์ ์์
-
๋ก๊ทธ ๋ณํ(log transformation)์ ํตํด ์๊ณก์ ์ค์ด๋ ๊ฒ์ด ๊ถ์ฅ๋จ
-
### Log transform
# Shrink Fare's heavy right skew; zero fares stay at 0 (log(0) is undefined)
dataset["Fare"] = dataset["Fare"].apply(lambda fare: np.log(fare) if fare > 0 else 0)
### ๋ก๊ทธ ๋ณํ ํ ์๊ฐํ
g = sns.histplot(dataset["Fare"], color="b", kde = True,
label="Skewness : %.2f"%(dataset["Fare"].skew()))
g = g.legend(loc="best")
- ๋ก๊ทธ ๋ณํ์ด ์๊ณก ์ ๋๋ฅผ ๊ต์ฅํ ๋ง์ด ๊ฐ์์ํด
3-2. ๋ฒ์ฃผํ(categorical) ๋ณ์
โ Sex
g = sns.barplot(x = "Sex",y = "Survived",data = train)
g = g.set_ylabel("Survival Probability")
train[["Sex","Survived"]].groupby('Sex').mean()
Survived | |
---|---|
Sex | |
female | 0.747573 |
male | 0.190559 |
-
๋จ์ฑ์ด ์ฌ์ฑ๋ณด๋ค ์์กด๋ฅ ์ด ํ์ ํ๊ฒ ๋ฎ์
- Sex๋ ์์กด ์ฌ๋ถ๋ฅผ ์์ธกํ๋ ๋ฐ ์ค์ํ ์ํฅ์ ํ ์ ์์
โ PClass
### ์๊ฐํ
g = sns.catplot(x = "Pclass",y = "Survived",data = train,
kind = "bar", palette = "muted")
g.despine(left = True)
g = g.set_ylabels("survival probability")
g = sns.catplot(x = "Pclass", y = "Survived", hue = "Sex",
data = train, kind = "bar", palette = "muted")
g.despine(left = True)
g = g.set_ylabels("survival probability")
-
3๊ฐ์ ํด๋์ค์ ๋ํด ๊ฐ๊ฐ์ ํด๋์ค์์ ์์กด๋ฅ ์ด ๋์ผํ์ง๋ x
-
PClass = 1์ธ ์น๊ฐ๋ค์ PClass = 2, 3์ธ ์น๊ฐ๋ค์ ๋นํด ์์กด๋ฅ ์ด ๋์
-
๋จ๋ ์๊ด x
-
โ Embarked
### ๊ฒฐ์ธก์น ํ์ธ
dataset["Embarked"].isnull().sum()
2
### ๊ฒฐ์ธก์น ์ฒ๋ฆฌ
# ๊ฐ์ฅ ๋ง์ 'S'๋ก ๋์ฒด
dataset["Embarked"] = dataset["Embarked"].fillna("S")
### ์๊ฐํ
g = sns.catplot(x = "Embarked", y = "Survived", data = train,
kind = "bar", palette = "muted")
g.despine(left = True)
g = g.set_ylabels("survival probability")
-
Cherbourg(C)์์ ์ค๋ ์น๊ฐ๋ค์ ์์กด๋ฅ ์ด ๋ ๋์ ๊ฒฝํฅ์ ๋ณด์
- ์ผ๋ฑ์ ์น๊ฐ์ ๋น์จ์ด C์์ ๋์๊น?
### PClass vs Embarked
g = sns.catplot(x = "Pclass", col = "Embarked", data = train,
kind = "count", palette = "muted")
g.despine(left = True)
g = g.set_ylabels("Count")
- ์ค์ ๋ก PClass = 3์ ์ฌ์ฐ์ํํด(S)๊ณผ ํธ์คํ์ด(Q)์์ ์ค๋ ์น๊ฐ๋ค์๊ฒ ๊ฐ์ฅ ๋น๋ฒํ ๋ฐ๋ฉด, ์ ฐ๋ฅด๋ถ๋ฅด(C)์ ์น๊ฐ๋ค์ ๋๋ถ๋ถ ์์กด์จ์ด ๊ฐ์ฅ ๋์ PClass = 1์
4. ๊ฒฐ์ธก์น ์ฒ๋ฆฌ
4-1. Age
-
Age column์๋ 256๊ฐ์ ๊ฒฐ์ธก๊ฐ์ด ํฌํจ๋์ด ์์
-
์์กด ๊ธฐํ๊ฐ ๋ ๋ง์ ํ์ ์ง๋จ(ex> ์ด๋ฆฐ์ด)์ด ์์
-
Age feature๋ฅผ ์ ์งํ๊ณ ๋๋ฝ๋ ๊ฐ์ ์ฒ๋ฆฌํ๋ ๊ฒ์ด ๋ฐ๋์ง
-
Age์ ์๊ด๊ด๊ณ๊ฐ ์๋ ๋ณ์๋ค(Sex, Parch, Pclass, SibSP) ๊ด์ฐฐ
-
g = sns.catplot(y = "Age",x = "Sex",data = dataset, kind = "box")
g = sns.catplot(y = "Age", x = "Sex",hue="Pclass", data=dataset,kind="box")
g = sns.catplot(y="Age",x="Parch", data=dataset,kind="box")
g = sns.catplot(y="Age",x="SibSp", data=dataset,kind="box")
-
์ฐ๋ น ๋ถํฌ๋ ๋จ์ฑ๊ณผ ์ฌ์ฑ ํ์ ๋ชจ์ง๋จ์์ ๋์ผํ ๊ฒ์ผ๋ก ๋ณด์
- ์ฑ๋ณ์ ์ฐ๋ น์ ์์ธกํ๋ ๋ฐ ์ ์ฉํ์ง x
-
PClass = 1์ ๋์ด๊ฐ ๋ง์ ์น๊ฐ๋ค์ด ๋ง์
- ๋ถ๋ชจ/์๋ ์๊ฐ ๋ง์์๋ก ๋์ด๊ฐ ๋ง๊ณ , ํ์ /๋ฐฐ์ฐ์๊ฐ ๋ง์์๋ก ๋์ด๊ฐ ์ด๋ ค์ง๋ ๊ฒฝํฅ์ด ์์
### Encode Sex (categorical) as numeric: male -> 0, female -> 1
sex_codes = {"male": 0, "female": 1}
dataset["Sex"] = dataset["Sex"].map(sex_codes)
### ์๊ด๊ณ์ heatmap
g = sns.heatmap(dataset[["Age","Sex","SibSp","Parch","Pclass"]].corr(),
cmap = "BrBG",annot = True)
-
Parch๋ฅผ ์ ์ธํ feature๋ค ๊ฐ์ ์๊ด์ฑ์ ์๊ฐ์ ์ผ๋ก ์ ์
-
๋์ด๋ Sex์๋ ์๊ด๊ด๊ณ๊ฐ x
- PClass, Parch, SibSp์๋ ์์ ์๊ด๊ด๊ณ๋ฅผ ๊ฐ์ง
-
Age - Parch ๊ทธ๋ํ์์ ๋์ด๋ ๋ถ๋ชจ/์๋ ์ ์์ ๋ฐ๋ผ ์ฆ๊ฐํจ
- ํ์ง๋ง, ์ผ๋ฐ์ ์ธ ์๊ด๊ด๊ณ๋ ์์ ์๊ด๊ด๊ณ
-
๋์ด ๊ฒฐ์ธก์น ์ฒ๋ฆฌ์ SibSp, Parch, PClass ํ์ฉ
- PClass, Parch ๋ฐ SibSp์ ๋ฐ๋ผ ์ ์ฌํ ํ์ median ๊ฐ์ผ๋ก ๋์ฒด
### Fill missing Age values
# Index labels of rows whose Age is missing (dataset has a fresh RangeIndex,
# so label and position coincide)
index_NaN_age = list(dataset["Age"][dataset["Age"].isnull()].index)
# Global fallback median — loop-invariant, so computed once outside the loop
age_med = dataset["Age"].median()
for i in index_NaN_age:
    # Median age of passengers with the same SibSp / Parch / Pclass profile
    age_pred = dataset["Age"][((dataset['SibSp'] == dataset.iloc[i]["SibSp"]) & (dataset['Parch'] == dataset.iloc[i]["Parch"]) & (dataset['Pclass'] == dataset.iloc[i]["Pclass"]))].median()
    # .loc assignment replaces the chained dataset['Age'].iloc[i] = ... form,
    # which raises SettingWithCopyWarning and does not write under pandas copy-on-write
    if not np.isnan(age_pred):
        dataset.loc[i, 'Age'] = age_pred
    else:
        dataset.loc[i, 'Age'] = age_med
g = sns.catplot(x = "Survived", y = "Age",data = train, kind = "box")
g = sns.catplot(x = "Survived", y = "Age",data = train, kind = "violin")
-
์์กดํ ๊ทธ๋ฃน์ ์ฐ๋ น์ ์ค๊ฐ๊ฐ๊ณผ ์์กดํ์ง ๋ชปํ ๊ทธ๋ฃน์ ์ฐ๋ น์ ์ค๊ฐ๊ฐ ์ฌ์ด์๋ ์๋นํ ์ฐจ์ด๊ฐ ์กด์ฌ
-
violin plot์ ํตํด ์ฌ์ ํ ๋งค์ฐ ์ด๋ฆฐ ์น๊ฐ๋ค์ ์์กด๋ฅ ์ด ๋ ๋์์ ํ์ธํ ์ ์์
5. ํน์ฑ ๊ณตํ(Feature Engineering)
5-1. Name/ Title
dataset["Name"].head()
0 Braund, Mr. Owen Harris 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 2 Heikkinen, Miss. Laina 3 Futrelle, Mrs. Jacques Heath (Lily May Peel) 4 Allen, Mr. William Henry Name: Name, dtype: object
-
Name feature์๋ ์น๊ฐ์ ์ ๋ชฉ์ ๋ํ ์ ๋ณด๊ฐ ํฌํจ๋์ด ์์
-
๋ํผํ๋ ๋์ ๊ตฌ๋ณ๋๋ ํธ์นญ์ ๊ฐ์ง ์ผ๋ถ ์น๊ฐ์ด ์ ํธ๋ ์ ์๊ธฐ ๋๋ฌธ์, ์ด๋ค์ ์๋ก์ด ๋ณ์๋ก ํ์ฉ
# Extract the honorific between the comma and the following period,
# e.g. "Braund, Mr. Owen Harris" -> "Mr"
dataset["Title"] = dataset["Name"].map(
    lambda full_name: full_name.split(",")[1].split(".")[0].strip())
dataset["Title"].head()
0 Mr 1 Mrs 2 Miss 3 Mrs 4 Mr Name: Title, dtype: object
### ์๊ฐํ
g = sns.countplot(x="Title",data=dataset)
g = plt.setp(g.get_xticklabels(), rotation = 45)
-
17๊ฐ์ title์ด ์กด์ฌ
- ๋๋ถ๋ถ ๊ฑฐ์ ์๊ณ ์ฝ 4๊ฐ๋ก ๊ฑฐ์ ๋ค ๊ตฌ๋ถ ๊ฐ๋ฅ
### Title (categorical) -> numerical
# Collapse the 17 raw titles into 4 buckets; rare/noble titles become 'Rare'
dataset["Title"] = dataset["Title"].replace(['Lady', 'the Countess','Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
# Codes: 0 = Master, 1 = Miss/Ms/Mme/Mlle/Mrs, 2 = Mr, 3 = Rare
dataset["Title"] = dataset["Title"].map({"Master":0, "Miss":1, "Ms" : 1 , "Mme":1, "Mlle":1, "Mrs":1, "Mr":2, "Rare":3})
dataset["Title"] = dataset["Title"].astype(int)
### Visualisation
# seaborn >= 0.12 no longer accepts a Series as the first positional argument
# (the cause of the error noted below); pass it explicitly as x= instead.
g = sns.countplot(x = dataset["Title"])
g = g.set_xticklabels(["Master","Miss/Ms/Mme/Mlle/Mrs","Mr","Rare"])
- ์๋ฌ ๋ฐ์
g = sns.catplot(x="Title",y="Survived",data=dataset,kind="bar")
g = g.set_xticklabels(["Master","Miss-Mrs","Mr","Rare"])
g = g.set_ylabels("survival probability")
-
์ฌ์์ ์์ด๋ค์ ์์กด๋ฅ ์ด ๋ ๋์์ ํ์ธํ ์ ์๋ค.
-
ํน์ดํ title์ ๊ฐ์ง ์น๊ฐ๋ค์ ์์กด๋ฅ ์ด ๋ ๋๋ค.
### ๊ธฐ์กด์ ์ด๋ฆ ๋ณ์๋ฅผ drop
dataset.drop(labels = ["Name"], axis = 1, inplace = True)
5-2. Family Size
- SibSp, Parch์ ์ฌ๊ธฐ์ 1(์๊ธฐ ์์ )์ ๋ํด FSize๋ผ๋ ๋ณ์ ์์ฑ
dataset["Fsize"] = dataset["SibSp"] + dataset["Parch"] + 1
### ์๊ฐํ
g = sns.pointplot(x = "Fsize",y = "Survived",data = dataset, )
g = g.set_ylabel("Survival Probability")
-
๊ฐ์กฑ ๊ท๋ชจ๊ฐ ์ค์ํ ์ญํ ์ ํ๋ ๊ฒ์ฒ๋ผ ๋ณด์
- ๋๊ฐ์กฑ์ ์์กด ํ๋ฅ ์ ์ต์ ์
-
์ถ๊ฐ์ ์ผ๋ก 4๊ฐ์ ๊ฐ์กฑ ํฌ๊ธฐ category๋ฅผ ์์ฑํ์
### ์๋ก์ด ์นดํ๊ณ ๋ฆฌ ์์ฑ
# Boolean masks cast to 0/1 flags for the four family-size buckets
dataset['Single'] = (dataset['Fsize'] == 1).astype(int)
dataset['SmallF'] = (dataset['Fsize'] == 2).astype(int)
dataset['MedF'] = dataset['Fsize'].between(3, 4).astype(int)
dataset['LargeF'] = (dataset['Fsize'] >= 5).astype(int)
### ์๊ฐํ
g = sns.catplot(x="Single",y="Survived",data=dataset,kind="bar")
g = g.set_ylabels("Survival Probability")
g = sns.catplot(x="SmallF",y="Survived",data=dataset,kind="bar")
g = g.set_ylabels("Survival Probability")
g = sns.catplot(x="MedF",y="Survived",data=dataset,kind="bar")
g = g.set_ylabels("Survival Probability")
g = sns.catplot(x="LargeF",y="Survived",data=dataset,kind="bar")
g = g.set_ylabels("Survival Probability")
- ์๊ฐ์กฑ์ด๋ ์ค๊ฐ ์ ๋ ํฌ๊ธฐ์ ๊ฐ์กฑ๋ค์ด ํผ์ ํ์นํ ์น๊ฐ์ด๋ ๋๊ฐ์กฑ๋ค๋ณด๋ค ์์กด๋ฅ ์ด ๋์
### ์์นํ ๋ณ์๋ก ๋ณํ
# One-hot Encoding ์งํ
dataset = pd.get_dummies(dataset, columns = ["Title"])
dataset = pd.get_dummies(dataset, columns = ["Embarked"], prefix="Em")
dataset.head()
PassengerId | Survived | Pclass | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | ... | SmallF | MedF | LargeF | Title_0 | Title_1 | Title_2 | Title_3 | Em_C | Em_Q | Em_S | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.0 | 3 | 0 | 22.0 | 1 | 0 | A/5 21171 | 1.981001 | NaN | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
1 | 2 | 1.0 | 1 | 1 | 38.0 | 1 | 0 | PC 17599 | 4.266662 | C85 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
2 | 3 | 1.0 | 3 | 1 | 26.0 | 0 | 0 | STON/O2. 3101282 | 2.070022 | NaN | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
3 | 4 | 1.0 | 1 | 1 | 35.0 | 1 | 0 | 113803 | 3.972177 | C123 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
4 | 5 | 0.0 | 3 | 0 | 35.0 | 0 | 0 | 373450 | 2.085672 | NaN | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
5 rows ร 22 columns
- ํ ์์ ์์ 22๊ฐ์ feature๋ฅผ ๊ฐ์ง
5-3. Cabin
dataset["Cabin"].head()
0 NaN 1 C85 2 NaN 3 C123 4 NaN Name: Cabin, dtype: object
dataset["Cabin"].describe()
count 292 unique 186 top G6 freq 5 Name: Cabin, dtype: object
dataset["Cabin"].isnull().sum()
1007
-
292๊ฐ์ ๊ฐ๊ณผ 1007๊ฐ์ ๊ฒฐ์ธก์น๊ฐ ์กด์ฌ
- ๊ฐ์ค์ด ์๋ ์น๊ฐ์ ๊ฐ์ค ๋ฒํธ ๋์ ๋๋ฝ๋ ๊ฐ์ด ํ์๋๋ ๊ฒ์ผ๋ก ์๊ฐ
dataset["Cabin"][dataset["Cabin"].notnull()].head()
1 C85 3 C123 6 E46 10 G6 11 C103 Name: Cabin, dtype: object
### Keep only the first letter of Cabin; missing cabins become 'X'
dataset["Cabin"] = dataset["Cabin"].map(lambda cab: 'X' if pd.isnull(cab) else cab[0])
-
Cabin์ ์ฒซ ๊ธ์๋ Deck(๊ฐํ)์ ๋ํ๋
- Titanicํธ์์ ์น๊ฐ์ด ์์ ๋งํ ์์น๋ฅผ ํฌํจ
g = sns.countplot(dataset["Cabin"],
order=['A','B','C','D','E','F','G','T','X'])
g = sns.catplot(y="Survived",x="Cabin",data=dataset,kind="bar",
order=['A','B','C','D','E','F','G','T','X'])
g = g.set_ylabels("Survival Probability")
-
๊ฐ์ค์ด ์๋ ์น๊ฐ์ ์๊ฐ ์ ์
- ์์กด ํ๋ฅ ์ ์ค์ํ ํ์ค ํธ์ฐจ๋ฅผ ๊ฐ์ง๋ฉฐ ๋ค๋ฅธ deck์ ์๋ ์น๊ฐ์ ์์กด ํ๋ฅ ์ ๊ตฌ๋ณํ ์ ์์
-
ํ์ง๋ง ์ผ๋ฐ์ ์ผ๋ก ๊ฐ์ค์ ๊ฐ์ง ์น๊ฐ์ด ๊ฐ์ค์ด ์๋ ์น๊ฐ๋ณด๋ค ์์กดํ ์ ์๋ ๊ธฐํ๊ฐ ๋ ๋ง๋ค๋ ๊ฒ์ ์ ์ ์์
- ํนํ ๊ฐ์ค B, C, D, E, F์ ํด๋น
### ๋ฒ์ฃผํ ๋ณ์ -> ์์นํ ๋ณ์
dataset = pd.get_dummies(dataset, columns = ["Cabin"],prefix="Cabin")
5-4. Ticket
dataset["Ticket"].head()
0 A/5 21171 1 PC 17599 2 STON/O2. 3101282 3 113803 4 373450 Name: Ticket, dtype: object
-
๊ฐ์ ์ ๋์ฌ๋ฅผ ๊ณต์ ํ๋ ํฐ์ผ์ด ํจ๊ป ๋ฐฐ์น๋ ๊ฐ์ค์ ์์ฝ๋ ์ ์๋ค๋ ๊ฒ์ ์๋ฏธํ๋ ๊ฒ์ผ ์ ์์ -> ์ค์ ์ ์ค ๋ฐฐ์น์ ์ฐ๊ฒฐ๋ ์ ์์
-
๋์ผํ ์ ๋์ฌ๋ฅผ ๊ฐ์ง ํฐ์ผ์ PClass์ Survived๊ฐ ์ ์ฌํ ์ ์์
- ์ ๋์ด๋ง์ ์ถ์ถ
### Extract only the ticket prefix
# Pure-number tickets get the placeholder 'X'
ticket_prefixes = [
    raw.replace(".", "").replace("/", "").strip().split(' ')[0]
    if not raw.isdigit() else "X"
    for raw in list(dataset.Ticket)
]
dataset["Ticket"] = ticket_prefixes
dataset["Ticket"].head()
0 A5 1 PC 2 STONO2 3 X 4 X Name: Ticket, dtype: object
### One-hot Encoding
dataset = pd.get_dummies(dataset, columns = ["Ticket"], prefix = "T")
### PClass์ ๋ํ ๋ฒ์ฃผํ ๋ณ์ ์์ฑ
dataset["Pclass"] = dataset["Pclass"].astype("category")
dataset = pd.get_dummies(dataset, columns = ["Pclass"],prefix = "Pc")
### ๋ถํ์ํ ๋ณ์ ๋ ๋ฆฌ๊ธฐ
dataset.drop(labels = ["PassengerId"], axis = 1, inplace = True)
dataset.head()
Survived | Sex | Age | SibSp | Parch | Fare | Fsize | Single | SmallF | MedF | ... | T_STONO | T_STONO2 | T_STONOQ | T_SWPP | T_WC | T_WEP | T_X | Pc_1 | Pc_2 | Pc_3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0 | 22.0 | 1 | 0 | 1.981001 | 2 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1 | 1.0 | 1 | 38.0 | 1 | 0 | 4.266662 | 2 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | 1.0 | 1 | 26.0 | 0 | 0 | 2.070022 | 1 | 1 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
3 | 1.0 | 1 | 35.0 | 1 | 0 | 3.972177 | 2 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 |
4 | 0.0 | 0 | 35.0 | 0 | 0 | 2.085672 | 1 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
5 rows ร 67 columns
6. ๋ชจ๋ธ๋ง(Modeling)
### Split the combined dataset back into train / test
# .copy() gives each split its own data; without it both are views of
# `dataset`, and the in-place drop / column assignment below raise
# SettingWithCopyWarning (and do not write back under pandas copy-on-write)
train = dataset[:train_len].copy()
test = dataset[train_len:].copy()
test.drop(labels = ["Survived"], axis = 1, inplace = True)
### feature(X) vs label(y)
# Survived came back as float after the concat (NaN in the test part)
train["Survived"] = train["Survived"].astype(int)
Y_train = train["Survived"]
X_train = train.drop(labels = ["Survived"], axis = 1)
6-1. ๋จ์ํ ๋ชจ๋ธ๋ง
a. ๊ต์ฐจ ๊ฒ์ฆ(Cross Validate) ๋ชจ๋ธ
-
10๊ฐ์ ์ธ๊ธฐ ์๋ ๋ถ๋ฅ๊ธฐ๋ฅผ ๋น๊ต
-
stratified k-fold ๊ต์ฐจ ๊ฒ์ฆ ๋ฐฉ์ -> ๊ฐ ๋ถ๋ฅ๊ธฐ์ ํ๊ท ์ ํ๋๋ฅผ ํ๊ฐ
-
ํ์ฉ ๋ชจ๋ธ
-
SVC
-
Decision Tree
-
AdaBoost
-
Random Forest
-
Extra Trees
-
Gradient Boosting
-
Multiple layer perceprton (neural network)
-
KNN
-
Logistic regression
-
Linear Discriminant Analysis
-
kfold = StratifiedKFold(n_splits = 10)
### modeling
random_state = 2
# The ten candidate classifiers, all seeded for reproducibility
classifiers = [
    SVC(random_state=random_state),
    DecisionTreeClassifier(random_state=random_state),
    AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state),
                       random_state=random_state, learning_rate=0.1),
    RandomForestClassifier(random_state=random_state),
    ExtraTreesClassifier(random_state=random_state),
    GradientBoostingClassifier(random_state=random_state),
    MLPClassifier(random_state=random_state),
    KNeighborsClassifier(),
    LogisticRegression(random_state=random_state),
    LinearDiscriminantAnalysis(),
]
# Cross-validation: per-fold accuracy scores for every classifier
cv_results = [cross_val_score(clf, X_train, y = Y_train,
                              scoring = "accuracy", cv = kfold, n_jobs=4)
              for clf in classifiers]
# Evaluation: summarise each classifier's CV accuracy (mean and spread)
cv_means = [scores.mean() for scores in cv_results]
cv_std = [scores.std() for scores in cv_results]
cv_res = pd.DataFrame({"CrossValMeans":cv_means,"CrossValerrors": cv_std,
                       "Algorithm":["SVC","DecisionTree","AdaBoost","RandomForest",
                                    "ExtraTrees","GradientBoosting",
                                    "MultipleLayerPerceptron","KNeighboors",
                                    "LogisticRegression","LinearDiscriminantAnalysis"]})
### ์๊ฐํ
g = sns.barplot(x = "CrossValMeans",y = "Algorithm",data = cv_res,
palette = "Set3",orient = "h",**{'xerr':cv_std})
g.set_xlabel("Mean Accuracy")
g = g.set_title("Cross validation scores")
- ์์๋ธ ๋ชจ๋ธ๋ง์ ์ํด SVC, AdaBoost, RandomForest , ExtraTrees, GradientBoosting ๋ชจ๋ธ ์ ํ
b. ์ต์ ๋ชจ๋ธ์ ์ํ ํ์ดํผ ํ๋ผ๋ฏธํฐ ํ๋(hyper parameter tuning)
- Grid Search ์ต์ ํ๋ฅผ ์ํ
### Meta modeling
# Adaboost: grid-search over the base tree's split settings and the boosting knobs
# NOTE(review): the `base_estimator__*` prefix was renamed to `estimator__*`
# in scikit-learn 1.2 and removed in 1.4 — confirm against the installed version.
DTC = DecisionTreeClassifier()
adaDTC = AdaBoostClassifier(DTC, random_state=7)
ada_param_grid = {"base_estimator__criterion" : ["gini", "entropy"],
"base_estimator__splitter" :   ["best", "random"],
"algorithm" : ["SAMME","SAMME.R"],
"n_estimators" :[1,2],
"learning_rate":  [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3,1.5]}
gsadaDTC = GridSearchCV(adaDTC,param_grid = ada_param_grid, cv=kfold,
scoring="accuracy", n_jobs= 4, verbose = 1)
gsadaDTC.fit(X_train,Y_train)
# Keep the refit best estimator for the later voting ensemble
ada_best = gsadaDTC.best_estimator_
gsadaDTC.best_score_
0.8275536261491316
# ExtraTrees: 54-candidate grid over feature/split settings (bootstrap disabled)
ExtC = ExtraTreesClassifier()
ex_param_grid = {"max_depth": [None],
"max_features": [1, 3, 10],
"min_samples_split": [2, 3, 10],
"min_samples_leaf": [1, 3, 10],
"bootstrap": [False],
"n_estimators" :[100,300],
"criterion": ["gini"]}
gsExtC = GridSearchCV(ExtC,param_grid = ex_param_grid, cv=kfold,
scoring="accuracy", n_jobs= 4, verbose = 1)
gsExtC.fit(X_train,Y_train)
# Keep the refit best estimator for the later voting ensemble
ExtC_best = gsExtC.best_estimator_
# Best score
gsExtC.best_score_
Fitting 10 folds for each of 54 candidates, totalling 540 fits
0.8286133810010214
# RFC Parameters tunning — same 54-candidate grid as ExtraTrees for a fair comparison
RFC = RandomForestClassifier()
rf_param_grid = {"max_depth": [None],
"max_features": [1, 3, 10],
"min_samples_split": [2, 3, 10],
"min_samples_leaf": [1, 3, 10],
"bootstrap": [False],
"n_estimators" :[100,300],
"criterion": ["gini"]}
gsRFC = GridSearchCV(RFC,param_grid = rf_param_grid, cv=kfold,
scoring="accuracy", n_jobs= 4, verbose = 1)
gsRFC.fit(X_train,Y_train)
# Keep the refit best estimator for the later voting ensemble
RFC_best = gsRFC.best_estimator_
# Best score
gsRFC.best_score_
Fitting 10 folds for each of 54 candidates, totalling 540 fits
0.8320224719101124
# Gradient boosting tuning
GBC = GradientBoostingClassifier()
# 'deviance' was deprecated in scikit-learn 1.1 (the FutureWarning in this
# notebook's output) and removed in 1.3; 'log_loss' is the equivalent new name.
gb_param_grid = {'loss' : ["log_loss"],
                 'n_estimators' : [100,200,300],
                 'learning_rate': [0.1, 0.05, 0.01],
                 'max_depth': [4, 8],
                 'min_samples_leaf': [100,150],
                 'max_features': [0.3, 0.1]
                 }
gsGBC = GridSearchCV(GBC,param_grid = gb_param_grid, cv=kfold,
                     scoring="accuracy", n_jobs= 4, verbose = 1)
gsGBC.fit(X_train,Y_train)
# Keep the refit best estimator for the later voting ensemble
GBC_best = gsGBC.best_estimator_
# Best score
gsGBC.best_score_
Fitting 10 folds for each of 72 candidates, totalling 720 fits
/usr/local/lib/python3.9/dist-packages/sklearn/ensemble/_gb.py:280: FutureWarning: The loss parameter name 'deviance' was deprecated in v1.1 and will be removed in version 1.3. Use the new parameter name 'log_loss' which is equivalent. warnings.warn(
0.8376915219611849
# SVC classifier
# probability=True is required so the soft-voting ensemble can call predict_proba
SVMC = SVC(probability=True)
svc_param_grid = {'kernel': ['rbf'],
'gamma': [ 0.001, 0.01, 0.1, 1],
'C': [1, 10, 50, 100,200,300, 1000]}
gsSVMC = GridSearchCV(SVMC,param_grid = svc_param_grid, cv=kfold,
scoring="accuracy", n_jobs= 4, verbose = 1)
gsSVMC.fit(X_train,Y_train)
# Keep the refit best estimator for the later voting ensemble
SVMC_best = gsSVMC.best_estimator_
# Best score
gsSVMC.best_score_
Fitting 10 folds for each of 28 candidates, totalling 280 fits
0.8331332992849847
c. ํ์ต ๊ณก์ (learning curve) ์๊ฐํ
-
training set์์์ overfitting
-
์ ํ๋์ training size๊ฐ ๋ฏธ์น๋ ์ํฅ ํ์
### ์๊ฐํ๋ฅผ ์ํ ํจ์ ์ ์
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Generate a simple plot of the test and training learning curve."""
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    sizes, fit_scores, val_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    fit_mean = np.mean(fit_scores, axis=1)
    fit_std = np.std(fit_scores, axis=1)
    val_mean = np.mean(val_scores, axis=1)
    val_std = np.std(val_scores, axis=1)
    plt.grid()
    # One-standard-deviation bands around each mean curve
    plt.fill_between(sizes, fit_mean - fit_std, fit_mean + fit_std,
                     alpha=0.1, color="r")
    plt.fill_between(sizes, val_mean - val_std, val_mean + val_std,
                     alpha=0.1, color="g")
    plt.plot(sizes, fit_mean, 'o-', color="r", label="Training score")
    plt.plot(sizes, val_mean, 'o-', color="g", label="Cross-validation score")
    plt.legend(loc="best")
    return plt
### Visualisation of each tuned model's learning curve
# (fixed typo in the first title: "mearning" -> "learning")
g = plot_learning_curve(gsRFC.best_estimator_,"RF learning curves",
                        X_train,Y_train,cv=kfold)
g = plot_learning_curve(gsExtC.best_estimator_,"ExtraTrees learning curves",
                        X_train,Y_train,cv=kfold)
g = plot_learning_curve(gsSVMC.best_estimator_,"SVC learning curves",
                        X_train,Y_train,cv=kfold)
g = plot_learning_curve(gsadaDTC.best_estimator_,"AdaBoost learning curves",
                        X_train,Y_train,cv=kfold)
g = plot_learning_curve(gsGBC.best_estimator_,"GradientBoosting learning curves",
                        X_train,Y_train,cv=kfold)
-
GradientBoosting ๋ฐ Adaboost ๋ถ๋ฅ๊ธฐ๋ ํ๋ จ ์ธํธ์ overfitting๋ ๊ฒฝํฅ์ด ์์
- ์ฆ๊ฐํ๋ ๊ต์ฐจ ๊ฒ์ฆ ๊ณก์ ์ ๋ฐ๋ผ GradientBoost์ Adboost๋ ๋ ๋ง์ ํ๋ จ ์์ ๋ฅผ ํตํด ๋ ๋์ ์ฑ๋ฅ์ ๋ฐํํ ์ ์์
-
SVC์ ExtraTree ๋ถ๋ฅ๊ธฐ๋ ํ๋ จ๊ณผ ๊ต์ฐจ ๊ฒ์ฆ ๊ณก์ ์ด ์๋ก ๋น์ทํจ
- ์์ธก์ ๋ ์ ์ผ๋ฐํํ๋ ๊ฒ์ผ๋ก ๋ณด์
d. ํผ์ณ ์ค์๋ & ํธ๋ฆฌ ๊ธฐ๋ฐ ๋ถ๋ฅ
- ์น๊ฐ ์์กด ์์ธก์ ์ํ ๊ฐ์ฅ ์ ์ฉํ ํน์ง ํ์
nrows = ncols = 2
fig, axes = plt.subplots(nrows = nrows, ncols = ncols, sharex="all", figsize=(15,15))
names_classifiers = [("AdaBoosting", ada_best),("ExtraTrees",ExtC_best),
                     ("RandomForest",RFC_best),("GradientBoosting",GBC_best)]
# One subplot per tuned tree-based model, filled row by row
for nclassifier, (name, classifier) in enumerate(names_classifiers):
    row, col = divmod(nclassifier, ncols)
    # Feature positions sorted by decreasing importance, top 40 only
    indices = np.argsort(classifier.feature_importances_)[::-1][:40]
    g = sns.barplot(y = X_train.columns[indices][:40],
                    x = classifier.feature_importances_[indices][:40],
                    orient='h', ax=axes[row][col])
    g.set_xlabel("Relative importance",fontsize=12)
    g.set_ylabel("Features",fontsize=12)
    g.tick_params(labelsize=9)
    g.set_title(name + " feature importance")
-
4๊ฐ์ ํธ๋ฆฌ ๊ธฐ๋ฐ ๋ถ๋ฅ๊ธฐ(Adaboost, ExtraTree, RandomForest ๋ฐ GradientBoost)์ ๋ํ feature ์ค์๋๋ฅผ ํ์
-
๋ค ๊ฐ์ ๋ถ๋ฅ๊ธฐ๊ฐ ์๋์ ์ค์์ฑ์ ๋ฐ๋ผ ๋ค๋ฅธ ์ต์์ feature ๊ฐ์ง๊ณ ์์
- ์์ธก์ด ๊ฐ์ feature์ ๊ทผ๊ฑฐํ์ง ์๋๋ค๋ ๊ฒ์ ์๋ฏธ
-
Title_2๋ Mrs/Mlle/Mme/Miss/Ms ๋ฒ์ฃผ๊ฐ Sex์ ๋์ ์๊ด๊ด๊ณ๊ฐ ์์
-
Discussion:
-
Pc_1, Pc_2, Pc_3 ๋ฐ ์๊ธ์ ์น๊ฐ์ ์ผ๋ฐ์ ์ธ ์ฌํ์ ์ง์๋ฅผ ๋ํ๋
-
Sex์ Title_2(Mrs/Mlle/Mme/Miss/Ms) ๋ฐ Title_3(Mr)์ ์ฑ๋ณ์ ์๋ฏธ
-
์ฐ๋ น๊ณผ Title_1(Master)๋ ์น๊ฐ์ ์ฐ๋ น
-
Fsize, LargeF, MedF, Single์ ์น๊ฐ์ ๊ฐ์กฑ ์ ํฌ๊ธฐ๋ฅผ ์๋ฏธ
-
-
์์กด ์์ธก์ ๋ณดํธ ์์ ์์น๋ณด๋ค๋ ์น๊ฐ์ ๋์ด, ์ฑ๋ณ, ๊ฐ์กฑ ๊ท๋ชจ, ์ฌํ์ ์ง์์ ๋ ๊ด๋ จ์ด ์์
### Derive predictions from each tuned model on the test set
test_Survived_RFC = pd.Series(RFC_best.predict(test), name="RFC")
test_Survived_ExtC = pd.Series(ExtC_best.predict(test), name="ExtC")
test_Survived_SVMC = pd.Series(SVMC_best.predict(test), name="SVC")
test_Survived_AdaC = pd.Series(ada_best.predict(test), name="Ada")
test_Survived_GBC = pd.Series(GBC_best.predict(test), name="GBC")
# Concatenate all model predictions column-wise
ensemble_results = pd.concat([test_Survived_RFC,test_Survived_ExtC,test_Survived_AdaC,
test_Survived_GBC, test_Survived_SVMC], axis = 1)
# Visualise pairwise agreement (correlation) between the five classifiers
g = sns.heatmap(ensemble_results.corr(),annot=True)
-
Adboost๊ฐ ๋ค๋ฅธ ๋ถ๋ฅ๊ธฐ์ ๋น๊ต๋๋ ๊ฒฝ์ฐ๋ฅผ ์ ์ธํ๊ณ ๋ 5๊ฐ ๋ถ๋ฅ๊ธฐ์ ๋ํด ์๋นํ ์ ์ฌํ ๊ฒ์ผ๋ก ๋ณด์
-
5๊ฐ์ ๋ถ๋ฅ๊ธฐ๋ ๊ฑฐ์ ๋์ผํ ์์ธก์ ์ ๊ณตํ์ง๋ง ์ฝ๊ฐ์ ์ฐจ์ด๊ฐ ์์
- ensembling voting ํ์ฉํ๊ธฐ์ ์ถฉ๋ถ
6-2. ๋ชจ๋ธ ์์๋ธ(Ensemble)
###a. ๋ชจ๋ธ ๊ฒฐํฉํ๊ธฐ
-
VotingClassifier๋ฅผ ์ ํ
-
soft
์ง์ : ๊ฐ vote์์์ ํ๋ฅ ๊ณ ๋ ค
# Soft voting: average the predicted class probabilities of the five tuned models
votingC = VotingClassifier(estimators=[('rfc', RFC_best), ('extc', ExtC_best),
('svc', SVMC_best), ('adac',ada_best),
('gbc',GBC_best)], voting='soft', n_jobs=4)
votingC = votingC.fit(X_train, Y_train)
6-3. ์์ธก(Prediction)
a. ์์ธก & ๊ฒฐ๊ณผ ๋์ถ
# Predict with the voting ensemble and write the Kaggle submission file
test_Survived = pd.Series(votingC.predict(test), name="Survived")
results = pd.concat([IDtest,test_Survived],axis=1)
results.to_csv("ensemble_python_voting.csv",index=False)