18 minute read

import numpy as np
import pandas as pd
import random as rd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the CSV files needed for the machine-learning task
# (paths point at the Kaggle Titanic competition input directory)

train_set = pd.read_csv("/kaggle/input/titanic/train.csv")
test_set = pd.read_csv("/kaggle/input/titanic/test.csv")
gender_submission = pd.read_csv("/kaggle/input/titanic/gender_submission.csv")
# Show the first 5 samples of the training DataFrame

train_set.head(5)
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

데이터셋 기본 정보를 확인한 결과, Age 특성(177개)과 Cabin 특성(687개)에 null 값이 존재하고, Embarked 특성에도 2개의 null 값이 존재한다.

# Inspect basic dataset information (column dtypes and non-null counts)

train_set.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB

Age 특성은 수치형 특성이고 결측값이 전체 891개 중 177개로 비교적 적으므로, 결측값을 평균값으로 대체한다.

# Replace missing Age values with the column mean.
# The filled column is assigned back rather than calling
# fillna(..., inplace=True) on the train_set["Age"] selection: that chained
# in-place form is deprecated in pandas 2.x and does not modify train_set
# when Copy-on-Write is enabled.
train_set["Age"] = train_set["Age"].fillna(train_set["Age"].mean())

데이터 탐색 및 시각화

# Copy the training set so exploration does not modify train_set

titanic = train_set.copy()
# Survival rate by passenger class (Pclass), used here as a proxy for wealth
# Author's observation: the higher the class, the higher the survival rate

sns.barplot(x = "Pclass", y = "Survived", data = titanic)
<AxesSubplot:xlabel='Pclass', ylabel='Survived'>

# After the per-class survival rate, additionally split the bars by sex
# Author's observation: women have a higher survival rate than men

sns.barplot(x = "Pclass", y = "Survived", hue = "Sex", data = titanic)
<AxesSubplot:xlabel='Pclass', ylabel='Survived'>

# Survival rate by port of embarkation
# Author's observation: passengers who embarked at C have the highest survival rate

sns.barplot(x = "Embarked", y = "Survived", data = titanic)
<AxesSubplot:xlabel='Embarked', ylabel='Survived'>

# Print the oldest and the youngest passenger age

print(max(titanic["Age"]))
print(min(titanic["Age"]))
80.0
0.42
# Age takes many distinct values, so first bucket it into coarse groups

# Inclusive upper bound of each age group, in ascending order.
# Ages above the last bound are labelled "Senior".
_AGE_GROUPS = (
    (6, "Baby"),
    (13, "Kids"),
    (19, "Teen"),
    (38, "Young Adult"),
    (60, "Middle-aged"),
)


def _age_group(age):
    """Return the age-group label for a single age, or None if it is missing."""
    if pd.isna(age):
        return None
    for upper_bound, label in _AGE_GROUPS:
        if age <= upper_bound:
            return label
    # Everything above 60 is "Senior"; there is no upper cut-off, so ages
    # beyond 80 are still labelled instead of being dropped.
    return "Senior"


def change_age(age):
    """Map a numeric age, or a Series of ages, to coarse age-group labels.

    Groups (upper bounds inclusive): Baby <= 6, Kids <= 13, Teen <= 19,
    Young Adult <= 38, Middle-aged <= 60, Senior above 60.

    Parameters
    ----------
    age : pandas.Series or scalar number
        A single age or a Series of ages.

    Returns
    -------
    list or str or None
        For a Series, a list with exactly one entry per input element, in
        order, so it can be assigned straight back as a column. For a scalar,
        the label string. Missing ages (NaN/None) map to None.
    """
    if isinstance(age, pd.Series):
        # One label per element keeps the result aligned with the Series;
        # skipping unmatched values would shorten the list.
        return [_age_group(a) for a in age]
    return _age_group(age)
    
# Replace the numeric Age column of the exploration copy with age-group labels,
# then display the resulting frame (train_set itself is not modified here)
titanic["Age"] = change_age(titanic["Age"])
titanic
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male Young Adult 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female Young Adult 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female Young Adult 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female Young Adult 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male Young Adult 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male Young Adult 0 0 211536 13.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female Teen 0 0 112053 30.0000 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female Young Adult 1 2 W./C. 6607 23.4500 NaN S
889 890 1 1 Behr, Mr. Karl Howell male Young Adult 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male Young Adult 0 0 370376 7.7500 NaN Q

891 rows × 12 columns

# Author's observation: in general, younger passengers have a higher survival rate

sns.barplot(x = "Age", y = "Survived", data = titanic)
<AxesSubplot:xlabel='Age', ylabel='Survived'>

데이터 전처리

현재 학습할 데이터의 특성 중 Embarked와 Cabin에 결측값이 존재한다. (Cabin은 뒤에서 특성 자체를 삭제한다.)

Age 특성에도 결측값이 존재했으나, 미리 앞에서 처리하였다.

# Count the missing values remaining in each column
train_set.isnull().sum()
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Embarked의 결측값에 해당하는 샘플을 삭제하여 결측값을 제거한다.

# Drop, in place, the rows whose Embarked value is missing
train_set.dropna(subset = ["Embarked"], inplace = True)

이제 학습에 사용할 특성을 정리한다. (아래 코드에서는 Pclass, Sex, Age, Embarked 외에 PassengerId, SibSp, Parch, Fare도 입력 특성으로 남는다.)

해당 특성들 중 Sex, Embarked 특성의 값들은 object, 즉 문자열로 되어있는 데이터이므로 이를 처리한다.

from sklearn.preprocessing import OneHotEncoder
# One-hot encode the Sex feature

sex_encoder = OneHotEncoder()
train_set_sex = train_set[["Sex"]]
sex_1hot = sex_encoder.fit_transform(train_set_sex)
# One-hot encode the Embarked feature

embarked_encoder = OneHotEncoder()
train_set_embarked = train_set[["Embarked"]]
embarked_1hot = embarked_encoder.fit_transform(train_set_embarked)
# Wrap the encoded matrix in a DataFrame that reuses the source rows' index
# and takes its column names from the fitted encoder
embarked_output = pd.DataFrame(
    data=embarked_1hot.toarray(),
    index=train_set_embarked.index,
    columns=embarked_encoder.get_feature_names_out(),
)
embarked_output
Embarked_C Embarked_Q Embarked_S
0 0.0 0.0 1.0
1 1.0 0.0 0.0
2 0.0 0.0 1.0
3 0.0 0.0 1.0
4 0.0 0.0 1.0
... ... ... ...
886 0.0 0.0 1.0
887 0.0 0.0 1.0
888 0.0 0.0 1.0
889 1.0 0.0 0.0
890 0.0 1.0 0.0

889 rows × 3 columns

# Wrap the encoded Sex matrix in a DataFrame that reuses the source rows' index
# and takes its column names from the fitted encoder
sex_output = pd.DataFrame(sex_1hot.toarray(),
                               columns = sex_encoder.get_feature_names_out(),
                               index = train_set_sex.index)
sex_output
Sex_female Sex_male
0 0.0 1.0
1 1.0 0.0
2 1.0 0.0
3 1.0 0.0
4 0.0 1.0
... ... ...
886 0.0 1.0
887 1.0 0.0
888 1.0 0.0
889 0.0 1.0
890 0.0 1.0

889 rows × 2 columns

# Attach the one-hot encoded Embarked and Sex columns, then drop the original string columns

train_set = pd.concat([train_set, embarked_output, sex_output], axis="columns")
train_set = train_set.drop(columns=["Embarked", "Sex"])
# Likewise drop the features that will not be used:
# Cabin (mostly missing values) together with Name and Ticket

train_set = train_set.drop(columns=["Name", "Ticket", "Cabin"])

최종적으로 훈련에 사용될 데이터셋이다.

# Display the preprocessed training frame
train_set
PassengerId Survived Pclass Age SibSp Parch Fare Embarked_C Embarked_Q Embarked_S Sex_female Sex_male
0 1 0 3 22.000000 1 0 7.2500 0.0 0.0 1.0 0.0 1.0
1 2 1 1 38.000000 1 0 71.2833 1.0 0.0 0.0 1.0 0.0
2 3 1 3 26.000000 0 0 7.9250 0.0 0.0 1.0 1.0 0.0
3 4 1 1 35.000000 1 0 53.1000 0.0 0.0 1.0 1.0 0.0
4 5 0 3 35.000000 0 0 8.0500 0.0 0.0 1.0 0.0 1.0
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 27.000000 0 0 13.0000 0.0 0.0 1.0 0.0 1.0
887 888 1 1 19.000000 0 0 30.0000 0.0 0.0 1.0 1.0 0.0
888 889 0 3 29.699118 1 2 23.4500 0.0 0.0 1.0 1.0 0.0
889 890 1 1 26.000000 0 0 30.0000 1.0 0.0 0.0 0.0 1.0
890 891 0 3 32.000000 0 0 7.7500 0.0 1.0 0.0 0.0 1.0

889 rows × 12 columns

모델 선택 및 훈련

# Define the input features and the target

# X_titanic keeps every column except Survived, so PassengerId, SibSp, Parch
# and Fare are used as features alongside Pclass, Age and the one-hot columns.
# NOTE(review): PassengerId looks like a row identifier — confirm it is meant to be a feature.
X_titanic = train_set.drop("Survived", axis = 1)
y_titanic = train_set["Survived"]
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X_titanic, y_titanic, test_size = 0.2, random_state = 15)

모델로는 로지스틱 회귀, 랜덤 포레스트, 결정 트리를 사용한다.

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Instantiate the three classifiers with fixed seeds for reproducibility;
# logistic regression gets a raised iteration cap (max_iter = 10000)
LR_clf = LogisticRegression(max_iter = 10000, random_state = 10)
RF_clf = RandomForestClassifier(random_state = 10)
DT_clf = DecisionTreeClassifier(random_state = 10)
# Logistic regression: fit on the training split, then score on the held-out split
# (score() is mean accuracy for scikit-learn classifiers)

LR_clf.fit(X_train, y_train)
LR_clf.score(X_test, y_test)
0.8258426966292135
# Random forest: fit on the training split, then score on the held-out split

RF_clf.fit(X_train, y_train)
RF_clf.score(X_test, y_test)
0.8426966292134831
# Decision tree: fit on the training split, then score on the held-out split

DT_clf.fit(X_train, y_train)
DT_clf.score(X_test, y_test)
0.8033707865168539

교차 검증

# 10-fold cross_val_score for logistic regression
# NOTE: despite the "_rmses" name, no scoring argument is passed, so these are
# the estimator's default scores (accuracy for classifiers), not RMSE values.

from sklearn.model_selection import cross_val_score

LR_rmses = cross_val_score(LR_clf, X_train, y_train, cv=10)
np.mean(LR_rmses)
0.7905712050078246
# 10-fold cross_val_score for the random forest
# NOTE: despite the "_rmses" name, these are default classifier scores (accuracy), not RMSE.

RF_rmses = cross_val_score(RF_clf, X_train, y_train, cv= 10)
np.mean(RF_rmses)
0.8115610328638498
# 10-fold cross_val_score for the decision tree
# NOTE: despite the "_rmses" name, these are default classifier scores (accuracy), not RMSE.

DT_rmses = cross_val_score(DT_clf, X_train, y_train, cv = 10)
np.mean(DT_rmses)
0.7355829420970267

모델 미세 조정

# Hyperparameter tuning for the decision tree

from sklearn.model_selection import GridSearchCV

# 4 x 3 x 3 = 36 candidate combinations, each evaluated with 5-fold CV on accuracy
DT_param_grid = {"max_depth" : [2, 3, 5, 10],
              "min_samples_split" : [2, 3, 5],
              "min_samples_leaf" : [1, 5, 8]}

DT_grid_search = GridSearchCV(DT_clf, DT_param_grid, cv=5,
                          scoring="accuracy")
DT_grid_search.fit(X_train, y_train)
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=10),
             param_grid={'max_depth': [2, 3, 5, 10],
                         'min_samples_leaf': [1, 5, 8],
                         'min_samples_split': [2, 3, 5]},
             scoring='accuracy')
# Best hyperparameter combination found by the decision tree search

DT_grid_search.best_params_
{'max_depth': 10, 'min_samples_leaf': 8, 'min_samples_split': 2}
# Cross-validated accuracy of the best decision tree candidate

DT_grid_search.best_score_
0.7961095242785384
# Hyperparameter tuning for the random forest

from sklearn.model_selection import GridSearchCV

# Same 36-candidate grid as the decision tree search, evaluated with 5-fold CV on accuracy
RF_param_grid = {"max_depth" : [2, 3, 5, 10],
              "min_samples_split" : [2, 3, 5],
              "min_samples_leaf" : [1, 5, 8]}

RF_grid_search = GridSearchCV(RF_clf, RF_param_grid, cv=5,
                          scoring="accuracy")
RF_grid_search.fit(X_train, y_train)
GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=10),
             param_grid={'max_depth': [2, 3, 5, 10],
                         'min_samples_leaf': [1, 5, 8],
                         'min_samples_split': [2, 3, 5]},
             scoring='accuracy')
# Best hyperparameter combination found by the random forest search

RF_grid_search.best_params_
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
# Cross-validated accuracy of the best random forest candidate

RF_grid_search.best_score_
0.8017334777898159
# Hyperparameter tuning for logistic regression

from sklearn.model_selection import GridSearchCV

# l1_ratio is deliberately left out of the grid: it is only used when
# penalty is 'elasticnet', which this grid never selects. Including it made
# every fit emit a UserWarning and tripled the number of candidates (24
# instead of 8) without changing any fitted model.
LR_param_grid = {"penalty" : ['l1', 'l2'],
              "C" : [0.1, 1, 10, 100],
              "solver" : ['liblinear']}

# 2 x 4 x 1 = 8 candidates, each evaluated with 5-fold CV on accuracy
LR_grid_search = GridSearchCV(LR_clf, LR_param_grid, cv=5,
                          scoring="accuracy")
LR_grid_search.fit(X_train, y_train)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
  "(penalty={})".format(self.penalty)
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:1479: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
  "(penalty={})".format(self.penalty)
GridSearchCV(cv=5,
             estimator=LogisticRegression(max_iter=10000, random_state=10),
             param_grid={'C': [0.1, 1, 10, 100], 'l1_ratio': [0.2, 0.5, 0.8],
                         'penalty': ['l1', 'l2'], 'solver': ['liblinear']},
             scoring='accuracy')
# Best hyper-parameter combination found by the logistic-regression grid search.
# NOTE: per the UserWarning output above, l1_ratio is only used when
# penalty='elasticnet'; the grid only contains 'l1' and 'l2', so the l1_ratio
# value reported here did not influence the fitted model.
LR_grid_search.best_params_
{'C': 10, 'l1_ratio': 0.2, 'penalty': 'l1', 'solver': 'liblinear'}
# Cross-validated score (scoring='accuracy', cv=5) of the best parameter combination.
LR_grid_search.best_score_
0.7933221707869595

평가

from sklearn.metrics import accuracy_score

# Evaluate the decision tree: take the best estimator from its grid search,
# predict on X_test and score the predictions against y_test.

best_DT = DT_grid_search.best_estimator_

DT_predict = best_DT.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=DT_predict)
accuracy
0.848314606741573
# Evaluate the random forest: take the best estimator from its grid search,
# predict on X_test and score the predictions against y_test.

best_RF = RF_grid_search.best_estimator_

RF_predict = best_RF.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=RF_predict)
accuracy
0.8539325842696629
# Evaluate logistic regression: take the best estimator from its grid search,
# predict on X_test and score the predictions against y_test.

best_LR = LR_grid_search.best_estimator_

LR_predict = best_LR.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=LR_predict)
accuracy
0.8202247191011236

test.csv 파일 적용

# Count the missing values in each column of the test set
test_set.isna().sum()
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
# Print the test set's column dtypes and non-null counts
test_set.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
# Preprocess test.csv

# Fill missing Age values with the column mean.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated in pandas 2.x and
# leaves test_set unchanged when Copy-on-Write is enabled.
test_set["Age"] = test_set["Age"].fillna(test_set["Age"].mean())

# Fill the missing Fare value with the column mean (same assignment form)
test_set["Fare"] = test_set["Fare"].fillna(test_set["Fare"].mean())

# Drop the features that are not used
test_set = test_set.drop(columns=["Name", "Ticket", "Cabin"])

# NOTE(review): the encoders below are fitted on the test set itself. The
# resulting columns only line up with the training features if both sets
# contain the same categories; reusing the encoders fitted on the training
# data (not visible in this section) would be safer -- TODO confirm.

# One-hot encode Sex
test_set_sex = test_set[["Sex"]]
test_sex_encoder = OneHotEncoder()
test_sex_1hot = test_sex_encoder.fit_transform(test_set_sex)

# One-hot encode Embarked
test_set_embarked = test_set[["Embarked"]]
test_embarked_encoder = OneHotEncoder()
test_embarked_1hot = test_embarked_encoder.fit_transform(test_set_embarked)

# Wrap the encoded (sparse) matrices in DataFrames that share test_set's index
# so they can be concatenated column-wise
test_embarked_output = pd.DataFrame(test_embarked_1hot.toarray(),
                               columns = test_embarked_encoder.get_feature_names_out(),
                               index = test_set_embarked.index)

test_sex_output = pd.DataFrame(test_sex_1hot.toarray(),
                               columns = test_sex_encoder.get_feature_names_out(),
                               index = test_set_sex.index)

# Append the one-hot columns (Embarked first, then Sex) and drop the originals
test_set = pd.concat([test_set, test_embarked_output, test_sex_output], axis=1)
test_set = test_set.drop(columns=["Embarked", "Sex"])

test_set
PassengerId Pclass Age SibSp Parch Fare Embarked_C Embarked_Q Embarked_S Sex_female Sex_male
0 892 3 34.50000 0 0 7.8292 0.0 1.0 0.0 0.0 1.0
1 893 3 47.00000 1 0 7.0000 0.0 0.0 1.0 1.0 0.0
2 894 2 62.00000 0 0 9.6875 0.0 1.0 0.0 0.0 1.0
3 895 3 27.00000 0 0 8.6625 0.0 0.0 1.0 0.0 1.0
4 896 3 22.00000 1 1 12.2875 0.0 0.0 1.0 1.0 0.0
... ... ... ... ... ... ... ... ... ... ... ...
413 1305 3 30.27259 0 0 8.0500 0.0 0.0 1.0 0.0 1.0
414 1306 1 39.00000 0 0 108.9000 1.0 0.0 0.0 1.0 0.0
415 1307 3 38.50000 0 0 7.2500 0.0 0.0 1.0 0.0 1.0
416 1308 3 30.27259 0 0 8.0500 0.0 0.0 1.0 0.0 1.0
417 1309 3 30.27259 1 1 22.3583 1.0 0.0 0.0 0.0 1.0

418 rows × 11 columns

# Verify that no missing values remain after preprocessing

test_set.isna().sum()
PassengerId    0
Pclass         0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked_C     0
Embarked_Q     0
Embarked_S     0
Sex_female     0
Sex_male       0
dtype: int64
# Predict survival for the preprocessed test set with the best random forest
test_RF = best_RF.predict(test_set)

# Passenger ids as an integer array, used as the index of the submission
PassengerId = np.array(test_set["PassengerId"], dtype=int)

# Build the submission table: one Survived column indexed by PassengerId
dt_solution = pd.DataFrame({"PassengerId": PassengerId, "Survived": test_RF})
dt_solution = dt_solution.set_index("PassengerId")

# Write the submission file and display the table
dt_solution.to_csv("my_solution_titanic.csv")
dt_solution
Survived
PassengerId
892 0
893 0
894 0
895 0
896 0
... ...
1305 0
1306 1
1307 0
1308 0
1309 1

418 rows × 1 columns