파이썬 머신러닝

사이킷런 타이타닉생존자 예측

꼬마곰 2021. 5. 20. 14:23

데이터 전처리

Null처리
불필요한 속성 제거
인코딩 수행

모델 학습 및 검증/예측/평가

결정트리, 랜덤포레스트, 로지스틱 회귀 학습비교
k폴드 교차 검증
cross_val_score()와 GridSearchCV()수행

머신러닝 지도 학습 프로세스

데이터 전처리 -> 데이터 세트 분리 -> 모델 학습 및 검증 평가 -> 예측 수행 -> 평가

데이터 전처리: 데이터 클린징, 결손값 처리(Null/NaN 처리), 데이터 인코딩(레이블, 원-핫 인코딩), 데이터 스케일링, 이상치 제거, Feature 선택, 추출 및 가공
데이터 세트 분리: 학습 데이터/테스트 데이터 분리
모델 학습 및 검증 평가: 알고리즘 학습
예측 수행: 테스트 데이터로 예측 수행
평가: 예측 평가

 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt # 시각화
 import seaborn as sns           # 시각화
 %matplotlib inline

 # 경고 메시지 무시
 import warnings
 warnings.filterwarnings(action='ignore')

1. 데이터 로드 및 확인 (Null 값 먼저 채우기)

 # 데이터 로드
 titanic_df = pd.read_csv('./titanic_train.csv') # ./ 동일폴더 ../ 상위폴더

 # 데이터 확인
 print(titanic_df.shape) # 데이터 구조확인(891, 12)
 titanic_df.head()

(891, 12)

Passengerid : 탑승자 데이터 일련번호
survived : 생존 여부, 0=사망, 1=생존
Pclass : 티켓의 선실 등급, 1=일등석, 2=이등석, 3=삼등석
sex : 탑승자 성별
name : 탑승자 이름
Age : 탑승자 나이
sibsp : 같이 탑승한 형제자매 또는 배우자 인원수
parch : 같이 탑승한 부모님 또는 어린이 인원수
ticket : 티켓 번호
fare : 요금
cabin : 선실 번호
embarked : 탑승(승선) 항구 C = Cherbourg, Q = Queenstown, S = Southampton

 # non-null 값 확인
 print(titanic_df.info())

RangeIndex: 891 entries, 0 to 890

Data columns (total 12 columns):

# Column Non-Null Count Dtype

--- ------ -------------- -----

0 PassengerId 891 non-null int64

1 Survived 891 non-null int64

2 Pclass 891 non-null int64

3 Name 891 non-null object

4 Sex 891 non-null object

5 Age 714 non-null float64

6 SibSp 891 non-null int64

7 Parch 891 non-null int64

8 Ticket 891 non-null object

9 Fare 891 non-null float64

10 Cabin 204 non-null object

11 Embarked 889 non-null object

dtypes: float64(2), int64(5), object(5)

memory usage: 83.7+ KB

None

 # 컬럼별 null값 확인.titanic_df.isnull()
 titanic_df.isnull().sum()

PassengerId 0

Survived 0

Pclass 0

Name 0

Sex 0

Age 177

SibSp 0

Parch 0

Ticket 0

Fare 0

Cabin 687

Embarked 2

dtype: int64

NULL 컬럼들에 대한 처리

 # Fill missing ages with the mean age of all passengers.
 # The result is assigned back to the column instead of calling
 # fillna(inplace=True) on the selected column: that chained in-place form
 # acts on an intermediate object and does not update the DataFrame when
 # pandas Copy-on-Write is active.
 titanic_df['Age'] = titanic_df['Age'].fillna(titanic_df['Age'].mean())

 # Missing cabin values get the placeholder 'N'.
 titanic_df['Cabin'] = titanic_df['Cabin'].fillna('N')

 # Missing embarkation ports get the placeholder 'N'.
 titanic_df['Embarked'] = titanic_df['Embarked'].fillna('N')
 
 # Check the per-column null counts again.
 print('전체 데이터 세트 Null 값 갯수 ','\n', titanic_df.isnull().sum())

전체 데이터 세트 Null 값 갯수

PassengerId 0

Survived 0

Pclass 0

Name 0

Sex 0

Age 0

SibSp 0

Parch 0

Ticket 0

Fare 0

Cabin 0

Embarked 0

dtype: int64

 print('데이터 세트 Null 값 갯수 ', titanic_df.isnull().sum().sum())

데이터 세트 Null 값 갯수 0

컬럼별 데이터 분포 확인

 # 성별 승객 수
 titanic_df['Sex'].value_counts()

male 577

female 314

Name: Sex, dtype: int64

 # 최대 줄 수 설정, 중간생략 없음
 pd.set_option('display.max_rows', 200)
 
 # 선실 별 승객 수

titanic_df['Cabin'].value_counts()

 # 성별 생존자 수(1:생존, 0:사망)
 titanic_df.groupby(['Sex','Survived'])['Survived'].count()

Sex Survived

female 0 81

1 233

male 0 468

1 109

Name: Survived, dtype: int64

2. 데이터 시각화

 # 씨본은 데이터프레임과 x,y를 넣어주면 알아서 컬럼을 찾아서 그래프를 그려준다.
 sns.barplot(x='Sex', y = 'Survived', data=titanic_df)

<AxesSubplot:xlabel='Sex', ylabel='Survived'>

 # hue 값 기준으로 구분해서 보여준다.
 sns.barplot(x='Pclass', y='Survived', hue='Sex', data=titanic_df)

<AxesSubplot:xlabel='Pclass', ylabel='Survived'>

3. 데이터 전처리(feature engineering)

 # age 범위에 따라 카테고리화하는 함수. 
 # 입력값으로 'Age' 컬럼값을 받아서 해당하는 카테고리 반환
 def get_category(age):
    """Map an age to its age-group label.

    Ages of -1 or below, and missing ages (None or NaN), map to 'Unknown'.
    Otherwise the label is the first group whose inclusive upper bound is
    greater than or equal to ``age``; anything above 60 is 'Elderly'.
    """
    # NaN is the only value that is not equal to itself. Without this guard a
    # NaN age fails every `<=` comparison below and is reported as 'Elderly',
    # and None raises TypeError.
    if age is None or age != age:
        return 'Unknown'

    # (inclusive upper bound, label) pairs in ascending order.
    upper_bounds = (
        (-1, 'Unknown'),
        (5, 'Baby'),
        (12, 'Child'),
        (18, 'Teenager'),
        (25, 'Student'),
        (35, 'Young Adult'),
        (60, 'Adult'),
    )
    for limit, label in upper_bounds:
        if age <= limit:
            return label

    return 'Elderly'

 # Label every passenger with an age group derived from the 'Age' column.
 titanic_df['Age_cat'] = titanic_df['Age'].apply(get_category)

 # Fixed left-to-right order of the groups on the x-axis.
 group_names = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Elderly']

 # Create a larger figure for the bar chart and give it a title.
 plt.figure(figsize=(10, 6))
 plt.title('Survivors by age group')

 # Bar chart of 'Survived' per age group, with separate bars per sex.
 ax = sns.barplot(x='Age_cat', y='Survived', hue='Sex', data=titanic_df, order=group_names)

 # Place the legend in the upper-left corner.
 ax.legend(loc='upper left')

<matplotlib.legend.Legend at 0x2eb5dfd2910>

 # titanic_df.drop('Age_cat', axis=1, inplace=True)
 titanic_df

 ['컴퓨터', '책상', '의자']
    1        2       3

 from sklearn import preprocessing

 # 데이터프레임이 인자로 들어가면, 특정 컬럼들을 레이블 인코딩 해주는 함수
 def encode_features(dataDF):
    """Label-encode the 'Cabin', 'Sex' and 'Embarked' columns.

    Each column gets its own LabelEncoder fitted on that column's values.
    The passed DataFrame is modified and also returned.
    """
    for column in ('Cabin', 'Sex', 'Embarked'):
        encoder = preprocessing.LabelEncoder()
        # fit_transform learns the classes and encodes them in one call.
        dataDF[column] = encoder.fit_transform(dataDF[column])

    return dataDF

 titanic_df = encode_features(titanic_df)
 titanic_df.head()

전처리 함수로 데이터 전처리

 from sklearn.preprocessing import LabelEncoder

 # Null 처리 함수
 def fillna(df):
    """Fill missing values in the Titanic feature columns.

    'Age' is filled with the column mean, 'Cabin' and 'Embarked' with the
    placeholder 'N', and 'Fare' with 0. The columns are replaced on the
    passed DataFrame, which is also returned.
    """
    # Assign each filled column back rather than calling
    # df[col].fillna(..., inplace=True): the chained in-place form acts on an
    # intermediate object and does not update df under pandas Copy-on-Write.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna('N')
    df['Embarked'] = df['Embarked'].fillna('N')
    df['Fare'] = df['Fare'].fillna(0)
    return df

 # 머신러닝 알고리즘에 불필요한 속성 제거
 def drop_features(df):
    """Remove the 'PassengerId', 'Name' and 'Ticket' columns from ``df``.

    The columns are dropped in place; the same DataFrame is returned.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket']
    df.drop(columns=unused_columns, inplace=True)
    return df

 # 레이블 인코딩 수행
 def format_features(df):
    """Shorten 'Cabin' to its first character and label-encode the categoricals.

    'Cabin', 'Sex' and 'Embarked' are each encoded with their own
    LabelEncoder. The passed DataFrame is modified and also returned.
    """
    # Keep only the first character of every cabin value.
    df['Cabin'] = df['Cabin'].str.slice(stop=1)

    for column in ('Cabin', 'Sex', 'Embarked'):
        # fit_transform learns the classes and encodes them in one call.
        df[column] = LabelEncoder().fit_transform(df[column])
    return df

 # 앞에서 설정한 Data Preprocessing 함수 호출
 def transform_features(df):
    """Apply the preprocessing steps defined above, in order.

    Runs fillna, then drop_features, then format_features, and returns the
    resulting DataFrame.
    """
    return format_features(drop_features(fillna(df)))

 # Reload the original data, then extract the feature set and the label set.
 titanic_df = pd.read_csv('./titanic_train.csv')

 # 'Survived' is the label (target); every other column is a feature.
 y_titanic_df = titanic_df['Survived']
 X_titanic_df = titanic_df.drop(columns='Survived')
 
 # Run the preprocessing functions on the feature DataFrame.
 X_titanic_df = transform_features(X_titanic_df)
 
 # Split into train and test sets, with 20% of the rows held out for testing.
 from sklearn.model_selection import train_test_split
 X_train, X_test, y_train, y_test = train_test_split(
     X_titanic_df, y_titanic_df, test_size=0.2, random_state=11)

4. 모델 학습 및 비교

 # 머신러닝 지도학습 분류기 모델들
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.linear_model import LogisticRegression # 분류알고리즘
 from sklearn.metrics import accuracy_score

DecisionTreeClassifier 학습/예측/평가

 # DecisionTreeClassifier dt_clf 객체 생성 -> 객체로 학습
 dt_clf = DecisionTreeClassifier(random_state=11)
 dt_clf

DecisionTreeClassifier(random_state=11)

 # fit 학습 메소드
 dt_clf.fit(X_train , y_train)

 # 예측
 dt_pred = dt_clf.predict(X_test)
 print('DecisionTreeClassifier 정확도: {0:.4f}'.format(accuracy_score(y_test, dt_pred)))

DecisionTreeClassifier 정확도: 0.7877

RandomForestClassifier 학습/예측/평가

 rf_clf = RandomForestClassifier(random_state=11)
 rf_clf

RandomForestClassifier(random_state=11)

 rf_clf.fit(X_train , y_train)
 rf_pred = rf_clf.predict(X_test)
 print('RandomForestClassifier 정확도:{0:.4f}'.format(accuracy_score(y_test, rf_pred)))

RandomForestClassifier 정확도:0.8547

LogisticRegression 학습/예측/평가

 lr_clf = LogisticRegression(random_state=11)
 lr_clf

LogisticRegression(random_state=11)

 lr_clf.fit(X_train , y_train)
 lr_pred = lr_clf.predict(X_test)
 print('LogisticRegression 정확도: {0:.4f}'.format(accuracy_score(y_test, lr_pred)))

LogisticRegression 정확도: 0.8492

5. 교차 검증 - KFold(k=5)

(1) 일반 KFold로 교차 검증 - DecisionTreeClassifier

 from sklearn.model_selection import KFold

 def exec_kfold(clf, folds=5):
    """Cross-validate ``clf`` with KFold and print the accuracies.

    Reads the module-level X_titanic_df and y_titanic_df. For each of the
    ``folds`` splits the classifier is fitted on the training rows and scored
    on the held-out rows; the per-fold accuracy and finally the mean accuracy
    are printed. Nothing is returned.
    """
    splitter = KFold(n_splits=folds)

    # Pull the underlying arrays out once; split() yields row-index arrays
    # that are used to select the rows of each fold.
    features = X_titanic_df.values
    labels = y_titanic_df.values

    fold_accuracies = []
    for fold_no, (fit_idx, val_idx) in enumerate(splitter.split(X_titanic_df)):
        # Fit on this fold's training rows, then score on its held-out rows.
        clf.fit(features[fit_idx], labels[fit_idx])
        fold_accuracy = accuracy_score(labels[val_idx], clf.predict(features[val_idx]))
        fold_accuracies.append(fold_accuracy)
        print("교차 검증 {0} 정확도: {1:.4f}".format(fold_no, fold_accuracy))

    # Mean accuracy over all folds.
    print("평균 정확도: {0:.4f}".format(np.mean(fold_accuracies)))

 # dt_clf (DecisionTreeClassifier) 교차 검증 수행
 exec_kfold(dt_clf , folds=5)

교차 검증 0 정확도: 0.7542

교차 검증 1 정확도: 0.7809

교차 검증 2 정확도: 0.7865

교차 검증 3 정확도: 0.7697

교차 검증 4 정확도: 0.8202

평균 정확도: 0.7823

(2) cross_val_score로 교차 검증 - DecisionTreeClassifier

 from sklearn.model_selection import cross_val_score

 scores = cross_val_score(dt_clf, X_titanic_df , y_titanic_df , cv=5)
 scores

array([0.74301676, 0.7752809 , 0.79213483, 0.78651685, 0.84269663])

 # 교차 검증 평균 정확도 확인
 for iter_count,accuracy in enumerate(scores):
    print("교차 검증 {0} 정확도: {1:.4f}".format(iter_count, accuracy))

 print("평균 정확도: {0:.4f}".format(np.mean(scores)))

교차 검증 0 정확도: 0.7430

교차 검증 1 정확도: 0.7753

교차 검증 2 정확도: 0.7921

교차 검증 3 정확도: 0.7865

교차 검증 4 정확도: 0.8427

평균 정확도: 0.7879

(3) GridSearchCV로 교차 검증 + 하이퍼 파라미터 튜닝 - DecisionTreeClassifier

 from sklearn.model_selection import GridSearchCV

 parameters = {'max_depth':[2, 3, 5, 10],
              'min_samples_split':[2, 3, 5], 
              'min_samples_leaf':[1, 5, 8]}
 parameters

{'max_depth': [2, 3, 5, 10],

'min_samples_split': [2, 3, 5],

'min_samples_leaf': [1, 5, 8]}

 # GridSearchCV 객체 생성
 grid_dclf = GridSearchCV(dt_clf , param_grid=parameters , scoring='accuracy' , cv=5)
 grid_dclf

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=11),

param_grid={'max_depth': [2, 3, 5, 10],

'min_samples_leaf': [1, 5, 8],
'min_samples_split': [2, 3, 5]},

scoring='accuracy')

 # GridSearchCV 수행
 grid_dclf.fit(X_train , y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=11),

param_grid={'max_depth': [2, 3, 5, 10],

'min_samples_leaf': [1, 5, 8],

'min_samples_split': [2, 3, 5]},

scoring='accuracy')

 # GridSearchCV 결과를 데이터프레임 형태로 확인
 grid_df = pd.DataFrame(grid_dclf.cv_results_)
 grid_df

 # 필요한 컬럼만 확인
 grid_df = grid_df[['params', 'mean_test_score', 'rank_test_score', 
                   'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score']]

 grid_df = grid_df.sort_values(by='rank_test_score')
 grid_df[:10]

 # GridSearchCV 수행 결과 확인
 print('GridSearchCV 최적 하이퍼 파라미터 :',grid_dclf.best_params_)
 print('GridSearchCV 최고 정확도: {0:.4f}'.format(grid_dclf.best_score_))

GridSearchCV 최적 하이퍼 파라미터 : {'max_depth': 3, 'min_samples_leaf': 5, 'min_samples_split': 2}

GridSearchCV 최고 정확도: 0.7992

 best_dclf = grid_dclf.best_estimator_
 best_dclf

DecisionTreeClassifier(max_depth=3, min_samples_leaf=5, random_state=11)

 # 테스트 데이터 예측 및 평가 수행. predict
 dpredictions = best_dclf.predict(X_test)
 accuracy = accuracy_score(y_test , dpredictions)

 accuracy

0.8715083798882681