머신러닝 개념과 교차검증 수행하기 K 폴드, Stratified K 폴드, cross_val_score
- Download
- ml01_ko.html (600.0 KB) 2022-01-13
- ml01_ko.py (9.7 KB) 2022-01-13
In [ ]:
# Report the versions of the core data libraries used in this notebook.
import numpy as np
import pandas as pd

for lib_name, lib_module in (('numpy', np), ('pandas', pd)):
    print(lib_name + ' version - ', lib_module.__version__)
In [ ]:
import sklearn
from sklearn.datasets import load_iris  # built-in iris sample dataset loader
# Show the installed scikit-learn version for reproducibility.
print('sklearn version - ',sklearn.__version__)
In [ ]:
# load_iris() returns a Bunch: a dict-like container exposing data, target, etc.
iris = load_iris()
bunch_type = type(iris)
print('type - ', bunch_type)  # sklearn.utils.Bunch - key/value access
print('type - ', iris.keys())
In [ ]:
# Feature matrix of the dataset — a 2-D numpy ndarray (one row per sample).
print('data - ', iris.data)
print('data type - ', type(iris.data))
In [ ]:
# Print each metadata attribute of the Bunch along with its Python type,
# separated by a blank line (same output as printing them one by one).
for pos, attr in enumerate(('target', 'target_names', 'feature_names')):
    if pos:
        print()  # blank line between attribute groups
    attr_value = getattr(iris, attr)
    print(attr + ' - ', attr_value)
    print(attr + ' type - ', type(attr_value))
In [ ]:
print('feature, target을 이용해서 데이터 프레임을만들어보자 --')
# One DataFrame: the four feature columns plus a trailing 'target' label column.
iris_frm = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)
iris_frm
지도학습 - 분류(classification)¶
- step 01. 데이터 분리(training data, test data)
- step 02. 학습데이터를 기반으로 ML 알고리즘을 적용해 학습 모델을 생성
- step 03. 테스트데이터를 기반으로 분류예측 수행
- step 04. 모델에 대한 성능평가
In [ ]:
from sklearn.model_selection import train_test_split # splits arrays into train/test subsets
from sklearn.tree import DecisionTreeClassifier # classification model
from sklearn.metrics import accuracy_score # accuracy metric for evaluation
In [ ]:
print('step 01.')
print()
# Hold out 20% of the samples as a test set; the fixed seed makes the split
# reproducible across runs.
split_result = train_test_split(
    iris.data, iris.target,
    test_size=0.2, shuffle=True, random_state=100)
X_train, x_test, Y_train, y_test = split_result
In [ ]:
X_train.shape, x_test.shape, Y_train.shape, y_test.shape
In [ ]:
# Fix: the original output labels were misspelled 'tranin' — corrected to 'train'.
print('train data - ', X_train)
print()
print('train target - ', Y_train)
In [ ]:
# Fix: the original printed the TEST split but labeled it 'tranin' (train),
# which was both a typo and the wrong split name.
print('test data - ', x_test)
print()
print('test target - ', y_test)
In [ ]:
print('step 02. fit() - ')
print()
# Instantiate a decision-tree classifier, then learn from features + labels.
iris_dtc_model = DecisionTreeClassifier()
iris_dtc_model.fit(X_train, Y_train)
In [ ]:
print('step 03. predict() - ')
print()
y_pred = iris_dtc_model.predict(x_test)  # predictions on the held-out test set
print('y_test - ', y_test)  # ground-truth labels
print('y_pred - ', y_pred)  # predicted labels
In [ ]:
print('step 04. 예측정확도 - accuracy_score()')
print()
# Fraction of test samples whose predicted class matches the true class.
test_accuracy = accuracy_score(y_test, y_pred)
print('acc - ', test_accuracy)
In [ ]:
display(iris_frm)
In [ ]:
print('데이터 프레임 형식에서 학습데이터와 테스트데이터를 분리한다면? - ')
print()
print('target - ')
print(iris_frm['target'], type(iris_frm['target']))
print('iloc() 이용해서 피처와 타켓을 추출한다면 - ')
print()
# Features are every column except the last; the target is the last column
# (the 'target' column was appended last when the frame was built).
iris_feature_frm = iris_frm.iloc[:, 0:-1]
iris_target_frm = iris_frm.iloc[:, -1]
print('iris_feature_frm -', type(iris_feature_frm))  # -> DataFrame
print('iris_target_frm -', type(iris_target_frm))    # -> Series
# reference: https://separang.tistory.com/91
In [ ]:
# Same 80/20 reproducible split as before, now from DataFrame/Series inputs —
# train_test_split preserves the input types (DataFrame in, DataFrame out).
frm_split = train_test_split(
    iris_feature_frm, iris_target_frm,
    test_size=0.2, shuffle=True, random_state=100)
X_train, x_test, Y_train, y_test = frm_split
In [ ]:
X_train.shape, x_test.shape, Y_train.shape, y_test.shape
In [ ]:
# Train a fresh decision tree on the DataFrame-based training split.
iris_dtc_model = DecisionTreeClassifier()
iris_dtc_model.fit(X_train, Y_train)
In [ ]:
# NOTE: this cell intentionally predicts on the TRAINING data (not x_test) —
# the next cell measures accuracy on data the model has already seen, to motivate
# cross-validation as an overfitting check.
# Fix: the original print label said 'y_test' while actually printing Y_train,
# which was misleading; the label now names what is printed.
y_pred = iris_dtc_model.predict(X_train)
print('Y_train - ', Y_train)  # ground-truth labels of the training set
print('y_pred - ', y_pred, type(y_pred))  # predictions as an ndarray
In [ ]:
# Accuracy measured on the training set itself — typically near 1.0 for a
# decision tree, illustrating why validation / cross-validation is needed
# before the test stage to guard against overfitting.
acc = accuracy_score(Y_train,y_pred)
print('acc - ', acc) # cross-validation (next section) guards against this overfit estimate
교차검증(cross validation)¶
- 분류와 회귀 모두에 적용 가능 (단, StratifiedKFold는 분류에서만 사용)
- 과적합(overfitting)을 방지하기 위한 방법
- 데이터의 편중을 막기위해서
- KFold 방식
In [ ]:
# import sklearn
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, cross_validate
# NOTE(review): the original comment claimed shuffling requires StratifiedKFold;
# in fact KFold(shuffle=True) also shuffles. StratifiedKFold's actual benefit is
# preserving the class ratio of the labels in every fold.
In [ ]:
# Reload iris and pull out the raw feature matrix / label vector for the CV demos.
fold_iris = load_iris()
features, label = fold_iris.data, fold_iris.target
In [ ]:
# Inspect the feature matrix (ndarray).
features
In [ ]:
# Inspect the label vector (ndarray of class ids).
label
In [ ]:
print('5개의 폴더 셋트를 분리하여 각 폴더 세트별 정확도를 확인해보자 -')
cv_acc = []
# Plain K-fold: 5 contiguous splits, no shuffling.
kfold = KFold(n_splits=5, shuffle=False)
# A concrete model instance is needed to fit/predict on every fold.
fold_dct_model = DecisionTreeClassifier()
# split() yields (train indices, validation indices) per fold; slice the arrays,
# refit, predict on the held-out fold, and collect each fold's accuracy.
for train_idx, test_idx in kfold.split(features):
    X_train, X_val = features[train_idx], features[test_idx]
    Y_train, Y_val = label[train_idx], label[test_idx]
    fold_dct_model.fit(X_train, Y_train)
    fold_pred = fold_dct_model.predict(X_val)
    acc = accuracy_score(Y_val, fold_pred)
    print('acc - ', acc)
    cv_acc.append(acc)
print('교차검증 평균 정확도 - ',np.mean(cv_acc))
In [ ]:
print('기존 KFold방식의 문제점 확인 -')
print()
# Rebuild the labeled DataFrame to inspect how plain KFold distributes classes.
fold_iris_frm = pd.DataFrame(fold_iris.data, columns=fold_iris.feature_names)
fold_iris_frm['target'] = fold_iris.target
fold_iris_frm
In [ ]:
# Per-class sample counts — shows whether the label distribution is balanced.
fold_iris_frm['target'].value_counts()
# the class distribution should be even across the dataset
In [ ]:
In [ ]:
# Show plain KFold's weakness: with 3 unshuffled folds, each validation fold can
# contain classes the corresponding training fold never saw (the iris targets
# appear grouped by class — verify against the value_counts output above).
bad_fold_iris = KFold(n_splits=3)
n_iter = 0
fold_dct_model = DecisionTreeClassifier()
for train_idx, test_idx in bad_fold_iris.split(fold_iris_frm):
    n_iter += 1
    label_train = fold_iris_frm['target'].iloc[train_idx]
    label_val = fold_iris_frm['target'].iloc[test_idx]
    print('교차검증 횟수 - ',n_iter)
    print()
    print('학습 레이블 데이터 분포 - \n', label_train)
    print('검증 레이블 데이터 분포 - \n', label_val)
실습¶
- 아이리스 데이터를 이용하여 StratifiedKFold 교차검증을 진행해 보자
- random_state = 200
- StratifiedKFold(3,5) 평균 정확도 확인
In [ ]:
# reference: https://ek-koh.github.io/data%20analysis/cv/
# StratifiedKFold cross-validation: each of the 5 folds keeps the class ratio
# of `label`, so every training/validation pair sees all three iris classes.
fold_iris = load_iris()
features = fold_iris.data
label = fold_iris.target
cv_acc = []
skf = StratifiedKFold(n_splits=5)
fold_dct_model = DecisionTreeClassifier(random_state=200)
n_iter = 0  # fold counter (original misspelled it 'n_tier')
# Fixes vs. original: removed the unused rounded `accuracy` and the unused
# `train_size`/`test_size` locals; the printed values come straight from `acc`
# and the split shapes.
for train_idx, test_idx in skf.split(features, label):
    n_iter += 1
    X_train, X_val = features[train_idx], features[test_idx]
    Y_train, Y_val = label[train_idx], label[test_idx]
    fold_dct_model.fit(X_train, Y_train)
    fold_pred = fold_dct_model.predict(X_val)
    acc = accuracy_score(Y_val, fold_pred)
    print("검증횟수 {}, 교차검증 정확도 {}, 학습데이터 크기 {}, 검증데이터 크기 {}".format(n_iter, acc, X_train.shape[0], X_val.shape[0]))
    cv_acc.append(acc)
print()
print("*" * 50)
print('교차검증 평균 정확도 - ',np.mean(cv_acc))
cross_val_score, cross_validate¶
- 위 과정을 한번에 수행하는 함수 : cross_val_score()
- 인자로 예측모델, 피처세트, 레이블, 성능평가 지표, 폴더 수
In [ ]:
# Fresh data and a seeded model for the cross_val_score / cross_validate demos.
fold_iris = load_iris()
features, label = fold_iris.data, fold_iris.target
dt_model = DecisionTreeClassifier(random_state=100)
In [ ]:
print('성능평가 acc, 교차검증 5회 수행 - ')
print('cross_val_score() - ')
print()
# cross_val_score(estimator, X, y, *, scoring=None, cv=None, ...) collapses the
# manual fold loop above into one call: it fits the estimator on each of the cv
# folds and returns one score per fold.
scores = cross_val_score(dt_model, features, label,
                         scoring='accuracy', cv=5)
In [ ]:
# cross_val_score returns an ndarray holding one accuracy per fold.
mean_score = np.round(np.mean(scores), 2)
print('type - ', type(scores))
print('data - ', scores)
print('mean - ', mean_score)
cross_validate¶
In [ ]:
# cross_validate does what cross_val_score does but additionally records the
# fit/score timings, returning a dict of arrays instead of a bare ndarray.
# (Swap in different feature sets to compare runs with and without CV.)
scores = cross_validate(dt_model, features, label,
                        scoring='accuracy', cv=5)
In [ ]:
print('type - ', type(scores))
# cross_validate returns a dict keyed by fit_time / score_time / test_score.
for result_key in scores:
    print(result_key)
for metric in ('fit_time', 'score_time', 'test_score'):
    print(metric + ' - ', scores[metric])
print('mean - ', np.round(np.mean(scores['test_score']), 2))
In [ ]:
In [ ]:
In [ ]:
In [ ]: