13. Cross validation & Pipeline
0. 개요
교차 검증과 파이프라인에 대해서 학습
### Building a Pipeline ###
# Libraries used by the PCA walkthrough.
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA

sns.set_context("paper", font_scale=1.5)

# Load iris and pull out the data array (X) and the target array (y).
dataset = load_iris()
X = dataset.data
y = dataset.target

# Fit PCA on X and keep the two-component projection.
pca = PCA(n_components=2)
out_pca = pca.fit_transform(X)

# Store the projection in a dataframe and attach the labels as "species".
df_pca = pd.DataFrame(out_pca, columns=['pca1', 'pca2'])
df_pca['species'] = y

# Scatter of the two components, coloured by species (fit_reg is disabled).
sns.lmplot(data=df_pca, x='pca1', y='pca2', hue='species', fit_reg=False)

# One violin plot per component, grouped by species.
for component in ('pca1', 'pca2'):
    sns.violinplot(data=df_pca, x='species', y=component)
# 로지스틱 회귀 & f1 스코어
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
# Split X / y so that 30% of the samples are held out for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

# Build the classifier and fit it on the training portion.
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Predict the held-out samples and report the weighted F1 against y_test.
y_pred = clf.predict(X_test)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f'f1 score is = {f1!s}')
# Repeat the experiment on the PCA-transformed data: the features are the
# 'pca1' / 'pca2' columns of df_pca and the target is its 'species' column.
X_train, X_test, y_train, y_test = train_test_split(
    df_pca[['pca1', 'pca2']],
    df_pca['species'],
    test_size=0.30,
)

# Fresh classifier, fitted on the PCA training split.
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Predict the held-out samples and report the weighted F1 against y_test.
y_pred = clf.predict(X_test)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f'f1 score is = {f1!s}')
# Redo the projection, this time keeping three components.
pca = PCA(n_components=3)
out_pca = pca.fit_transform(X)

# Rebuild the dataframe with three component columns plus the "species" label.
df_pca = pd.DataFrame(out_pca, columns=['pca1', 'pca2', 'pca3'])
df_pca['species'] = y

# One violin plot per component, grouped by species.
for component in ('pca1', 'pca2', 'pca3'):
    sns.violinplot(data=df_pca, x='species', y=component)

# Hold out 30% of the three-component data for testing.
X_train, X_test, y_train, y_test = train_test_split(
    df_pca[['pca1', 'pca2', 'pca3']],
    df_pca['species'],
    test_size=0.30,
)

# Fresh classifier, fitted on the training split.
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Predict the held-out samples and report the weighted F1 against y_test.
y_pred = clf.predict(X_test)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f'f1 score is = {f1!s}')
## Pipeline ##
# import modules
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# Hold out 30% of the raw iris data; the grid search below is fitted on the
# remaining training portion only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

# Chain PCA and logistic regression into a single estimator.
# pipe.steps holds the (name, estimator) pairs given here.
pca = PCA()
logistic = LogisticRegression()
pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

# Parameter grid for the search. Keys follow '<step name>__<parameter>', so
# this varies n_components of the 'pca' step.
param_grid = {
    'pca__n_components': [2, 3, 4],
}

# Grid search over the pipeline with 5-fold cross validation.
# NOTE: `iid` is intentionally not passed. It was removed from GridSearchCV in
# scikit-learn 0.24 and passing it raises TypeError on current releases.
model = GridSearchCV(pipe, param_grid, cv=5, return_train_score=False)

# Fit the entire pipeline for every grid point.
model.fit(X_train, y_train)

# Report the best mean CV score together with the parameters that achieved it.
print("Best parameter (CV score = %0.3f):" % model.best_score_)
print(model.best_params_)

# Predict the held-out test set with the fitted search object.
y_pred = model.predict(X_test)
파이프라인 응용 예제
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# Load the breast-cancer dataset; the split below reads cancer.data and
# cancer.target.
cancer = load_breast_cancer()

# Data split: 10% held out for testing, stratified on the target, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    test_size=0.1,
    stratify=cancer.target,
    random_state=42,
)

# Create the model object. max_depth=1 is only the starting value; the grid
# search below overrides it.
tree = DecisionTreeClassifier(criterion='gini', max_depth=1, random_state=0)

# Set up the pipeline (a single 'tree' step).
pipe = Pipeline(steps=[('tree', tree)])

# Values to iterate over. A tree's depth is controlled by max_depth, so the
# key 'tree__max_depth' lists the candidate depths for the 'tree' step.
# This is the most important part of the setup.
param_grid = {
    'tree__max_depth': [2, 4, 5, 6, 7],
}

# Grid search over the pipeline with 5-fold cross validation.
# NOTE: `iid` is intentionally not passed. It was removed from GridSearchCV in
# scikit-learn 0.24 and passing it raises TypeError on current releases.
model = GridSearchCV(pipe, param_grid, cv=5, return_train_score=False)

# Fit the pipeline for every candidate depth.
model.fit(X_train, y_train)

# Print the result: best mean CV score and the parameters that achieved it.
print("Best parameter (CV score = %0.3f):" % model.best_score_)
print(model.best_params_)

# Predict the held-out test set with the fitted search object.
y_pred = model.predict(X_test)
