붓꽃 분류

데이터 임포트

# 붓꽃 분류
from sklearn import datasets
import numpy as np

# Load scikit-learn's bundled iris dataset.
iris = datasets.load_iris()

# Keep only feature columns 2 and 3 as the inputs
# (presumably petal length / petal width -- confirm via iris.feature_names).
x = iris["data"][:, [2, 3]]
# Integer class label of every sample.
y = iris["target"]

# Notebook-style display of the array shapes (has no effect when run as a script).
x.shape, y.shape

# Label encoding: 0 = setosa, 1 = versicolor, 2 = virginica.
# More than two classes, so this is a multiclass classification problem.
class_labels = np.unique(y)
print(class_labels)

변수 설정

from sklearn.model_selection import train_test_split

# x: feature data
# y: label (target) data
#
# Hold out 30% of the samples as a test set.
# - random_state: seed for the shuffle; the particular number carries no
#   meaning, any integer works.
# - stratify=y: stratified sampling on the class labels.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

# Notebook-style display of the shapes of the four resulting arrays.
x_train.shape, x_test.shape, y_train.shape, y_test.shape

변수의 차이 (x_val vs x_test)

26일 코드 (x_val)
- 모델을 한 번에 결정하지 않음 (검증이 필요하므로 실험을 하겠다.)
- 테스트 데이터가 이미 확보된 상태
27일 (위) 코드 (x_test)
- 테스트 데이터가 없어서 임의로 만듦
- 모델 검증을 하지 않음 (바로 사용하겠다.)

데이터 학습

# Standardize every numeric feature to mean 0 and standard deviation 1.
# This is done when the features are measured in different units.
# It is essential for linear models (models built on an explicit formula).
# Decision trees do not need feature standardization.
# NOTE(review): the original note also exempted "non-linear models" in general;
# that is too broad for scale-sensitive ones (e.g. kNN, kernel SVMs) -- confirm intent.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()  # create the scaler instance (object)
# Fit on the training data only: the test data stands in for unseen data whose
# population parameters are unknown, so the training set is all we may look at.
sc.fit(x_train)

# Standardize the training and the test data with that same fitted scaler.
x_train_std, x_test_std = map(sc.transform, (x_train, x_test))

# Build the model

from sklearn.linear_model import Perceptron

# Perceptron: one of the earliest models (1950s) in the lineage that led to deep learning.
# eta0=0.1 sets the learning rate; it is a hyperparameter that can be tuned.
ppn = Perceptron(random_state=1, eta0=0.1)

# Train on the standardized training features and their labels.
ppn.fit(X=x_train_std, y=y_train)

fit()을 한 번만 하는 이유