Python_ML_pipeline & streamlit

model file (ml-ensemble2.py)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_curve, roc_auc_score
import xgboost as xgb
import lightgbm as lgb
import joblib
import os
from datetime import datetime

# 1. Load the Titanic training and test sets from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# 2. Columns used as model inputs (an example selection).
main_features = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
]

# 3. Separate the target from the features; the same feature columns are
#    taken from the test frame.
y = train['Survived']
X = train.loc[:, main_features]
X_test = test.loc[:, main_features]

# 4. Hold out 20% of the labelled rows for validation, stratified on the
#    target, with a fixed seed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# 5. Preprocessing pipeline (includes missing-value handling).
# Column groups are derived from the dtypes of the training split.
numeric_feats = list(X_train.select_dtypes(include=['int64', 'float64']).columns)
categorical_feats = list(X_train.select_dtypes(include=['object']).columns)

# Numeric columns: median imputation followed by standardisation.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: most-frequent imputation, then a dense one-hot
# encoding that ignores categories not seen during fit.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_feats),
    ('cat', categorical_transformer, categorical_feats),
])

# 6. Base learners for the stacking ensemble.
# NOTE: `use_label_encoder=False` is intentionally no longer passed to
# XGBClassifier. The argument is deprecated in XGBoost and newer releases
# ignore it with a "parameters are not used" warning.
base_estimators = [
    ('rf', RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)),
    ('xgb', xgb.XGBClassifier(
        n_estimators=100,
        max_depth=3,
        tree_method='hist',
        random_state=42,
        eval_metric='logloss',
    )),
    ('lgb', lgb.LGBMClassifier(n_estimators=100, max_depth=3, random_state=42)),
]

# 7. Stacking ensemble (preprocessing included).
# Each base learner is wrapped in its own Pipeline that starts with the shared
# preprocessor, so the ensemble can be fitted on (and later fed) the raw
# feature DataFrame. The final estimator is a gradient-boosting classifier
# trained on the base learners' cross-validated (cv=5) predictions only
# (passthrough=False).
stacking = StackingClassifier(
    estimators=[
        (name, Pipeline([('pre', preprocessor), (name, model)]))
        for name, model in base_estimators
    ],
    final_estimator=GradientBoostingClassifier(n_estimators=100, random_state=42),
    cv=5,
    n_jobs=-1,
    passthrough=False,
)

# 8. Fit the stacking ensemble on the training split.
stacking.fit(X_train, y_train)

# 9. Predict on the validation split and report metrics.
# Column 1 of predict_proba is kept as the score used for the ROC curve.
val_probs = stacking.predict_proba(X_val)[:, 1]
val_preds = stacking.predict(X_val)

print("검증 정확도:", accuracy_score(y_true=y_val, y_pred=val_preds))
print(classification_report(y_true=y_val, y_pred=val_preds))

# 10. ROC curve on the validation split (fig, ax).
auc = roc_auc_score(y_val, val_probs)
fpr, tpr, _ = roc_curve(y_val, val_probs)

fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(fpr, tpr, label=f'Stacking (AUC={auc:.3f})')
ax.plot([0, 1], [0, 1], 'k--', label='Random')  # diagonal reference line
ax.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve - Titanic Survival (Stacking)',
)
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()

# 11. Predict on the test set and create the submission file.
test_preds = stacking.predict(X_test)
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': test_preds
})

# Persist the submission. Previously the frame was built but never written,
# so no submission file was actually produced. The index is omitted so the
# file contains only the PassengerId and Survived columns.
submission_path = "submission.csv"
submission.to_csv(submission_path, index=False)

# 12. Export the fitted model.
# The Streamlit app loads this exact file name from the working directory,
# so keep the two in sync if the path changes.
model_path = "titanic_stacking_model.pkl"

# Make sure the directory that will actually receive the file exists.
# (Previously an unrelated, never-used "models" directory was created while
# the model itself was written to the working directory.)
model_dir = os.path.dirname(model_path)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

# joblib opens and closes the target file itself when given a path.
joblib.dump(stacking, model_path)

print(f"모델이 {model_path}에 저장되었습니다.")

app.py

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

## Page configuration: wide layout and browser-tab title
st.set_page_config(layout='wide', page_title='Titanic Survival Prediction')

st.title("Titanic Main page, Dashboard")

## Load data
@st.cache_data  # author's note: always use this decorator when loading data
def load_data():
    """Read ``train.csv`` from the working directory and return the DataFrame.

    Notes carried over from the original author:
    - extra preprocessing (e.g. dropping unneeded columns) can be added here;
    - TODO: add a variant that fetches the data from SQL instead of a local
      file.
    """
    return pd.read_csv('train.csv')

train = load_data()
# st.dataframe(train)  # debugging aid: show the raw frame

## Create the three dashboard tabs
tab_labels = ['EDA', 'Statistics', 'Prediction']
tab1, tab2, tab3 = st.tabs(tab_labels)

# EDA tab
# Each chart is drawn on an explicitly passed Axes (no reliance on pyplot's
# "current axes" global state) and its figure is closed after rendering, so
# figures do not accumulate in memory every time Streamlit reruns the script.
with tab1:
    st.header("Exploratory Data Analysis")

    # Data preview
    st.subheader("Data Preview")
    st.dataframe(train.head())

    # Survival distribution
    st.subheader("Survival Distribution")
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.countplot(data=train, x='Survived', ax=ax)
    ax.set_title("Survival Count")
    st.pyplot(fig)
    plt.close(fig)

    # Survival rate by gender
    st.subheader("Survival Rate by Gender")
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.barplot(data=train, x='Sex', y='Survived', ax=ax)
    ax.set_title("Survival Rate by Gender")
    st.pyplot(fig)
    plt.close(fig)

    # Survival rate by passenger class
    st.subheader("Survival Rate by Passenger Class")
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.barplot(data=train, x='Pclass', y='Survived', ax=ax)
    ax.set_title("Survival Rate by Passenger Class")
    st.pyplot(fig)
    plt.close(fig)

# Statistics tab
with tab2:
    st.header("Statistical Analysis")

    # Basic descriptive statistics
    st.subheader("Basic Statistics")
    st.dataframe(train.describe())

    # Correlation analysis over the int64/float64 columns
    st.subheader("Correlation Analysis")
    numeric_cols = train.select_dtypes(include=['int64', 'float64']).columns
    corr = train[numeric_cols].corr()
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
    st.pyplot(fig)
    # Close the figure after rendering so figures do not pile up in memory
    # across Streamlit reruns.
    plt.close(fig)

    # Missing-value analysis: per-column count and share of rows
    st.subheader("Missing Values Analysis")
    missing_counts = train.isnull().sum()  # computed once, reused below
    missing_data = pd.DataFrame({
        'Missing Values': missing_counts,
        'Percentage': (missing_counts / len(train)) * 100
    })
    st.dataframe(missing_data)

# Prediction tab
with tab3:
    st.header('예측')

    @st.cache_resource  # keep one loaded model object instead of reloading on every rerun
    def load_model():
        """Load the pickled stacking model from the working directory.

        NOTE(security): joblib.load unpickles arbitrary Python objects, so
        only load a model file that was produced by the training script.
        """
        return joblib.load('titanic_stacking_model.pkl')

    # Show a readable message instead of a raw traceback when the model has
    # not been trained/exported yet, and stop rendering the rest of the tab.
    try:
        model = load_model()
    except FileNotFoundError:
        st.error(
            "Model file 'titanic_stacking_model.pkl' was not found. "
            "Run the training script first to create it."
        )
        st.stop()

    ## Input form
    st.subheader('고객 정보 입력')

    # Widget ranges follow the author's note that inputs are based on
    # train.csv.
    col1, col2 = st.columns(2)

    with col1:
        pclass = st.selectbox("Passenger Class", [1, 2, 3])
        sex = st.selectbox("Sex", ["male", "female"])
        age = st.number_input("Age", min_value=0, max_value=100, value=30)

    with col2:
        sibsp = st.number_input("Number of Siblings/Spouses", min_value=0, max_value=10, value=0)
        parch = st.number_input("Number of Parents/Children", min_value=0, max_value=10, value=0)
        fare = st.number_input("Fare", min_value=0.0, max_value=500.0, value=50.0)
        embarked = st.selectbox("Port of Embarkation", ["C", "Q", "S"])

    # Predict button
    if st.button("Predict Survival"):
        # Build a one-row frame with the same column names the model was
        # trained on; the saved pipeline does its own preprocessing.
        input_data = pd.DataFrame({
            'Pclass': [pclass],
            'Sex': [sex],
            'Age': [age],
            'SibSp': [sibsp],
            'Parch': [parch],
            'Fare': [fare],
            'Embarked': [embarked]
        })

        # Predicted label and the probability of the positive class (column 1)
        prediction = model.predict(input_data)[0]
        probability = model.predict_proba(input_data)[0][1]

        # Show the result
        st.subheader("Prediction Result")
        if prediction == 1:
            st.success(f"Survival Probability: {probability:.2%}")
            st.write("The passenger is predicted to survive.")
        else:
            st.error(f"Survival Probability: {probability:.2%}")
            st.write("The passenger is predicted to not survive.")