"""Streamlit app that compares feature importances across three models.

The user uploads a CSV file that contains a ``target`` column. Every other
column is treated as a feature. The app then shows a correlation heatmap,
one bar chart of feature importance per model (Linear Regression, CART,
Random Forest), the importance table, and a button to download that table
as an Excel file.
"""
from io import BytesIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Let the user upload a CSV file.
uploaded_file = st.file_uploader("上傳一個 CSV 檔案", type="csv")

if uploaded_file is not None:
    # Read the uploaded CSV file.
    df = pd.read_csv(uploaded_file)

    # Make sure the data has a "target" column.
    if 'target' in df.columns:
        # Prepare features and target variable.
        X = df.drop('target', axis=1)
        y = df['target']

        # Split the data.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Standardize the features (fit on the training split only).
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        # Not used further down; kept so the script exposes the same names.
        X_test_scaled = scaler.transform(X_test)

        def calculate_importance():
            """Fit the three models and return their per-feature importances.

            Reads ``X_train``, ``X_train_scaled`` and ``y_train`` from the
            enclosing script scope.

            Returns:
                A ``(lr_importance, cart_importance, rf_importance)`` tuple:
                the absolute Linear Regression coefficients (fitted on the
                standardized features), and the ``feature_importances_`` of
                the decision tree and of the random forest (both fitted on
                the unscaled features).
            """
            # Linear Regression: coefficient magnitude on standardized inputs.
            lr = LinearRegression()
            lr.fit(X_train_scaled, y_train)
            lr_importance = np.abs(lr.coef_)

            # CART
            cart = DecisionTreeClassifier(random_state=42)
            cart.fit(X_train, y_train)
            cart_importance = cart.feature_importances_

            # Random Forest
            rf = RandomForestClassifier(n_estimators=100, random_state=42)
            rf.fit(X_train, y_train)
            rf_importance = rf.feature_importances_

            return lr_importance, cart_importance, rf_importance

        # Build the feature-importance DataFrame.
        lr_importance, cart_importance, rf_importance = calculate_importance()
        feature_importance = pd.DataFrame({
            'Feature': X.columns,
            'Linear Regression': lr_importance,
            'CART': cart_importance,
            'Random Forest': rf_importance,
        })

        # Sort by Random Forest importance, most important first.
        feature_importance = feature_importance.sort_values(
            'Random Forest', ascending=False
        )

        # Draw the correlation matrix.
        st.write("### 相關矩陣")
        # Restrict to numeric/bool columns so that a non-numeric column cannot
        # make ``corr`` raise; for all-numeric data the result is unchanged.
        corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()
        # Use an explicit figure instead of pyplot's global "current figure",
        # and close it after rendering so figures do not pile up on reruns.
        corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(
            corr_matrix,
            annot=True,
            cmap='coolwarm',
            fmt='.2f',
            linewidths=0.5,
            ax=corr_ax,
        )
        st.pyplot(corr_fig)
        plt.close(corr_fig)

        def plot_individual_model(model_name):
            """Render a bar chart of one model's feature importances.

            Args:
                model_name: Name of a column in ``feature_importance``
                    ('Linear Regression', 'CART' or 'Random Forest'); it is
                    also used in the chart title.
            """
            fig, ax = plt.subplots(figsize=(10, 6))
            ax.bar(feature_importance['Feature'], feature_importance[model_name])
            ax.set_title(f'{model_name} Feature Importance')
            ax.set_xlabel('Features')
            ax.set_ylabel('Importance')
            plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
            st.pyplot(fig)
            # The image has already been sent to Streamlit; release the figure.
            plt.close(fig)

        # Streamlit UI
        st.write("### 特徵重要性分析")

        # Show the three models' feature-importance charts separately.
        st.write("#### Linear Regression")
        plot_individual_model('Linear Regression')

        st.write("#### CART (Decision Tree)")
        plot_individual_model('CART')

        st.write("#### Random Forest")
        plot_individual_model('Random Forest')

        # Show the data table.
        st.write("### 特徵重要性數據表")
        st.dataframe(feature_importance)

        def to_excel(df):
            """Serialize ``df`` to an in-memory ``.xlsx`` file.

            Args:
                df: DataFrame to write; the index is not written.

            Returns:
                The bytes of an Excel workbook with a single sheet named
                'Feature Importance'.
            """
            output = BytesIO()
            # The context manager saves and closes the workbook even if
            # ``to_excel`` raises part-way through.
            with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
                df.to_excel(writer, index=False, sheet_name='Feature Importance')
            return output.getvalue()

        # Let the user download the feature importances as an Excel file.
        excel_data = to_excel(feature_importance)
        st.download_button(
            label='下載特徵重要性數據為 Excel 檔案',
            data=excel_data,
            file_name='feature_importance.xlsx',
            mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        )
    else:
        st.error("上傳的檔案中找不到 'target' 欄位,請確認檔案格式。")