"""Streamlit app for the BMW Hackathon defect-detection model.

Loads pre-fitted artifacts (CatBoost model, scaler, feature transformer),
rebuilds the inference pipeline, and serves predictions plus SHAP plots
for an uploaded CSV of parts data.
"""
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
import streamlit as st
from catboost import CatBoostClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

warnings.filterwarnings("ignore")


class CustomFeatureTransformer(BaseEstimator, TransformerMixin):
    """Expands numerical columns into (is_null, value) pairs and one-hot
    encodes low-cardinality categorical columns."""

    def __init__(self, verbose=False):
        self.verbose = verbose
        self.column_means_ = None

    def fit(self, X, y=None):
        X_copy = X.copy()
        self.numerical_columns = list(X_copy.select_dtypes(include=np.number).columns)
        self.categorical_columns = list(X_copy.select_dtypes(exclude=np.number).columns)
        # Drop categorical columns with more than 100 unique values.
        # (Build a new list rather than removing from the list being
        # iterated, which would silently skip elements.)
        kept = []
        for col in self.categorical_columns:
            if X_copy[col].nunique() > 100:
                if self.verbose:
                    print(f"removed {col} with {X_copy[col].nunique()} unique values")
            else:
                kept.append(col)
        self.categorical_columns = kept
        # Store per-column means for imputation at transform time.
        self.column_means_ = X_copy[self.numerical_columns].mean().fillna(0)
        self.onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.onehot_encoder.fit(X_copy[self.categorical_columns])
        return self

    def transform(self, X):
        X_copy = X.copy()
        X_copy.reset_index(drop=True, inplace=True)
        result_dfs = []
        for col in self.numerical_columns:
            # Missingness indicator plus mean-imputed value for each numerical column.
            is_null = X_copy[col].isna()
            result_dfs.append(pd.DataFrame({f"{col}_is_null": is_null.astype(int)}))
            filled_values = X_copy[col].fillna(self.column_means_[col])
            result_dfs.append(pd.DataFrame({f"{col}_value": filled_values}))
        # One-hot encode the retained categorical columns.
        result_dfs.append(pd.DataFrame(
            self.onehot_encoder.transform(X_copy[self.categorical_columns]),
            columns=self.onehot_encoder.get_feature_names_out(),
        ))
        # Concatenate all transformed features.
        df = pd.concat(result_dfs, axis=1)
        assert not df.isna().any().any()
        return df


class DayNumberTransformer(BaseEstimator, TransformerMixin):
    """Adds a 'week_number' feature ('%U %w': week of year plus weekday)."""

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        X['message_timestamp'] = pd.to_datetime(X['message_timestamp'])
        X['week_number'] = X['message_timestamp'].dt.strftime('%U %w')
        return X


class WeatherTransformer(BaseEstimator, TransformerMixin):
    """Joins hourly weather data onto the rows by message timestamp."""

    def __init__(self, weather):
        # Copy so the caller's DataFrame is not mutated.
        self.weather = weather.copy()
        self.weather['date'] = pd.to_datetime(self.weather['date']).dt.tz_convert('Europe/Berlin')

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        # Round timestamps to the hour so they match the hourly weather grid.
        X['message_timestamp'] = pd.to_datetime(X['message_timestamp']).dt.tz_localize('Europe/Berlin')
        X['message_timestamp'] = X['message_timestamp'].dt.round('h')
        # Join weather data on message_timestamp == date.
        X = X.merge(self.weather, left_on='message_timestamp', right_on='date', how='left')
        # Warn about rows that found no matching weather record.
        if X['temperature_2m'].isna().sum() > 0:
            print("Number of rows without weather data: ", X['temperature_2m'].isna().sum())
        return X


class TopFeaturesSelector(BaseEstimator, TransformerMixin):
    """Keeps only the SHAP-selected top features."""

    def __init__(self, top_features):
        self.top_features = top_features

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.top_features]


weather_file = 'hourly_data.csv'
shap_importance_file = 'shap_importance.csv'
weather = pd.read_csv(weather_file)
shap_importance_df = pd.read_csv(shap_importance_file)
print(shap_importance_df.head())
top_features = shap_importance_df['Feature'].head(25).values

# Load the pre-fitted artifacts.
catboost = CatBoostClassifier().load_model('catboost_model.cbm')
with open('scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)
with open('customfeatureselector.pkl', 'rb') as f:
    custom_feature_transformer = pickle.load(f)

# Assemble the sklearn inference pipeline.
pipe = make_pipeline(
    WeatherTransformer(weather),
    DayNumberTransformer(),
    custom_feature_transformer,
    TopFeaturesSelector(top_features),
    scaler,
    catboost,
)


def egor_plots(X_test, k=1000):
    # Run everything up to (but excluding) the scaler, keep the first k rows,
    # then apply the scaler separately so the feature names survive.
    X_prescaled = pipe[:-2].transform(X_test)[:k]
    X_test_preprocessed = pipe[-2].transform(X_prescaled)

    st.write("SHAP Analysis... This may take a couple of minutes depending on the number of samples.")
    explainer = shap.TreeExplainer(pipe[-1])
    shap_values = explainer(X_test_preprocessed)
    shap_values.feature_names = list(X_prescaled.columns)

    # SHAP summary plot. summary_plot returns None, so grab the current figure.
    st.write("### SHAP Summary Plot")
    shap.summary_plot(shap_values, X_test_preprocessed, show=False)
    st.pyplot(plt.gcf())
    plt.clf()

    # Per-feature SHAP scatter plots.
    st.write("### SHAP Scatter Plots")
    for i, feature_name in enumerate(top_features):
        st.write(f"#### Scatter Plot for Feature: {feature_name}")
        fig, ax = plt.subplots()
        shap.plots.scatter(shap_values[:, i], show=False, ax=ax)
        ax.axhline(y=0, color='r', linestyle='--')
        ax.axvline(x=0, color='g', linestyle='--')
        st.pyplot(fig)
        plt.close(fig)


# Streamlit app
st.title("BMW Hackathon Defect Detection")
st.write("### Upload your tabular data")

# File uploader
uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])

# Radio button for prediction type
prediction_type = st.radio(
    "Select prediction type",
    ["predict", "predict_proba"],
    index=0,
)

k = st.slider("Number of samples for SHAP plots", min_value=10, max_value=1000, value=100)

if uploaded_file:
    # Load the uploaded file
    data = pd.read_csv(uploaded_file)
    st.write("Uploaded Data:")
    st.write(data.head())

    st.write("Predicting...")
    if prediction_type == 'predict':
        y_pred = pipe.predict(data)
        # Label 1 -> OK, 0 -> NOK
        status = pd.Series(['OK' if pred == 1 else 'NOK' for pred in y_pred])
    elif prediction_type == 'predict_proba':
        status = pipe.predict_proba(data)[:, 1]
    else:
        raise ValueError(f"Invalid prediction type: {prediction_type}")

    res = pd.DataFrame(
        {"physical_part_id": data["physical_part_id"], "status": status}
    )
    st.write("### Results")
    st.write(res.head())

    # Offer the predictions as a CSV download.
    csv = res.to_csv(index=False)
    st.download_button(
        label="Download predictions as CSV",
        data=csv,
        file_name="predictions.csv",
        mime="text/csv",
    )

    st.write("### SHAP plots")
    egor_plots(data, k)