import pickle
import warnings

import streamlit as st
import pandas as pd
import numpy as np
import shap
import matplotlib.pyplot as plt

from catboost import CatBoostClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
class CustomFeatureTransformer(BaseEstimator, TransformerMixin):
    """Adds is-null indicators, mean-imputes numerical columns and one-hot encodes categorical columns."""

    def __init__(self, verbose=False):
        self.verbose = verbose
        self.column_means_ = None

    def fit(self, X, y=None):
        X_copy = X.copy()
        self.numerical_columns = list(X_copy.select_dtypes(include=np.number).columns)
        self.categorical_columns = list(X_copy.select_dtypes(exclude=np.number).columns)
        # Drop high-cardinality categorical columns (> 100 unique values).
        # Iterate over a copy so removing items does not skip elements.
        for col in list(self.categorical_columns):
            if X_copy[col].nunique() > 100:
                self.categorical_columns.remove(col)
                if self.verbose:
                    print(f'removed {col} with {X_copy[col].nunique()} unique values')
        # Store the mean of each numerical column for imputation.
        self.column_means_ = X_copy[self.numerical_columns].mean().fillna(0)
        self.onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.onehot_encoder.fit(X_copy[self.categorical_columns])
        return self

    def transform(self, X):
        X_copy = X.copy()
        X_copy.reset_index(drop=True, inplace=True)
        result_dfs = []
        # For each numerical column: add an is-null indicator and a mean-imputed value.
        for col in self.numerical_columns:
            is_null = X_copy[col].isna()
            result_dfs.append(pd.DataFrame({f"{col}_is_null": is_null.astype(int)}))
            filled_values = X_copy[col].fillna(self.column_means_[col])
            result_dfs.append(pd.DataFrame({f"{col}_value": filled_values}))
        # Add non-numerical columns using one-hot encoding.
        result_dfs.append(pd.DataFrame(
            self.onehot_encoder.transform(X_copy[self.categorical_columns]),
            columns=self.onehot_encoder.get_feature_names_out(),
        ))
        # Concatenate all transformed features.
        df = pd.concat(result_dfs, axis=1)
        assert not df.isna().any().any()
        return df
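
# Illustrative usage sketch for CustomFeatureTransformer (column names here are
# hypothetical, and the snippet is intentionally commented out so it never runs
# as part of the app):
#
#     demo = pd.DataFrame({
#         "pressure": [1.0, np.nan, 3.0],  # numerical -> pressure_is_null, pressure_value
#         "station": ["A", "B", "A"],      # categorical -> one-hot station_* columns
#     })
#     demo_features = CustomFeatureTransformer().fit(demo).transform(demo)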
class DayNumberTransformer:
    """Adds a 'week_number' feature derived from the message timestamp."""

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        X['message_timestamp'] = pd.to_datetime(X['message_timestamp'])
        # '%U %w' encodes the week of the year and the day of the week.
        X['week_number'] = X['message_timestamp'].dt.strftime('%U %w')
        return X
class WeatherTransformer:
    """Joins hourly weather data onto the input by timestamp (rounded to the hour)."""

    def __init__(self, weather):
        self.weather = weather
        self.weather['date'] = pd.to_datetime(self.weather['date']).dt.tz_convert('Europe/Berlin')

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        # Localize the message timestamps to Europe/Berlin and round to the hour.
        X['message_timestamp'] = pd.to_datetime(X['message_timestamp']).dt.tz_localize('Europe/Berlin')
        X['message_timestamp'] = X['message_timestamp'].dt.round('h')
        # Join the weather data on message_timestamp == date.
        X = X.merge(self.weather, left_on='message_timestamp', right_on='date', how='left')
        # Report rows that received no weather data from the left join.
        if X['temperature_2m'].isna().sum() > 0:
            print("Number of rows without weather data: ", X['temperature_2m'].isna().sum())
        return X
class TopFeaturesSelector:
    """Keeps only the pre-selected top features."""

    def __init__(self, top_features):
        self.top_features = top_features

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.top_features]
warnings.filterwarnings("ignore")

weather_file = 'hourly_data.csv'
shap_importance_file = 'shap_importance.csv'

weather = pd.read_csv(weather_file)
shap_importance_df = pd.read_csv(shap_importance_file)
print(shap_importance_df.head())

# The 25 most important features according to the precomputed SHAP importances.
top_features = shap_importance_df['Feature'].head(25).values

# Load the pre-trained CatBoost model and the pre-fitted preprocessing steps.
catboost = CatBoostClassifier().load_model('catboost_model.cbm')
scaler = pickle.load(open('scaler.pkl', 'rb'))
custom_feature_transformer = pickle.load(open('customfeatureselector.pkl', 'rb'))

# Define the sklearn pipeline from the pre-fitted components.
pipe = make_pipeline(
    WeatherTransformer(weather),
    DayNumberTransformer(),
    custom_feature_transformer,
    TopFeaturesSelector(top_features),
    scaler,
    catboost,
)
def egor_plots(X_test, k=1000):
    # Preprocess X_test: run the pipeline up to the scaler, then scale separately
    # so that the unscaled frame keeps its column names for SHAP.
    X_prescaled = pipe[:-2].transform(X_test)[:k]
    X_test_preprocessed = pipe[-2].transform(X_prescaled)

    # SHAP analysis on the CatBoost model (the last pipeline step)
    st.write("SHAP Analysis... This may take a couple of minutes depending on the number of samples.")
    explainer = shap.TreeExplainer(pipe[-1])
    shap_values = explainer(X_test_preprocessed)
    shap_values.feature_names = list(X_prescaled.columns)

    # SHAP summary plot
    st.write("### SHAP Summary Plot")
    shap.summary_plot(shap_values, X_test_preprocessed, show=False)
    st.pyplot(plt.gcf())
    plt.clf()

    # SHAP scatter plots for each of the top features
    st.write("### SHAP Scatter Plots")
    for i, feature_name in enumerate(top_features):
        st.write(f"#### Scatter Plot for Feature: {feature_name}")
        fig, ax = plt.subplots()
        shap.plots.scatter(shap_values[:, i], show=False, ax=ax)
        ax.axhline(y=0, color='r', linestyle='--')
        ax.axvline(x=0, color='g', linestyle='--')
        st.pyplot(fig)
# Streamlit app
st.title("BMW Hackathon Defect Detection")
st.write("### Upload your tabular data")

# File uploader
uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])

# Radio button for the prediction type
prediction_type = st.radio(
    "Select prediction type",
    ["predict", "predict_proba"],
    index=0,
)
k = st.slider("Number of samples for SHAP plots", min_value=10, max_value=1000, value=100)

if uploaded_file:
    # Load the uploaded file
    data = pd.read_csv(uploaded_file)
    st.write("Uploaded Data:")
    st.write(data.head())

    st.write("Predicting...")
    if prediction_type == 'predict':
        y_pred = pipe.predict(data)
        # status: 1 -> OK, 0 -> NOK
        status = pd.Series(['OK' if pred == 1 else 'NOK' for pred in y_pred])
    elif prediction_type == 'predict_proba':
        status = pipe.predict_proba(data)[:, 1]
    else:
        raise ValueError(f"Invalid prediction type: {prediction_type}")

    res = pd.DataFrame({
        "physical_part_id": data["physical_part_id"],
        "status": status,
    })
    st.write("### Results")
    st.write(res.head())

    # Download the predictions as CSV
    csv = res.to_csv(index=False)
    st.download_button(
        label="Download predictions as CSV",
        data=csv,
        file_name="predictions.csv",
        mime="text/csv",
    )

    st.write("### SHAP plots")
    egor_plots(data, k)
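
# Usage note (assuming this script is saved as app.py): launch the app with
#     streamlit run app.py
# The expected artifacts next to the script are hourly_data.csv, shap_importance.csv,
# catboost_model.cbm, scaler.pkl and customfeatureselector.pkl.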