import streamlit as st
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression
import plotly.graph_objs as go
from datasets import load_dataset
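# Assumed dependencies for this app (a sketch, not a pinned requirements list):
#   streamlit, numpy, pandas, scikit-learn, plotly, datasets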
# Function to load and preprocess the dataset
def load_and_preprocess_data():
    # Load the ETTm2 dataset from Hugging Face
    dataset = load_dataset('TroglodyteDerivations/ETTm2')
    data = dataset['train'].to_pandas()

    # Feature engineering: create lagged features
    lags = 3  # Number of lags to create
    for col in ['HUFL', 'HULL', 'MUFL', 'MULL', 'LUFL', 'LULL', 'OT']:
        for lag in range(1, lags + 1):
            data[f'{col}_lag_{lag}'] = data[col].shift(lag)

    # Drop rows with NaN values created by lagging
    data = data.dropna()

    # Separate features and target variable
    X = data.drop(columns=['date', 'OT'])
    y = data['OT']

    # Normalization
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()
    X_scaled = scaler_X.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1)).flatten()

    # Convert back into a DataFrame for easier manipulation
    X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

    return X_scaled, y_scaled, scaler_y
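# Hypothetical standalone sanity check (assumes the TroglodyteDerivations/ETTm2
# dataset is reachable and contains the columns used above):
#   X_scaled, y_scaled, scaler_y = load_and_preprocess_data()
#   print(X_scaled.shape, y_scaled.shape)  # lagged feature matrix and scaled target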
# Function to train and evaluate the model
def train_and_evaluate_model(X_scaled, y_scaled, scaler_y):
    # Split the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

    # Initialize and train a linear regression model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Predict on the validation set
    y_pred_scaled = model.predict(X_val)

    # Final prediction from the most recent row (kept as a DataFrame so feature names are preserved)
    final_prediction_scaled = model.predict(X_scaled.iloc[[-1]])
    final_prediction = scaler_y.inverse_transform(final_prediction_scaled.reshape(-1, 1))

    # Inverse transform to get the predictions in the original scale
    y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()
    y_val_original = scaler_y.inverse_transform(y_val.reshape(-1, 1)).flatten()

    # Evaluate the model; y_val_original and y_pred are already in the original
    # scale, so the metrics need no further transformation
    mse_original = mean_squared_error(y_val_original, y_pred)
    mae_original = mean_absolute_error(y_val_original, y_pred)

    # Perform cross-validation on the scaled target
    cv_scores = cross_val_score(model, X_scaled, y_scaled, cv=5, scoring='neg_mean_squared_error')
    cv_scores = -cv_scores
    # A squared error on a StandardScaler-transformed target converts back to the
    # original scale by multiplying with the square of the target's standard deviation
    cv_scores_original = cv_scores * (scaler_y.scale_[0] ** 2)

    return y_val_original, y_pred, mse_original, mae_original, cv_scores_original, final_prediction
# Function to perform feature selection
def feature_selection(X_scaled, y_scaled, scaler_y):
    # Perform feature selection: keep the top 10 features by univariate F-test
    selector = SelectKBest(score_func=f_regression, k=10)
    X_selected = selector.fit_transform(X_scaled, y_scaled)

    # Get the indices (and names) of the selected features
    selected_indices = selector.get_support(indices=True)
    selected_features = X_scaled.columns[selected_indices]

    return selected_features, X_selected
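# Hypothetical follow-up (not wired into the app): refit the regression on the
# selected columns only, e.g.
#   X_reduced = X_scaled[selected_features]
#   reduced_model = LinearRegression().fit(X_reduced, y_scaled)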
# Function to create visualizations
def create_visualizations(y_val_original, y_pred):
    # Create a DataFrame for visualization
    df_val_lr = pd.DataFrame({
        'Actual': y_val_original.flatten(),
        'Predicted': y_pred.flatten()
    })

    # Scatter plot: actual vs. predicted for linear regression
    scatter_plot_lr = go.Figure()
    scatter_plot_lr.add_trace(go.Scatter(
        x=df_val_lr['Actual'], y=df_val_lr['Predicted'],
        mode='markers', name='Actual vs. Predicted', marker=dict(color='orange')))
    scatter_plot_lr.add_trace(go.Scatter(
        x=[df_val_lr['Actual'].min(), df_val_lr['Actual'].max()],
        y=[df_val_lr['Actual'].min(), df_val_lr['Actual'].max()],
        mode='lines', name='Ideal', line=dict(color='black')))
    scatter_plot_lr.update_layout(
        title='Actual vs. Predicted Oil Temperature (Linear Regression)',
        xaxis_title='Actual Oil Temperature',
        yaxis_title='Predicted Oil Temperature',
        plot_bgcolor='white'
    )

    # Residual plot for linear regression
    residuals_lr = df_val_lr['Actual'] - df_val_lr['Predicted']
    residual_plot_lr = go.Figure()
    residual_plot_lr.add_trace(go.Scatter(
        x=df_val_lr['Actual'], y=residuals_lr,
        mode='markers', name='Residuals', marker=dict(color='orange')))
    residual_plot_lr.add_trace(go.Scatter(
        x=[df_val_lr['Actual'].min(), df_val_lr['Actual'].max()], y=[0, 0],
        mode='lines', name='Zero Residual Line', line=dict(color='black')))
    residual_plot_lr.update_layout(
        title='Residual Plot (Linear Regression)',
        xaxis_title='Actual Oil Temperature',
        yaxis_title='Residuals',
        plot_bgcolor='white'
    )

    # Time series plot for linear regression
    # (synthetic hourly timestamps are generated purely for display)
    df_val_lr['Timestamp'] = pd.date_range(start='2016-01-01', periods=len(df_val_lr), freq='h')
    time_series_plot_lr = go.Figure()
    time_series_plot_lr.add_trace(go.Scatter(
        x=df_val_lr['Timestamp'], y=df_val_lr['Actual'],
        mode='lines', name='Actual', line=dict(color='orange')))
    time_series_plot_lr.add_trace(go.Scatter(
        x=df_val_lr['Timestamp'], y=df_val_lr['Predicted'],
        mode='lines', name='Predicted', line=dict(color='black')))
    time_series_plot_lr.update_layout(
        title='Time Series Plot of Actual vs. Predicted Oil Temperature (Linear Regression)',
        xaxis_title='Timestamp',
        yaxis_title='Oil Temperature',
        plot_bgcolor='white'
    )

    return scatter_plot_lr, residual_plot_lr, time_series_plot_lr
# Streamlit App
def main():
    st.title("ETTm2 Dataset Analysis and Prediction")
    st.image('csb_goldfish_ETTm2.png', caption='Pepperidge Farm Chilean Sea Bass')

    # Load and preprocess the dataset
    X_scaled, y_scaled, scaler_y = load_and_preprocess_data()

    # Train and evaluate the model
    y_val_original, y_pred, mse_original, mae_original, cv_scores_original, final_prediction = train_and_evaluate_model(X_scaled, y_scaled, scaler_y)

    # Display evaluation metrics
    st.write(f"Mean Squared Error (Original Scale): {mse_original}")
    st.write(f"Mean Absolute Error (Original Scale): {mae_original}")
    st.write(f"Mean Cross-Validation MSE (Original Scale): {np.mean(cv_scores_original)}")
    st.write(f"Final Predicted Oil Temperature: {final_prediction[0][0]}")

    # Perform feature selection
    selected_features, X_selected = feature_selection(X_scaled, y_scaled, scaler_y)
    st.write(f"Selected Features: {list(selected_features)}")

    # Create visualizations
    scatter_plot_lr, residual_plot_lr, time_series_plot_lr = create_visualizations(y_val_original, y_pred)

    # Display visualizations
    st.plotly_chart(scatter_plot_lr)
    st.plotly_chart(residual_plot_lr)
    st.plotly_chart(time_series_plot_lr)


if __name__ == "__main__":
    main()
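# To launch the app locally (assuming the dependencies listed above are installed):
#   streamlit run app.py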