import streamlit as st
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression
import plotly.graph_objs as go
from datasets import load_dataset
# Function to load and preprocess the dataset
def load_and_preprocess_data():
    # Load the ETTm2 dataset (15-minute transformer telemetry) from Hugging Face
    dataset = load_dataset('TroglodyteDerivations/ETTm2')
    data = dataset['train'].to_pandas()

    # Feature engineering: create lagged features for each series
    lags = 3  # Number of lags to create
    for col in ['HUFL', 'HULL', 'MUFL', 'MULL', 'LUFL', 'LULL', 'OT']:
        for lag in range(1, lags + 1):
            data[f'{col}_lag_{lag}'] = data[col].shift(lag)

    # Drop rows with NaN values created by lagging
    data = data.dropna()

    # Separate features and target variable (oil temperature, OT)
    X = data.drop(columns=['date', 'OT'])
    y = data['OT']

    # Normalization: standardize features and target independently
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()
    X_scaled = scaler_X.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1)).flatten()

    # Convert back into a DataFrame for easier manipulation
    X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

    return X_scaled, y_scaled, scaler_y
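
# Illustration only (not called by the app): how pandas' shift() builds the
# lagged columns above. shift(k) moves values down k rows, so row t holds the
# value from t-k, and the first k rows become NaN and are dropped.
def _lag_demo():
    toy = pd.DataFrame({'OT': [1.0, 2.0, 3.0, 4.0]})
    toy['OT_lag_1'] = toy['OT'].shift(1)  # NaN, 1.0, 2.0, 3.0
    toy['OT_lag_2'] = toy['OT'].shift(2)  # NaN, NaN, 1.0, 2.0
    return toy.dropna()                   # keeps rows 2 and 3, which have a full lag history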
# Function to train and evaluate the model
def train_and_evaluate_model(X_scaled, y_scaled, scaler_y):
    # Split the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

    # Initialize and train a linear regression model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Predict on the validation set
    y_pred_scaled = model.predict(X_val)

    # Prediction for the most recent row (its lagged features are already available)
    final_prediction_scaled = model.predict(X_scaled.iloc[-1].values.reshape(1, -1))
    final_prediction = scaler_y.inverse_transform(final_prediction_scaled.reshape(-1, 1))

    # Inverse transform predictions and targets back into the original scale
    y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()
    y_val_original = scaler_y.inverse_transform(y_val.reshape(-1, 1)).flatten()

    # Evaluate the model. These metrics are computed on original-scale values, so
    # they are already in original units; no further inverse transform is needed
    # (inverse-transforming an error with StandardScaler would wrongly add the mean back in).
    mse_original = mean_squared_error(y_val_original, y_pred)
    mae_original = mean_absolute_error(y_val_original, y_pred)

    # Cross-validation on the scaled data. Since y_scaled = (y - mean) / scale,
    # squared errors convert back to original units by multiplying by scale**2.
    cv_scores = -cross_val_score(model, X_scaled, y_scaled, cv=5, scoring='neg_mean_squared_error')
    cv_scores_original = cv_scores * scaler_y.scale_[0] ** 2

    return y_val_original, y_pred, mse_original, mae_original, cv_scores_original, final_prediction
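
# Sanity check for the error rescaling above (illustrative only, not called by
# the app): with y_scaled = (y - mean) / scale, residuals shrink by a factor of
# scale, so original-scale MSE equals scaled MSE times scale**2.
def _check_mse_rescaling():
    rng = np.random.default_rng(0)
    y = rng.normal(50.0, 5.0, size=1000)
    s = StandardScaler()
    y_s = s.fit_transform(y.reshape(-1, 1)).flatten()
    pred_s = y_s + rng.normal(0.0, 0.1, size=y_s.shape)  # synthetic scaled predictions
    pred = s.inverse_transform(pred_s.reshape(-1, 1)).flatten()
    mse_scaled = mean_squared_error(y_s, pred_s)
    mse_orig = mean_squared_error(y, pred)
    assert np.isclose(mse_orig, mse_scaled * s.scale_[0] ** 2)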
# Function to perform feature selection
def feature_selection(X_scaled, y_scaled):
    # Score each feature against the target and keep the top 10
    selector = SelectKBest(score_func=f_regression, k=10)
    X_selected = selector.fit_transform(X_scaled, y_scaled)

    # Map the selected column indices back to feature names
    selected_indices = selector.get_support(indices=True)
    selected_features = X_scaled.columns[selected_indices]

    return selected_features, X_selected
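
# Sketch only (hypothetical helper, not part of the original app): one way to
# use X_selected downstream is to cross-validate a fresh linear model on the
# reduced feature matrix and compare against the full model's CV scores.
def _evaluate_selected_features(X_selected, y_scaled, cv=5):
    reduced_model = LinearRegression()
    # Negate neg_mean_squared_error to get plain (scaled-unit) MSE per fold
    scores = -cross_val_score(reduced_model, X_selected, y_scaled, cv=cv,
                              scoring='neg_mean_squared_error')
    return scores.mean()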
# Function to create visualizations
def create_visualizations(y_val_original, y_pred):
    # Create a DataFrame for visualization
    df_val_lr = pd.DataFrame({
        'Actual': y_val_original.flatten(),
        'Predicted': y_pred.flatten()
    })

    # Scatter plot: actual vs. predicted for linear regression
    scatter_plot_lr = go.Figure()
    scatter_plot_lr.add_trace(go.Scatter(x=df_val_lr['Actual'], y=df_val_lr['Predicted'],
                                         mode='markers', name='Actual vs. Predicted',
                                         marker=dict(color='orange')))
    scatter_plot_lr.add_trace(go.Scatter(x=[df_val_lr['Actual'].min(), df_val_lr['Actual'].max()],
                                         y=[df_val_lr['Actual'].min(), df_val_lr['Actual'].max()],
                                         mode='lines', name='Ideal', line=dict(color='black')))
    scatter_plot_lr.update_layout(
        title='Actual vs. Predicted Oil Temperature (Linear Regression)',
        xaxis_title='Actual Oil Temperature',
        yaxis_title='Predicted Oil Temperature',
        plot_bgcolor='white'
    )

    # Residual plot for linear regression
    residuals_lr = df_val_lr['Actual'] - df_val_lr['Predicted']
    residual_plot_lr = go.Figure()
    residual_plot_lr.add_trace(go.Scatter(x=df_val_lr['Actual'], y=residuals_lr,
                                          mode='markers', name='Residuals',
                                          marker=dict(color='orange')))
    residual_plot_lr.add_trace(go.Scatter(x=[df_val_lr['Actual'].min(), df_val_lr['Actual'].max()],
                                          y=[0, 0], mode='lines', name='Zero Residual Line',
                                          line=dict(color='black')))
    residual_plot_lr.update_layout(
        title='Residual Plot (Linear Regression)',
        xaxis_title='Actual Oil Temperature',
        yaxis_title='Residuals',
        plot_bgcolor='white'
    )

    # Time series plot. Note: train_test_split shuffles rows, so this timestamp
    # axis is synthetic and for display only; '15min' matches the dataset's
    # native sampling interval but does not restore the true ordering.
    df_val_lr['Timestamp'] = pd.date_range(start='2016-01-01', periods=len(df_val_lr), freq='15min')
    time_series_plot_lr = go.Figure()
    time_series_plot_lr.add_trace(go.Scatter(x=df_val_lr['Timestamp'], y=df_val_lr['Actual'],
                                             mode='lines', name='Actual', line=dict(color='orange')))
    time_series_plot_lr.add_trace(go.Scatter(x=df_val_lr['Timestamp'], y=df_val_lr['Predicted'],
                                             mode='lines', name='Predicted', line=dict(color='black')))
    time_series_plot_lr.update_layout(
        title='Time Series Plot of Actual vs. Predicted Oil Temperature (Linear Regression)',
        xaxis_title='Timestamp',
        yaxis_title='Oil Temperature',
        plot_bgcolor='white'
    )

    return scatter_plot_lr, residual_plot_lr, time_series_plot_lr
# Streamlit App
def main():
    st.title("ETTm2 Dataset Analysis and Prediction")
    st.image('csb_goldfish_ETTm2.png', caption='Pepperidge Farm Chilean Sea Bass')

    # Load and preprocess the dataset
    X_scaled, y_scaled, scaler_y = load_and_preprocess_data()

    # Train and evaluate the model
    y_val_original, y_pred, mse_original, mae_original, cv_scores_original, final_prediction = train_and_evaluate_model(X_scaled, y_scaled, scaler_y)

    # Display evaluation metrics
    st.write(f"Mean Squared Error (Original Scale): {mse_original:.4f}")
    st.write(f"Mean Absolute Error (Original Scale): {mae_original:.4f}")
    st.write(f"Mean Cross-Validation MSE (Original Scale): {np.mean(cv_scores_original):.4f}")
    st.write(f"Final Predicted Oil Temperature: {final_prediction[0][0]:.4f}")

    # Perform feature selection and report the retained feature names
    selected_features, X_selected = feature_selection(X_scaled, y_scaled)
    st.write(f"Selected Features: {list(selected_features)}")

    # Create and display visualizations
    scatter_plot_lr, residual_plot_lr, time_series_plot_lr = create_visualizations(y_val_original, y_pred)
    st.plotly_chart(scatter_plot_lr)
    st.plotly_chart(residual_plot_lr)
    st.plotly_chart(time_series_plot_lr)


if __name__ == "__main__":
    main()
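
# To launch the app locally (assuming this file is saved as app.py):
#   streamlit run app.py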