|
import streamlit as st |
|
import numpy as np |
|
import pandas as pd |
|
from sklearn.preprocessing import StandardScaler |
|
from sklearn.linear_model import LinearRegression |
|
from sklearn.model_selection import train_test_split, cross_val_score |
|
from sklearn.metrics import mean_squared_error, mean_absolute_error |
|
from sklearn.feature_selection import SelectKBest, f_regression |
|
import plotly.graph_objs as go |
|
from datasets import load_dataset |
|
|
|
|
|
def load_and_preprocess_data(lags: int = 3, dataset_name: str = 'TroglodyteDerivations/ETTm2'):
    """Load the training split, add lag features, and standardize features and target.

    Args:
        lags: Number of lagged copies (shifts 1..lags) added for every series
            column. Defaults to 3.
        dataset_name: Identifier passed to ``load_dataset``.

    Returns:
        A tuple ``(X_scaled, y_scaled, scaler_y)``:
        ``X_scaled`` is a DataFrame of standardized features (every column
        except ``date`` and ``OT``, plus the lag columns) with a fresh
        0-based index; ``y_scaled`` is a 1-D array holding the standardized
        ``OT`` column; ``scaler_y`` is the fitted target scaler, needed to map
        predictions back to original units.

    Raises:
        ValueError: If ``lags`` is negative.
    """
    if lags < 0:
        raise ValueError(f"lags must be non-negative, got {lags}")

    series_columns = ('HUFL', 'HULL', 'MUFL', 'MULL', 'LUFL', 'LULL', 'OT')

    data = load_dataset(dataset_name)['train'].to_pandas()

    # Build every lag column first and attach them in one concat; this keeps
    # the same column order as inserting them one by one (column-major, then
    # lag 1..lags) without repeatedly growing the frame.
    lagged = {}
    for col in series_columns:
        for lag in range(1, lags + 1):
            lagged[f'{col}_lag_{lag}'] = data[col].shift(lag)
    if lagged:
        data = pd.concat([data, pd.DataFrame(lagged)], axis=1)

    # Shifting leaves NaN in the first `lags` rows; drop every incomplete row.
    data = data.dropna()

    X = data.drop(columns=['date', 'OT'])
    y = data['OT']

    scaler_X = StandardScaler()
    scaler_y = StandardScaler()

    # StandardScaler returns a bare ndarray, so re-wrap to keep column names.
    X_scaled = pd.DataFrame(scaler_X.fit_transform(X), columns=X.columns)
    # The scaler expects a 2-D input; flatten back to 1-D afterwards.
    y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1)).flatten()

    return X_scaled, y_scaled, scaler_y
|
|
|
|
|
def train_and_evaluate_model(X_scaled, y_scaled, scaler_y):
    """Fit a linear regression on standardized data and report errors in original units.

    Args:
        X_scaled: DataFrame of standardized features.
        y_scaled: 1-D array of the standardized target, row-aligned with
            ``X_scaled``.
        scaler_y: Fitted ``StandardScaler`` for the target, used to convert
            predictions and error metrics back to original units.

    Returns:
        A tuple ``(y_val_original, y_pred, mse_original, mae_original,
        cv_scores_original, final_prediction)``:
        the validation targets and predictions in original units (1-D
        arrays), the validation MSE and MAE in original units, the per-fold
        5-fold cross-validation MSE in original units (1-D array), and the
        prediction for the last row of ``X_scaled`` in original units with
        shape ``(1, 1)``.
    """
    # NOTE(review): train_test_split shuffles rows by default, so the
    # validation rows are not in chronological order.
    X_train, X_val, y_train, y_val = train_test_split(
        X_scaled, y_scaled, test_size=0.2, random_state=42
    )

    model = LinearRegression()
    model.fit(X_train, y_train)

    # Map validation predictions and targets back to original units.
    y_pred = scaler_y.inverse_transform(model.predict(X_val).reshape(-1, 1)).flatten()
    y_val_original = scaler_y.inverse_transform(y_val.reshape(-1, 1)).flatten()

    # Both inputs are already in original units, so these metrics are too.
    # They must NOT go through scaler_y.inverse_transform: that applies
    # x * scale + mean, which is only meaningful for target values, not for
    # error magnitudes.
    mse_original = mean_squared_error(y_val_original, y_pred)
    mae_original = mean_absolute_error(y_val_original, y_pred)

    # Cross-validation runs on standardized targets, so its MSE is in
    # standardized units. Since y_scaled = (y - mean) / scale, a squared error
    # converts back by multiplying with scale ** 2 (no mean shift).
    cv_mse_scaled = -cross_val_score(
        model, X_scaled, y_scaled, cv=5, scoring='neg_mean_squared_error'
    )
    target_scale = scaler_y.scale_
    # scale_ is None when the scaler was built with with_std=False.
    variance_factor = 1.0 if target_scale is None else float(target_scale[0]) ** 2
    cv_scores_original = cv_mse_scaled * variance_factor

    # Select the last row with a list indexer so it stays a one-row DataFrame;
    # the model was fitted with feature names and warns on a bare ndarray.
    last_row = X_scaled.iloc[[-1]]
    final_prediction = scaler_y.inverse_transform(
        model.predict(last_row).reshape(-1, 1)
    )

    return y_val_original, y_pred, mse_original, mae_original, cv_scores_original, final_prediction
|
|
|
|
|
def feature_selection(X_scaled, y_scaled, scaler_y):
    """Keep the ten features that score highest under ``f_regression``.

    Args:
        X_scaled: DataFrame of candidate features.
        y_scaled: Target values aligned with the rows of ``X_scaled``.
        scaler_y: Not used; kept so the signature matches the other pipeline
            steps.

    Returns:
        A tuple ``(selected_features, X_selected)``: the column labels of the
        retained features and the feature matrix reduced to those columns.
    """
    k_best = SelectKBest(score_func=f_regression, k=10)
    k_best.fit(X_scaled, y_scaled)
    reduced_matrix = k_best.transform(X_scaled)

    # Boolean mask over the input columns marking the retained features.
    kept_mask = k_best.get_support()
    kept_columns = X_scaled.columns[kept_mask]

    return kept_columns, reduced_matrix
|
|
|
|
|
def create_visualizations(y_val_original, y_pred):
    """Build three Plotly figures comparing actual and predicted values.

    Args:
        y_val_original: Array of actual target values (must support
            ``.flatten()``).
        y_pred: Array of predicted values, same length as ``y_val_original``.

    Returns:
        A tuple ``(scatter_plot_lr, residual_plot_lr, time_series_plot_lr)``:
        actual-vs-predicted scatter with an identity reference line, residuals
        against actual values with a zero line, and both series drawn over a
        timestamp axis.
    """
    frame = pd.DataFrame({
        'Actual': y_val_original.flatten(),
        'Predicted': y_pred.flatten(),
    })
    actual = frame['Actual']
    predicted = frame['Predicted']
    lowest, highest = actual.min(), actual.max()

    # --- Scatter: predicted against actual, with the y = x reference line ---
    scatter_fig = go.Figure(data=[
        go.Scatter(
            x=actual,
            y=predicted,
            mode='markers',
            name='Actual vs. Predicted',
            marker={'color': 'orange'},
        ),
        go.Scatter(
            x=[lowest, highest],
            y=[lowest, highest],
            mode='lines',
            name='Ideal',
            line={'color': 'black'},
        ),
    ])
    scatter_fig.update_layout(
        title='Actual vs. Predicted Oil Temperature (Linear Regression)',
        xaxis_title='Actual Oil Temperature',
        yaxis_title='Predicted Oil Temperature',
        plot_bgcolor='white',
    )

    # --- Residuals (actual minus predicted) against actual values ---
    errors = actual - predicted
    residual_fig = go.Figure(data=[
        go.Scatter(
            x=actual,
            y=errors,
            mode='markers',
            name='Residuals',
            marker={'color': 'orange'},
        ),
        go.Scatter(
            x=[lowest, highest],
            y=[0, 0],
            mode='lines',
            name='Zero Residual Line',
            line={'color': 'black'},
        ),
    ])
    residual_fig.update_layout(
        title='Residual Plot (Linear Regression)',
        xaxis_title='Actual Oil Temperature',
        yaxis_title='Residuals',
        plot_bgcolor='white',
    )

    # --- Both series over a timestamp axis ---
    # NOTE(review): these timestamps are synthetic (hourly steps starting at
    # 2016-01-01), not taken from the data; confirm that is intended.
    frame['Timestamp'] = pd.date_range(start='2016-01-01', periods=len(frame), freq='h')
    timeline_fig = go.Figure(data=[
        go.Scatter(
            x=frame['Timestamp'],
            y=frame['Actual'],
            mode='lines',
            name='Actual',
            line={'color': 'orange'},
        ),
        go.Scatter(
            x=frame['Timestamp'],
            y=frame['Predicted'],
            mode='lines',
            name='Predicted',
            line={'color': 'black'},
        ),
    ])
    timeline_fig.update_layout(
        title='Time Series Plot of Actual vs. Predicted Oil Temperature (Linear Regression)',
        xaxis_title='Timestamp',
        yaxis_title='Oil Temperature',
        plot_bgcolor='white',
    )

    return scatter_fig, residual_fig, timeline_fig
|
|
|
|
|
def main():
    """Render the Streamlit page: header, metrics, selected features, and plots."""
    st.title("ETTm2 Dataset Analysis and Prediction")
    st.image('csb_goldfish_ETTm2.png', caption='Pepperidge Farm Chilean Sea Bass')

    # Data preparation.
    features, target, target_scaler = load_and_preprocess_data()

    # Model fitting and evaluation.
    (
        actual_values,
        predicted_values,
        mse_original,
        mae_original,
        cv_scores_original,
        final_prediction,
    ) = train_and_evaluate_model(features, target, target_scaler)

    # Metric read-outs, one line each.
    st.write(f"Mean Squared Error (Original Scale): {mse_original}")
    st.write(f"Mean Absolute Error (Original Scale): {mae_original}")
    st.write(f"Mean Cross-Validation Score (Original Scale): {np.mean(cv_scores_original)}")
    st.write(f"Final Predicted Oil Temperature: {final_prediction[0][0]}")

    # Only the selected column labels are displayed; the reduced matrix is unused.
    selected_features, _reduced = feature_selection(features, target, target_scaler)
    st.write(f"Selected Features: {selected_features}")

    # Figures come back as (scatter, residual, time series) and are shown in that order.
    for figure in create_visualizations(actual_values, predicted_values):
        st.plotly_chart(figure)
|
|
|
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()