"""Streamlit app: linear-regression baseline for the ``OT`` column of ETTm2.

The script loads the ETTm2 dataset from the Hugging Face Hub, builds lagged
features, fits a linear regression, reports error metrics and shows three
Plotly diagnostics (actual-vs-predicted, residuals, series overlay).
"""

import numpy as np
import pandas as pd
import plotly.graph_objs as go
import streamlit as st
from datasets import load_dataset
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler

# Columns for which lagged copies are generated (the target 'OT' included).
LAGGED_COLUMNS = ['HUFL', 'HULL', 'MUFL', 'MULL', 'LUFL', 'LULL', 'OT']


def load_and_preprocess_data(lags=3):
    """Load ETTm2, add lagged features and standardize features and target.

    Args:
        lags: Number of lagged copies (shift 1..lags) created per column in
            ``LAGGED_COLUMNS``. Defaults to 3.

    Returns:
        A tuple ``(X_scaled, y_scaled, scaler_y)`` where ``X_scaled`` is a
        DataFrame of standardized features (all columns except 'date' and
        'OT'), ``y_scaled`` is the standardized 'OT' column as a 1-D array
        and ``scaler_y`` is the fitted target scaler, needed to map
        predictions back to the original scale.
    """
    # Load the ETTm2 dataset from Hugging Face
    dataset = load_dataset('TroglodyteDerivations/ETTm2')
    data = dataset['train'].to_pandas()

    # Feature engineering: create lagged features
    for col in LAGGED_COLUMNS:
        for lag in range(1, lags + 1):
            data[f'{col}_lag_{lag}'] = data[col].shift(lag)

    # The first `lags` rows have no complete history; drop them.
    data = data.dropna()

    # Separate features and target variable
    X = data.drop(columns=['date', 'OT'])
    y = data['OT']

    # NOTE: both scalers are fitted on the full dataset, i.e. before the
    # train/validation split performed later.
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()
    X_scaled = scaler_X.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1)).flatten()

    # Back to a DataFrame (fresh 0..n-1 index) so column names are preserved.
    X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

    return X_scaled, y_scaled, scaler_y


def train_and_evaluate_model(X_scaled, y_scaled, scaler_y):
    """Fit a linear regression and evaluate it in the target's original units.

    Args:
        X_scaled: DataFrame of standardized features.
        y_scaled: 1-D array of the standardized target.
        scaler_y: Fitted ``StandardScaler`` used to produce ``y_scaled``.

    Returns:
        A tuple ``(y_val_original, y_pred, mse_original, mae_original,
        cv_scores_original, final_prediction)``:

        * ``y_val_original`` / ``y_pred``: validation targets and predictions
          in original units, ordered by original row position.
        * ``mse_original`` / ``mae_original``: validation MSE and MAE in
          original units.
        * ``cv_scores_original``: per-fold 5-fold CV MSE in original units.
        * ``final_prediction``: array of shape (1, 1) holding the prediction
          for the last row of ``X_scaled`` in original units.
    """
    # Split the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(
        X_scaled, y_scaled, test_size=0.2, random_state=42
    )

    # Initialize and train a linear regression model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # The split shuffles rows; restore original row order so that anything
    # plotted as a sequence downstream is not scrambled. Error metrics are
    # order-invariant, so this does not affect them.
    order = np.argsort(X_val.index.to_numpy())
    X_val = X_val.iloc[order]
    y_val = np.asarray(y_val)[order]

    # Predict on the validation set
    y_pred_scaled = model.predict(X_val)

    # Prediction for the last row. A one-row DataFrame (not a bare ndarray)
    # is passed so the input carries the feature names the model was fitted
    # with.
    final_prediction_scaled = model.predict(X_scaled.iloc[[-1]])
    final_prediction = scaler_y.inverse_transform(
        final_prediction_scaled.reshape(-1, 1)
    )

    # Inverse transform to get the predictions in the original scale
    y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()
    y_val_original = scaler_y.inverse_transform(y_val.reshape(-1, 1)).flatten()

    # These metrics are computed on original-scale values, so they are
    # already in original units. They must NOT be passed through
    # inverse_transform, which would compute `value * std + mean`.
    mse_original = mean_squared_error(y_val_original, y_pred)
    mae_original = mean_absolute_error(y_val_original, y_pred)

    # Cross-validation runs on the standardized target; sklearn returns the
    # negated MSE, so flip the sign.
    cv_mse_scaled = -cross_val_score(
        model, X_scaled, y_scaled, cv=5, scoring='neg_mean_squared_error'
    )
    # A squared error in standardized units converts to original units by
    # multiplying with the squared scale factor (no mean shift).
    cv_scores_original = cv_mse_scaled * scaler_y.scale_[0] ** 2

    return (
        y_val_original,
        y_pred,
        mse_original,
        mae_original,
        cv_scores_original,
        final_prediction,
    )


def feature_selection(X_scaled, y_scaled, scaler_y, k=10):
    """Select the ``k`` features with the highest univariate F-scores.

    Args:
        X_scaled: DataFrame of standardized features.
        y_scaled: 1-D array of the standardized target.
        scaler_y: Unused; kept so existing callers keep working.
        k: Number of features to keep (default 10). Clamped to the number of
            available columns.

    Returns:
        A tuple ``(selected_features, X_selected)``: the names of the chosen
        columns and the reduced feature matrix.
    """
    # SelectKBest rejects k larger than the number of features.
    k = min(k, X_scaled.shape[1])

    selector = SelectKBest(score_func=f_regression, k=k)
    X_selected = selector.fit_transform(X_scaled, y_scaled)

    # Map the selected column positions back to their names
    selected_indices = selector.get_support(indices=True)
    selected_features = X_scaled.columns[selected_indices]

    return selected_features, X_selected


def create_visualizations(y_val_original, y_pred):
    """Build the three diagnostic Plotly figures for the regression.

    Args:
        y_val_original: Array of actual target values (original units).
        y_pred: Array of predicted target values, aligned with
            ``y_val_original``.

    Returns:
        A tuple ``(scatter_plot_lr, residual_plot_lr, time_series_plot_lr)``
        of ``plotly.graph_objs.Figure`` objects.
    """
    # Create a DataFrame for visualization
    df_val_lr = pd.DataFrame({
        'Actual': y_val_original.flatten(),
        'Predicted': y_pred.flatten(),
    })
    actual_range = [df_val_lr['Actual'].min(), df_val_lr['Actual'].max()]

    # Scatter Plot: Actual vs. Predicted for Linear Regression
    scatter_plot_lr = go.Figure()
    scatter_plot_lr.add_trace(go.Scatter(
        x=df_val_lr['Actual'],
        y=df_val_lr['Predicted'],
        mode='markers',
        name='Actual vs. Predicted',
        marker=dict(color='orange'),
    ))
    # Diagonal y = x: where a perfect model would place every point.
    scatter_plot_lr.add_trace(go.Scatter(
        x=actual_range,
        y=actual_range,
        mode='lines',
        name='Ideal',
        line=dict(color='black'),
    ))
    scatter_plot_lr.update_layout(
        title='Actual vs. Predicted Oil Temperature (Linear Regression)',
        xaxis_title='Actual Oil Temperature',
        yaxis_title='Predicted Oil Temperature',
        plot_bgcolor='white',
    )

    # Residual Plot for Linear Regression
    residuals_lr = df_val_lr['Actual'] - df_val_lr['Predicted']
    residual_plot_lr = go.Figure()
    residual_plot_lr.add_trace(go.Scatter(
        x=df_val_lr['Actual'],
        y=residuals_lr,
        mode='markers',
        name='Residuals',
        marker=dict(color='orange'),
    ))
    residual_plot_lr.add_trace(go.Scatter(
        x=actual_range,
        y=[0, 0],
        mode='lines',
        name='Zero Residual Line',
        line=dict(color='black'),
    ))
    residual_plot_lr.update_layout(
        title='Residual Plot (Linear Regression)',
        xaxis_title='Actual Oil Temperature',
        yaxis_title='Residuals',
        plot_bgcolor='white',
    )

    # Time Series Plot for Linear Regression
    # NOTE: these timestamps are synthetic (hourly from 2016-01-01, one per
    # row); they are not taken from the dataset's 'date' column.
    df_val_lr['Timestamp'] = pd.date_range(
        start='2016-01-01', periods=len(df_val_lr), freq='h'
    )
    time_series_plot_lr = go.Figure()
    time_series_plot_lr.add_trace(go.Scatter(
        x=df_val_lr['Timestamp'],
        y=df_val_lr['Actual'],
        mode='lines',
        name='Actual',
        line=dict(color='orange'),
    ))
    time_series_plot_lr.add_trace(go.Scatter(
        x=df_val_lr['Timestamp'],
        y=df_val_lr['Predicted'],
        mode='lines',
        name='Predicted',
        line=dict(color='black'),
    ))
    time_series_plot_lr.update_layout(
        title='Time Series Plot of Actual vs. Predicted Oil Temperature (Linear Regression)',
        xaxis_title='Timestamp',
        yaxis_title='Oil Temperature',
        plot_bgcolor='white',
    )

    return scatter_plot_lr, residual_plot_lr, time_series_plot_lr


def main():
    """Run the Streamlit page: load data, fit, report metrics, draw plots."""
    st.title("ETTm2 Dataset Analysis and Prediction")
    st.image('csb_goldfish_ETTm2.png', caption='Pepperidge Farm Chilean Sea Bass')

    # Load and preprocess the dataset
    X_scaled, y_scaled, scaler_y = load_and_preprocess_data()

    # Train and evaluate the model
    (
        y_val_original,
        y_pred,
        mse_original,
        mae_original,
        cv_scores_original,
        final_prediction,
    ) = train_and_evaluate_model(X_scaled, y_scaled, scaler_y)

    # Display evaluation metrics
    st.write(f"Mean Squared Error (Original Scale): {mse_original}")
    st.write(f"Mean Absolute Error (Original Scale): {mae_original}")
    st.write(f"Mean Cross-Validation Score (Original Scale): {np.mean(cv_scores_original)}")
    st.write(f"Final Predicted Oil Temperature: {final_prediction[0][0]}")

    # Perform feature selection
    selected_features, X_selected = feature_selection(X_scaled, y_scaled, scaler_y)
    st.write(f"Selected Features: {selected_features}")

    # Create visualizations
    scatter_plot_lr, residual_plot_lr, time_series_plot_lr = create_visualizations(
        y_val_original, y_pred
    )

    # Display visualizations
    st.plotly_chart(scatter_plot_lr)
    st.plotly_chart(residual_plot_lr)
    st.plotly_chart(time_series_plot_lr)


if __name__ == "__main__":
    main()