import streamlit as st
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression
import plotly.graph_objs as go
from datasets import load_dataset
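# Note: `datasets` is the Hugging Face Datasets library (pip install datasets),
# not a local module.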

# Function to load and preprocess the dataset
def load_and_preprocess_data():
    # Load the ETTm2 dataset from Hugging Face
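    # ETTm2 is the Electricity Transformer Temperature dataset, sampled at
    # 15-minute intervals; the six load columns (HUFL, HULL, MUFL, MULL,
    # LUFL, LULL) are covariates and 'OT' (oil temperature) is the target.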
    dataset = load_dataset('TroglodyteDerivations/ETTm2')
    data = dataset['train'].to_pandas()

    # Feature engineering: Create lagged features
    lags = 3  # Number of lags to create
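    # shift(k) moves each column down k rows, so lag k is the value
    # k * 15 minutes earlier; the model sees a short history of every series.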
    for col in ['HUFL', 'HULL', 'MUFL', 'MULL', 'LUFL', 'LULL', 'OT']:
        for lag in range(1, lags + 1):
            data[f'{col}_lag_{lag}'] = data[col].shift(lag)

    # Drop rows with NaN values created by lagging
    data = data.dropna()

    # Separate features and target variable
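    # X keeps the current values of the six load series plus every lagged
    # column (including the lags of OT itself); only 'date' and the
    # current-step target are dropped.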
    X = data.drop(columns=['date', 'OT'])
    y = data['OT']

    # Normalization
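    # Two separate scalers so target predictions can be mapped back to the
    # original scale independently of the feature scaling.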
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()

    X_scaled = scaler_X.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1)).flatten()

    # Convert back into DataFrame for easier manipulation
    X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

    return X_scaled, y_scaled, scaler_y

# Function to train and evaluate the model
def train_and_evaluate_model(X_scaled, y_scaled, scaler_y):
    # Split chronologically (shuffle=False): with lagged features, a shuffled
    # split would leak near-future rows into the training set
    X_train, X_val, y_train, y_val = train_test_split(X_scaled, y_scaled, test_size=0.2, shuffle=False)

    # Initialize and train a linear regression model
    model = LinearRegression()
    model.fit(X_train, y_train)
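    # Plain least squares keeps the baseline simple; with this many correlated
    # lagged features, a regularized model (e.g. Ridge) would likely be more stable.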

    # Predict on the validation set
    y_pred_scaled = model.predict(X_val)
    
    # Predict OT for the most recent row (an in-sample point, not a true
    # future forecast); iloc[[-1]] keeps a DataFrame so feature names match
    final_prediction_scaled = model.predict(X_scaled.iloc[[-1]])
    final_prediction = scaler_y.inverse_transform(final_prediction_scaled.reshape(-1, 1))

    # Inverse transform to get the predictions in the original scale
    y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()
    y_val_original = scaler_y.inverse_transform(y_val.reshape(-1, 1)).flatten()

    # Evaluate the model; y_val_original and y_pred are already in the
    # original scale, so the metrics need no further transformation.
    # (Running them through scaler_y.inverse_transform would be wrong:
    # that call rescales values and re-adds the target mean, which is
    # meaningless for an error statistic.)
    mse_original = mean_squared_error(y_val_original, y_pred)
    mae_original = mean_absolute_error(y_val_original, y_pred)

    # Cross-validate on the scaled data. A scaled MSE converts to the original
    # scale by multiplying by the target's variance (scale_**2); the shared
    # mean cancels in the squared differences. Note that plain k-fold lets
    # folds train on data that postdates the validation fold; sklearn's
    # TimeSeriesSplit is the leakage-free alternative.
    cv_scores = -cross_val_score(model, X_scaled, y_scaled, cv=5, scoring='neg_mean_squared_error')
    cv_scores_original = cv_scores * scaler_y.scale_[0] ** 2

    return y_val_original, y_pred, mse_original, mae_original, cv_scores_original, final_prediction
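
# Sanity check for the scale conversion used in train_and_evaluate_model
# (a standalone sketch, not invoked by the app): with x_orig = x_scaled *
# sigma + mu, the mean mu cancels in the differences, so
# MSE_orig == MSE_scaled * sigma**2.
def _check_mse_scale_conversion():
    rng = np.random.default_rng(0)
    y = rng.normal(50, 5, size=100)       # synthetic "original-scale" targets
    sc = StandardScaler().fit(y.reshape(-1, 1))
    y_s = sc.transform(y.reshape(-1, 1)).flatten()
    noise = rng.normal(0, 0.1, size=100)  # pretend these are scaled-space errors
    y_pred_orig = sc.inverse_transform((y_s + noise).reshape(-1, 1)).flatten()
    mse_scaled = np.mean(noise ** 2)
    mse_orig = np.mean((y_pred_orig - y) ** 2)
    assert np.isclose(mse_orig, mse_scaled * sc.scale_[0] ** 2)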

# Function to perform feature selection
def feature_selection(X_scaled, y_scaled):
    # Perform feature selection
    selector = SelectKBest(score_func=f_regression, k=10)  # Select top 10 features
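    # f_regression ranks features by the F-statistic of a univariate linear
    # fit against the target, so it captures linear association only.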
    X_selected = selector.fit_transform(X_scaled, y_scaled)

    # Get the indices of the selected features
    selected_indices = selector.get_support(indices=True)
    selected_features = X_scaled.columns[selected_indices]

    return selected_features, X_selected

# Function to create visualizations
def create_visualizations(y_val_original, y_pred):
    # Create a DataFrame for visualization
    df_val_lr = pd.DataFrame({
        'Actual': y_val_original,
        'Predicted': y_pred
    })

    # Scatter Plot: Actual vs. Predicted for Linear Regression
    scatter_plot_lr = go.Figure()
    scatter_plot_lr.add_trace(go.Scatter(
        x=df_val_lr['Actual'], y=df_val_lr['Predicted'],
        mode='markers', name='Actual vs. Predicted', marker=dict(color='orange')
    ))
    scatter_plot_lr.add_trace(go.Scatter(
        x=[df_val_lr['Actual'].min(), df_val_lr['Actual'].max()],
        y=[df_val_lr['Actual'].min(), df_val_lr['Actual'].max()],
        mode='lines', name='Ideal', line=dict(color='black')
    ))
    scatter_plot_lr.update_layout(
        title='Actual vs. Predicted Oil Temperature (Linear Regression)',
        xaxis_title='Actual Oil Temperature',
        yaxis_title='Predicted Oil Temperature',
        plot_bgcolor='white'
    )

    # Residual Plot for Linear Regression
    residuals_lr = df_val_lr['Actual'] - df_val_lr['Predicted']
    residual_plot_lr = go.Figure()
    residual_plot_lr.add_trace(go.Scatter(
        x=df_val_lr['Actual'], y=residuals_lr,
        mode='markers', name='Residuals', marker=dict(color='orange')
    ))
    residual_plot_lr.add_trace(go.Scatter(
        x=[df_val_lr['Actual'].min(), df_val_lr['Actual'].max()],
        y=[0, 0], mode='lines', name='Zero Residual Line', line=dict(color='black')
    ))
    residual_plot_lr.update_layout(
        title='Residual Plot (Linear Regression)',
        xaxis_title='Actual Oil Temperature',
        yaxis_title='Residuals',
        plot_bgcolor='white'
    )

    # Time Series Plot for Linear Regression. The 'date' column was dropped
    # during preprocessing, so these timestamps are synthetic and for plotting
    # only; ETTm2 is actually sampled every 15 minutes.
    df_val_lr['Timestamp'] = pd.date_range(start='2016-01-01', periods=len(df_val_lr), freq='15min')
    time_series_plot_lr = go.Figure()
    time_series_plot_lr.add_trace(go.Scatter(
        x=df_val_lr['Timestamp'], y=df_val_lr['Actual'],
        mode='lines', name='Actual', line=dict(color='orange')
    ))
    time_series_plot_lr.add_trace(go.Scatter(
        x=df_val_lr['Timestamp'], y=df_val_lr['Predicted'],
        mode='lines', name='Predicted', line=dict(color='black')
    ))
    time_series_plot_lr.update_layout(
        title='Time Series Plot of Actual vs. Predicted Oil Temperature (Linear Regression)',
        xaxis_title='Timestamp',
        yaxis_title='Oil Temperature',
        plot_bgcolor='white'
    )

    return scatter_plot_lr, residual_plot_lr, time_series_plot_lr

# Streamlit App
def main():
    st.title("ETTm2 Dataset Analysis and Prediction")
    st.image('csb_goldfish_ETTm2.png', caption='Pepperidge Farm Chilean Sea Bass')
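    # Decorative header image; assumes csb_goldfish_ETTm2.png is present in
    # the working directory.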

    # Load and preprocess the dataset
    X_scaled, y_scaled, scaler_y = load_and_preprocess_data()

    # Train and evaluate the model
    y_val_original, y_pred, mse_original, mae_original, cv_scores_original, final_prediction = train_and_evaluate_model(X_scaled, y_scaled, scaler_y)

    # Display evaluation metrics
    st.write(f"Mean Squared Error (Original Scale): {mse_original:.4f}")
    st.write(f"Mean Absolute Error (Original Scale): {mae_original:.4f}")
    st.write(f"Mean Cross-Validation MSE (Original Scale): {np.mean(cv_scores_original):.4f}")
    st.write(f"Final Predicted Oil Temperature: {final_prediction[0][0]:.2f}")

    # Perform feature selection
    selected_features, X_selected = feature_selection(X_scaled, y_scaled)
    st.write(f"Selected Features: {', '.join(selected_features)}")

    # Create visualizations
    scatter_plot_lr, residual_plot_lr, time_series_plot_lr = create_visualizations(y_val_original, y_pred)

    # Display visualizations
    st.plotly_chart(scatter_plot_lr)
    st.plotly_chart(residual_plot_lr)
    st.plotly_chart(time_series_plot_lr)

if __name__ == "__main__":
    main()
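
# To run locally (assuming this file is saved as app.py and the image asset
# is present): streamlit run app.py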