import streamlit as st
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression
import plotly.graph_objs as go
from datasets import load_dataset
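# Note: `datasets` is the Hugging Face Datasets library (pip install datasets),
# not a local module.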

# Function to load and preprocess the dataset
def load_and_preprocess_data():
    # Load the ETTm2 dataset from Hugging Face
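    # ETTm2 is the Electricity Transformer Temperature dataset, sampled at
    # 15-minute intervals; the six load columns (HUFL, HULL, MUFL, MULL,
    # LUFL, LULL) are covariates and 'OT' (oil temperature) is the target.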
    dataset = load_dataset('TroglodyteDerivations/ETTm2')
    data = dataset['train'].to_pandas()

    # Feature engineering: Create lagged features
    lags = 3  # Number of lags to create
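    # shift(k) moves each column down k rows, so lag k is the value
    # k * 15 minutes earlier; the model sees a short history of every series.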
    for col in ['HUFL', 'HULL', 'MUFL', 'MULL', 'LUFL', 'LULL', 'OT']:
        for lag in range(1, lags + 1):
            data[f'{col}_lag_{lag}'] = data[col].shift(lag)

    # Drop rows with NaN values created by lagging
    data = data.dropna()

    # Separate features and target variable
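    # X keeps the current values of the six load series plus every lagged
    # column (including the lags of OT itself); only 'date' and the
    # current-step target are dropped.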
    X = data.drop(columns=['date', 'OT'])
    y = data['OT']

    # Normalization
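    # Two separate scalers so target predictions can be mapped back to the
    # original scale independently of the feature scaling.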
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()

    X_scaled = scaler_X.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1)).flatten()

    # Convert back into DataFrame for easier manipulation
    X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

    return X_scaled, y_scaled, scaler_y

# Function to train and evaluate the model
def train_and_evaluate_model(X_scaled, y_scaled, scaler_y):
    # Split chronologically (shuffle=False): with lagged features, a shuffled
    # split would leak near-future rows into the training set
    X_train, X_val, y_train, y_val = train_test_split(X_scaled, y_scaled, test_size=0.2, shuffle=False)

    # Initialize and train a linear regression model
    model = LinearRegression()
    model.fit(X_train, y_train)
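    # Plain least squares keeps the baseline simple; with this many correlated
    # lagged features, a regularized model (e.g. Ridge) would likely be more stable.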

    # Predict on the validation set
    y_pred_scaled = model.predict(X_val)
    
    # Predict OT for the most recent row (an in-sample point, not a true
    # future forecast); iloc[[-1]] keeps a DataFrame so feature names match
    final_prediction_scaled = model.predict(X_scaled.iloc[[-1]])
    final_prediction = scaler_y.inverse_transform(final_prediction_scaled.reshape(-1, 1))

    # Inverse transform to get the predictions in the original scale
    y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()
    y_val_original = scaler_y.inverse_transform(y_val.reshape(-1, 1)).flatten()

    # Evaluate the model; y_val_original and y_pred are already in the
    # original scale, so the metrics need no further transformation.
    # (Running them through scaler_y.inverse_transform would be wrong:
    # that call rescales values and re-adds the target mean, which is
    # meaningless for an error statistic.)
    mse_original = mean_squared_error(y_val_original, y_pred)
    mae_original = mean_absolute_error(y_val_original, y_pred)

    # Cross-validate on the scaled data. A scaled MSE converts to the original
    # scale by multiplying by the target's variance (scale_**2); the shared
    # mean cancels in the squared differences. Note that plain k-fold lets
    # folds train on data that postdates the validation fold; sklearn's
    # TimeSeriesSplit is the leakage-free alternative.
    cv_scores = -cross_val_score(model, X_scaled, y_scaled, cv=5, scoring='neg_mean_squared_error')
    cv_scores_original = cv_scores * scaler_y.scale_[0] ** 2

    return y_val_original, y_pred, mse_original, mae_original, cv_scores_original, final_prediction
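
# Sanity check for the scale conversion used in train_and_evaluate_model
# (a standalone sketch, not invoked by the app): with x_orig = x_scaled *
# sigma + mu, the mean mu cancels in the differences, so
# MSE_orig == MSE_scaled * sigma**2.
def _check_mse_scale_conversion():
    rng = np.random.default_rng(0)
    y = rng.normal(50, 5, size=100)       # synthetic "original-scale" targets
    sc = StandardScaler().fit(y.reshape(-1, 1))
    y_s = sc.transform(y.reshape(-1, 1)).flatten()
    noise = rng.normal(0, 0.1, size=100)  # pretend these are scaled-space errors
    y_pred_orig = sc.inverse_transform((y_s + noise).reshape(-1, 1)).flatten()
    mse_scaled = np.mean(noise ** 2)
    mse_orig = np.mean((y_pred_orig - y) ** 2)
    assert np.isclose(mse_orig, mse_scaled * sc.scale_[0] ** 2)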

# Function to perform feature selection
def feature_selection(X_scaled, y_scaled):
    # Perform feature selection
    selector = SelectKBest(score_func=f_regression, k=10)  # Select top 10 features
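    # f_regression ranks features by the F-statistic of a univariate linear
    # fit against the target, so it captures linear association only.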
    X_selected = selector.fit_transform(X_scaled, y_scaled)

    # Get the indices of the selected features
    selected_indices = selector.get_support(indices=True)
    selected_features = X_scaled.columns[selected_indices]

    return selected_features, X_selected

# Function to create visualizations
def create_visualizations(y_val_original, y_pred):
    # Create a DataFrame for visualization
    df_val_lr = pd.DataFrame({
        'Actual': y_val_original,
        'Predicted': y_pred
    })

    # Scatter Plot: Actual vs. Predicted for Linear Regression
    scatter_plot_lr = go.Figure()
    scatter_plot_lr.add_trace(go.Scatter(
        x=df_val_lr['Actual'], y=df_val_lr['Predicted'],
        mode='markers', name='Actual vs. Predicted', marker=dict(color='orange')
    ))
    scatter_plot_lr.add_trace(go.Scatter(
        x=[df_val_lr['Actual'].min(), df_val_lr['Actual'].max()],
        y=[df_val_lr['Actual'].min(), df_val_lr['Actual'].max()],
        mode='lines', name='Ideal', line=dict(color='black')
    ))
    scatter_plot_lr.update_layout(
        title='Actual vs. Predicted Oil Temperature (Linear Regression)',
        xaxis_title='Actual Oil Temperature',
        yaxis_title='Predicted Oil Temperature',
        plot_bgcolor='white'
    )

    # Residual Plot for Linear Regression
    residuals_lr = df_val_lr['Actual'] - df_val_lr['Predicted']
    residual_plot_lr = go.Figure()
    residual_plot_lr.add_trace(go.Scatter(
        x=df_val_lr['Actual'], y=residuals_lr,
        mode='markers', name='Residuals', marker=dict(color='orange')
    ))
    residual_plot_lr.add_trace(go.Scatter(
        x=[df_val_lr['Actual'].min(), df_val_lr['Actual'].max()],
        y=[0, 0], mode='lines', name='Zero Residual Line', line=dict(color='black')
    ))
    residual_plot_lr.update_layout(
        title='Residual Plot (Linear Regression)',
        xaxis_title='Actual Oil Temperature',
        yaxis_title='Residuals',
        plot_bgcolor='white'
    )

    # Time Series Plot for Linear Regression. The 'date' column was dropped
    # during preprocessing, so these timestamps are synthetic and for plotting
    # only; ETTm2 is actually sampled every 15 minutes.
    df_val_lr['Timestamp'] = pd.date_range(start='2016-01-01', periods=len(df_val_lr), freq='15min')
    time_series_plot_lr = go.Figure()
    time_series_plot_lr.add_trace(go.Scatter(
        x=df_val_lr['Timestamp'], y=df_val_lr['Actual'],
        mode='lines', name='Actual', line=dict(color='orange')
    ))
    time_series_plot_lr.add_trace(go.Scatter(
        x=df_val_lr['Timestamp'], y=df_val_lr['Predicted'],
        mode='lines', name='Predicted', line=dict(color='black')
    ))
    time_series_plot_lr.update_layout(
        title='Time Series Plot of Actual vs. Predicted Oil Temperature (Linear Regression)',
        xaxis_title='Timestamp',
        yaxis_title='Oil Temperature',
        plot_bgcolor='white'
    )

    return scatter_plot_lr, residual_plot_lr, time_series_plot_lr

# Streamlit App
def main():
    st.title("ETTm2 Dataset Analysis and Prediction")
    st.image('csb_goldfish_ETTm2.png', caption='Pepperidge Farm Chilean Sea Bass')
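    # Decorative header image; assumes csb_goldfish_ETTm2.png is present in
    # the working directory.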

    # Load and preprocess the dataset
    X_scaled, y_scaled, scaler_y = load_and_preprocess_data()

    # Train and evaluate the model
    y_val_original, y_pred, mse_original, mae_original, cv_scores_original, final_prediction = train_and_evaluate_model(X_scaled, y_scaled, scaler_y)

    # Display evaluation metrics
    st.write(f"Mean Squared Error (Original Scale): {mse_original:.4f}")
    st.write(f"Mean Absolute Error (Original Scale): {mae_original:.4f}")
    st.write(f"Mean Cross-Validation MSE (Original Scale): {np.mean(cv_scores_original):.4f}")
    st.write(f"Final Predicted Oil Temperature: {final_prediction[0][0]:.2f}")

    # Perform feature selection
    selected_features, X_selected = feature_selection(X_scaled, y_scaled)
    st.write(f"Selected Features: {', '.join(selected_features)}")

    # Create visualizations
    scatter_plot_lr, residual_plot_lr, time_series_plot_lr = create_visualizations(y_val_original, y_pred)

    # Display visualizations
    st.plotly_chart(scatter_plot_lr)
    st.plotly_chart(residual_plot_lr)
    st.plotly_chart(time_series_plot_lr)

if __name__ == "__main__":
    main()
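
# To run locally (assuming this file is saved as app.py and the image asset
# is present): streamlit run app.py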