import streamlit as st
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression
import plotly.graph_objs as go
from datasets import load_dataset
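# Assumed dependencies for this app (a sketch, not a pinned requirements list):
#   streamlit, numpy, pandas, scikit-learn, plotly, datasets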
# Function to load and preprocess the dataset
def load_and_preprocess_data():
    # Load the ETTm2 dataset from Hugging Face
    dataset = load_dataset('TroglodyteDerivations/ETTm2')
    data = dataset['train'].to_pandas()

    # Feature engineering: create lagged features
    lags = 3  # Number of lags to create
    for col in ['HUFL', 'HULL', 'MUFL', 'MULL', 'LUFL', 'LULL', 'OT']:
        for lag in range(1, lags + 1):
            data[f'{col}_lag_{lag}'] = data[col].shift(lag)

    # Drop rows with NaN values created by lagging
    data = data.dropna()

    # Separate features and target variable
    X = data.drop(columns=['date', 'OT'])
    y = data['OT']

    # Normalization
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()
    X_scaled = scaler_X.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1)).flatten()

    # Convert back into a DataFrame for easier manipulation
    X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

    return X_scaled, y_scaled, scaler_y
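# Hypothetical standalone sanity check (assumes the TroglodyteDerivations/ETTm2
# dataset is reachable and contains the columns used above):
#   X_scaled, y_scaled, scaler_y = load_and_preprocess_data()
#   print(X_scaled.shape, y_scaled.shape)  # lagged feature matrix and scaled target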
# Function to train and evaluate the model
def train_and_evaluate_model(X_scaled, y_scaled, scaler_y):
    # Split the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

    # Initialize and train a linear regression model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Predict on the validation set
    y_pred_scaled = model.predict(X_val)

    # Final prediction from the most recent row (kept as a DataFrame so feature names are preserved)
    final_prediction_scaled = model.predict(X_scaled.iloc[[-1]])
    final_prediction = scaler_y.inverse_transform(final_prediction_scaled.reshape(-1, 1))

    # Inverse transform to get the predictions in the original scale
    y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()
    y_val_original = scaler_y.inverse_transform(y_val.reshape(-1, 1)).flatten()

    # Evaluate the model; y_val_original and y_pred are already in the original
    # scale, so the metrics need no further transformation
    mse_original = mean_squared_error(y_val_original, y_pred)
    mae_original = mean_absolute_error(y_val_original, y_pred)

    # Perform cross-validation on the scaled target
    cv_scores = cross_val_score(model, X_scaled, y_scaled, cv=5, scoring='neg_mean_squared_error')
    cv_scores = -cv_scores
    # A squared error on a StandardScaler-transformed target converts back to the
    # original scale by multiplying with the square of the target's standard deviation
    cv_scores_original = cv_scores * (scaler_y.scale_[0] ** 2)

    return y_val_original, y_pred, mse_original, mae_original, cv_scores_original, final_prediction
# Function to perform feature selection
def feature_selection(X_scaled, y_scaled, scaler_y):
    # Perform feature selection: keep the top 10 features by univariate F-test
    selector = SelectKBest(score_func=f_regression, k=10)
    X_selected = selector.fit_transform(X_scaled, y_scaled)

    # Get the indices (and names) of the selected features
    selected_indices = selector.get_support(indices=True)
    selected_features = X_scaled.columns[selected_indices]

    return selected_features, X_selected
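# Hypothetical follow-up (not wired into the app): refit the regression on the
# selected columns only, e.g.
#   X_reduced = X_scaled[selected_features]
#   reduced_model = LinearRegression().fit(X_reduced, y_scaled)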
# Function to create visualizations
def create_visualizations(y_val_original, y_pred):
    # Create a DataFrame for visualization
    df_val_lr = pd.DataFrame({
        'Actual': y_val_original.flatten(),
        'Predicted': y_pred.flatten()
    })

    # Scatter plot: actual vs. predicted for linear regression
    scatter_plot_lr = go.Figure()
    scatter_plot_lr.add_trace(go.Scatter(
        x=df_val_lr['Actual'], y=df_val_lr['Predicted'],
        mode='markers', name='Actual vs. Predicted', marker=dict(color='orange')))
    scatter_plot_lr.add_trace(go.Scatter(
        x=[df_val_lr['Actual'].min(), df_val_lr['Actual'].max()],
        y=[df_val_lr['Actual'].min(), df_val_lr['Actual'].max()],
        mode='lines', name='Ideal', line=dict(color='black')))
    scatter_plot_lr.update_layout(
        title='Actual vs. Predicted Oil Temperature (Linear Regression)',
        xaxis_title='Actual Oil Temperature',
        yaxis_title='Predicted Oil Temperature',
        plot_bgcolor='white'
    )

    # Residual plot for linear regression
    residuals_lr = df_val_lr['Actual'] - df_val_lr['Predicted']
    residual_plot_lr = go.Figure()
    residual_plot_lr.add_trace(go.Scatter(
        x=df_val_lr['Actual'], y=residuals_lr,
        mode='markers', name='Residuals', marker=dict(color='orange')))
    residual_plot_lr.add_trace(go.Scatter(
        x=[df_val_lr['Actual'].min(), df_val_lr['Actual'].max()], y=[0, 0],
        mode='lines', name='Zero Residual Line', line=dict(color='black')))
    residual_plot_lr.update_layout(
        title='Residual Plot (Linear Regression)',
        xaxis_title='Actual Oil Temperature',
        yaxis_title='Residuals',
        plot_bgcolor='white'
    )

    # Time series plot for linear regression
    # (synthetic hourly timestamps are generated purely for display)
    df_val_lr['Timestamp'] = pd.date_range(start='2016-01-01', periods=len(df_val_lr), freq='h')
    time_series_plot_lr = go.Figure()
    time_series_plot_lr.add_trace(go.Scatter(
        x=df_val_lr['Timestamp'], y=df_val_lr['Actual'],
        mode='lines', name='Actual', line=dict(color='orange')))
    time_series_plot_lr.add_trace(go.Scatter(
        x=df_val_lr['Timestamp'], y=df_val_lr['Predicted'],
        mode='lines', name='Predicted', line=dict(color='black')))
    time_series_plot_lr.update_layout(
        title='Time Series Plot of Actual vs. Predicted Oil Temperature (Linear Regression)',
        xaxis_title='Timestamp',
        yaxis_title='Oil Temperature',
        plot_bgcolor='white'
    )

    return scatter_plot_lr, residual_plot_lr, time_series_plot_lr
# Streamlit App
def main():
    st.title("ETTm2 Dataset Analysis and Prediction")
    st.image('csb_goldfish_ETTm2.png', caption='Pepperidge Farm Chilean Sea Bass')

    # Load and preprocess the dataset
    X_scaled, y_scaled, scaler_y = load_and_preprocess_data()

    # Train and evaluate the model
    y_val_original, y_pred, mse_original, mae_original, cv_scores_original, final_prediction = train_and_evaluate_model(X_scaled, y_scaled, scaler_y)

    # Display evaluation metrics
    st.write(f"Mean Squared Error (Original Scale): {mse_original}")
    st.write(f"Mean Absolute Error (Original Scale): {mae_original}")
    st.write(f"Mean Cross-Validation MSE (Original Scale): {np.mean(cv_scores_original)}")
    st.write(f"Final Predicted Oil Temperature: {final_prediction[0][0]}")

    # Perform feature selection
    selected_features, X_selected = feature_selection(X_scaled, y_scaled, scaler_y)
    st.write(f"Selected Features: {list(selected_features)}")

    # Create visualizations
    scatter_plot_lr, residual_plot_lr, time_series_plot_lr = create_visualizations(y_val_original, y_pred)

    # Display visualizations
    st.plotly_chart(scatter_plot_lr)
    st.plotly_chart(residual_plot_lr)
    st.plotly_chart(time_series_plot_lr)


if __name__ == "__main__":
    main()
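# To launch the app locally (assuming the dependencies listed above are installed):
#   streamlit run app.py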