import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt


# Load and display dataset
@st.cache_data
def load_data():
    """Read ``insurance.csv`` into a DataFrame.

    The path is relative, so the file must be present in the directory the
    app is launched from. Wrapped in ``st.cache_data`` so Streamlit reruns
    reuse the parsed result instead of re-reading the file.
    """
    data = pd.read_csv("insurance.csv")
    return data


data = load_data()
st.title("Medical Insurance Cost Prediction with Hybrid Model")
st.write("Dataset preview:")
st.write(data.head())

# Preprocessing and Feature Engineering
st.subheader("Data Preprocessing and Feature Engineering")

# Interaction features: age and bmi only contribute for smokers. The 0/1
# indicator is computed once (vectorized) and must be built BEFORE the
# 'smoker' column is label-encoded below, while it still holds 'yes'/'no'.
is_smoker = (data['smoker'] == 'yes').astype(int)
data['age_smoker'] = data['age'] * is_smoker
data['bmi_smoker'] = data['bmi'] * is_smoker

# Encode categorical variables. The same encoder instance is refit for each
# column, so after this loop it only remembers the mapping of the last one.
label_encoder = LabelEncoder()
for categorical_col in ('sex', 'smoker', 'region'):
    data[categorical_col] = label_encoder.fit_transform(data[categorical_col])

# Select features. An explicit copy keeps later column assignments from
# touching (or warning about) the underlying `data` frame.
FEATURE_COLUMNS = ['age', 'sex', 'bmi', 'children', 'smoker', 'region',
                   'age_smoker', 'bmi_smoker']
NUMERIC_COLUMNS = ['age', 'bmi', 'children', 'age_smoker', 'bmi_smoker']
X = data[FEATURE_COLUMNS].copy()
y = data['charges']

# Split data first so the scaler never sees the test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
X_train = X_train.copy()
X_test = X_test.copy()

# Standardize numerical features: fit on the training split only, then apply
# the same transformation to the test split (avoids train/test leakage).
scaler = StandardScaler()
X_train[NUMERIC_COLUMNS] = scaler.fit_transform(X_train[NUMERIC_COLUMNS])
X_test[NUMERIC_COLUMNS] = scaler.transform(X_test[NUMERIC_COLUMNS])


# Define the neural network model
def create_neural_network(input_dim=None):
    """Build and compile the feed-forward regression network.

    Architecture: Dense(128, relu) -> Dropout(0.3) -> Dense(64, relu) ->
    Dense(1), compiled with Adam (learning rate 0.001) and MSE loss.

    Args:
        input_dim: Number of input features. When omitted, the column count
            of the module-level ``X_train`` is used.

    Returns:
        The compiled, untrained ``Sequential`` model.
    """
    if input_dim is None:
        input_dim = X_train.shape[1]
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_dim,)),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dense(1),
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    return model
st.subheader("Training the Neural Network")


@st.cache_resource
def _fit_neural_network(features, target):
    """Train the neural network once and reuse it across Streamlit reruns.

    Streamlit re-executes the whole script on every widget interaction;
    caching the fitted model keeps a button click from retraining for 50
    epochs. The cache is keyed on the training data passed in.
    """
    model = create_neural_network()
    model.fit(features, target, epochs=50, batch_size=32,
              validation_split=0.2, verbose=1)
    return model


@st.cache_resource
def _fit_random_forest(features, target):
    """Train the random forest once and reuse it across Streamlit reruns."""
    forest = RandomForestRegressor(n_estimators=200, max_depth=12,
                                   random_state=42)
    forest.fit(features, target)
    return forest


nn_model = _fit_neural_network(X_train, y_train)

# Generate predictions from the neural network for train and test sets
nn_train_pred = nn_model.predict(X_train).flatten()
nn_test_pred = nn_model.predict(X_test).flatten()

# Add NN predictions as a new feature for Random Forest
# NOTE(review): the training-set feature is the network's in-sample
# prediction; out-of-fold predictions would likely give the forest a more
# honest view of this feature - consider if evaluation looks optimistic.
X_train_rf = X_train.copy()
X_test_rf = X_test.copy()
X_train_rf['nn_pred'] = nn_train_pred
X_test_rf['nn_pred'] = nn_test_pred

# Train a Random Forest on this new feature set
st.subheader("Training the Random Forest with Neural Network Predictions")
rf_model = _fit_random_forest(X_train_rf, y_train)
final_predictions = rf_model.predict(X_test_rf)

# Model evaluation
rmse = np.sqrt(mean_squared_error(y_test, final_predictions))
r2 = r2_score(y_test, final_predictions) * 100
st.write(f"RMSE (Root Mean Squared Error): {rmse:.2f}")
st.write(f"R² (Accuracy): {r2:.2f}%")

# Plot actual vs predicted values
st.subheader("Actual vs Predicted Values")
# Draw on an explicit Figure rather than pyplot's global state, and close it
# after rendering so figures do not pile up across reruns.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(y_test.values, label="Actual Values", color='blue')
ax.plot(final_predictions, label="Predicted Values", color='orange')
ax.set_xlabel("Sample Index")
ax.set_ylabel("Insurance Charges")
ax.legend()
st.pyplot(fig)
plt.close(fig)

# Prediction on new data
st.subheader("Predict on New Data")
# The interaction features are derived from the other inputs, so the user is
# only asked for the base features. Defaults come from `data` (unscaled)
# because the entered values are standardized below before prediction.
derived_columns = ('age_smoker', 'bmi_smoker')
input_data = {
    col: st.number_input(f"Enter {col}:", value=float(data[col].mean()))
    for col in X.columns
    if col not in derived_columns
}

if st.button("Predict Insurance Charge"):
    row = dict(input_data)
    # Assumes 'smoker' is entered as its encoded value with 1 meaning 'yes'
    # (LabelEncoder sorts labels, so 'no' -> 0, 'yes' -> 1) - TODO confirm
    # against the dataset's actual labels.
    row['age_smoker'] = row['age'] * row['smoker']
    row['bmi_smoker'] = row['bmi'] * row['smoker']

    # Restore the training column order; both models depend on it.
    input_df = pd.DataFrame([row])[list(X.columns)]
    scaled_columns = ['age', 'bmi', 'children', 'age_smoker', 'bmi_smoker']
    input_df[scaled_columns] = scaler.transform(input_df[scaled_columns])

    # Use neural network to predict intermediate feature
    nn_feature = nn_model.predict(input_df).flatten()
    input_df['nn_pred'] = nn_feature

    # Predict final charge using Random Forest
    final_prediction = rf_model.predict(input_df)
    st.write(f"Predicted Insurance Charge: ${final_prediction[0]:.2f}")