import numpy as np
import matplotlib

matplotlib.use("agg")

import matplotlib.pyplot as plt
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
import gradio as gr


# Ground-truth function and synthetic data generator
def f(x):
    x = x.ravel()
    return np.exp(-(x**2)) + 1.5 * np.exp(-((x - 2) ** 2))


def generate(n_samples, noise, n_repeat=1):
    X = np.random.rand(n_samples) * 10 - 5
    X = np.sort(X)

    if n_repeat == 1:
        y = f(X) + np.random.normal(0.0, noise, n_samples)
    else:
        y = np.zeros((n_samples, n_repeat))
        for i in range(n_repeat):
            y[:, i] = f(X) + np.random.normal(0.0, noise, n_samples)

    X = X.reshape((n_samples, 1))
    return X, y


def train_model(n_train, noise):
    # Settings (n_train and noise come from the Gradio sliders)
    n_repeat = 50  # Number of iterations for computing expectations
    n_test = 1000  # Size of the test set

    np.random.seed(0)

    # Change this for exploring the bias-variance decomposition of other
    # estimators. This should work well for estimators with high variance (e.g.,
    # decision trees or KNN), but poorly for estimators with low variance (e.g.,
    # linear models).
    estimators = [
        ("Tree", DecisionTreeRegressor()),
        ("Bagging(Tree)", BaggingRegressor(DecisionTreeRegressor())),
    ]
    n_estimators = len(estimators)

    X_train = []
    y_train = []
    for i in range(n_repeat):
        X, y = generate(n_samples=n_train, noise=noise)
        X_train.append(X)
        y_train.append(y)

    X_test, y_test = generate(n_samples=n_test, noise=noise, n_repeat=n_repeat)

    fig = plt.figure(figsize=(10, 8))
    out_str = ""

    # Loop over estimators to compare
    for n, (name, estimator) in enumerate(estimators):
        # Compute predictions
        y_predict = np.zeros((n_test, n_repeat))
        for i in range(n_repeat):
            estimator.fit(X_train[i], y_train[i])
            y_predict[:, i] = estimator.predict(X_test)

        # Bias^2 + Variance + Noise decomposition of the mean squared error
        y_error = np.zeros(n_test)
        for i in range(n_repeat):
            for j in range(n_repeat):
                y_error += (y_test[:, j] - y_predict[:, i]) ** 2
        y_error /= n_repeat * n_repeat

        y_noise = np.var(y_test, axis=1)
        y_bias = (f(X_test) - np.mean(y_predict, axis=1)) ** 2
        y_var = np.var(y_predict, axis=1)

        out_str += (
            f"{name}: {np.mean(y_error):.4f} (error) = "
            f"{np.mean(y_bias):.4f} (bias^2) + {np.mean(y_var):.4f} (var) + "
            f"{np.mean(y_noise):.4f} (noise)\n"
        )

        # Plot figures
        plt.subplot(2, n_estimators, n + 1)
        plt.plot(X_test, f(X_test), "b", label="$f(x)$")
        plt.plot(X_train[0], y_train[0], ".b", label="LS ~ $y = f(x)+noise$")

        for i in range(n_repeat):
            if i == 0:
                plt.plot(X_test, y_predict[:, i], "r", label=r"$\^y(x)$")
            else:
                plt.plot(X_test, y_predict[:, i], "r", alpha=0.05)

        plt.plot(X_test, np.mean(y_predict, axis=1), "c", label=r"$\mathbb{E}_{LS} \^y(x)$")

        plt.xlim([-5, 5])
        plt.title(name)

        if n == n_estimators - 1:
            plt.legend(loc=(1.1, 0.5))

        plt.subplot(2, n_estimators, n_estimators + n + 1)
        plt.plot(X_test, y_error, "r", label="$error(x)$")
        plt.plot(X_test, y_bias, "b", label="$bias^2(x)$")
        plt.plot(X_test, y_var, "g", label="$variance(x)$")
        plt.plot(X_test, y_noise, "c", label="$noise(x)$")

        plt.xlim([-5, 5])
        plt.ylim([0, noise])

        if n == n_estimators - 1:
            plt.legend(loc=(1.1, 0.5))

    plt.subplots_adjust(right=0.75)
    return fig, out_str
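
# Standalone usage sketch (not wired into the Gradio app): train_model can also be
# called directly to reproduce the static scikit-learn example, e.g.
#
#     fig, summary = train_model(n_train=50, noise=0.1)
#     print(summary)  # per-estimator: error = bias^2 + var + noise
#     fig.savefig("bias_variance_decomposition.png")  # output filename is illustrative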

title = "Single estimator versus bagging: bias-variance decomposition ⚖️"

description = """This example illustrates and compares the bias-variance decomposition of the
expected mean squared error of a single estimator (a decision tree regressor) against a
bagging ensemble of decision tree regressors.

The dataset is a one-dimensional synthetic regression problem. In the two upper figures,
the blue line is the true function and the blue dots are one of the training sets, obtained
by adding random noise (with the user-selected standard deviation) to the true function.
The red lines are the predictions of the estimator trained on each repeated training set,
and the cyan line is the average prediction of that estimator. The two lower figures
decompose the expected mean squared error (red) into its squared bias (blue), variance (green),
and irreducible noise (cyan) components, so that pointwise
error(x) = bias^2(x) + variance(x) + noise(x).
"""

with gr.Blocks() as demo:
    gr.Markdown(f"## {title}")
    gr.Markdown(description)

    num_samples = gr.Slider(minimum=50, maximum=200, step=50, value=50, label="Number of training samples")
    noise = gr.Slider(minimum=0.05, maximum=0.2, step=0.05, value=0.1, label="Noise (standard deviation)")

    with gr.Row():
        with gr.Column(scale=2):
            plot = gr.Plot()
        with gr.Column(scale=1):
            results = gr.Textbox(label="Results")

    num_samples.change(fn=train_model, inputs=[num_samples, noise], outputs=[plot, results])
    noise.change(fn=train_model, inputs=[num_samples, noise], outputs=[plot, results])

demo.launch()