File size: 7,089 Bytes
d40d4af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5471564
 
 
2a3e0b8
d40d4af
 
 
 
 
 
5471564
d40d4af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5471564
d40d4af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2a3e0b8
d40d4af
 
 
 
 
 
 
5471564
 
 
 
d40d4af
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import gradio as gr
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

from scipy.special import expit

# Colour scheme handed to ``gr.Blocks`` when the demo UI is built.
theme = gr.themes.Monochrome(primary_hue="indigo", secondary_hue="blue", neutral_hue="slate")
# Markdown description rendered on the demo page.
# Kept as a plain (non-f) string: it has no placeholders, and an f-string
# would treat any literal brace added to the Markdown as a format field.
model_card = """
## Description

The **Out-of-bag (OOB)** method is a useful technique for estimating the optimal number of boosting iterations.
This method is similar to cross-validation, but it does not require repeated model fitting and can be computed on-the-fly. 
**OOB** estimates are only applicable to Stochastic Gradient Boosting (i.e., subsample < 1.0). They are calculated from the improvement in loss based on examples not included in the bootstrap sample (i.e., out-of-bag examples). 
The **OOB** estimator provides a conservative estimate of the true test loss but is still a reasonable approximation for a small number of trees. 
In this demonstration, a **GradientBoostingClassifier** model is trained on a simulation dataset, and the loss of the training set, test set, and OOB set are displayed in the figure.
This information allows you to determine the level of generalization of your trained model on the simulation dataset.
You can play around with ``number of samples``, ``number of cross validation folds``, ``random seed`` and ``number of steps``.

## Dataset

Simulation data
"""

def do_train(n_samples, n_splits, random_seed, n_estimators):
    """Fit a stochastic gradient-boosting classifier and plot OOB, CV and test loss.

    A synthetic binary-classification dataset is generated, split 50/50 into
    train and test halves, and a ``GradientBoostingClassifier`` with
    ``subsample=0.5`` is fitted on the training half. Three loss curves over
    the boosting iterations are drawn, each with a vertical line at its minimum:

    * OOB: negative cumulative sum of ``oob_improvement_``;
    * Test: held-out deviance on the test half, shifted so the first value is 0;
    * CV: K-fold average held-out deviance on the training half, shifted likewise.

    Args:
        n_samples: Number of synthetic samples to generate.
        n_splits: Number of folds used by ``KFold`` for the CV curve.
        random_seed: Seed for data generation, the train/test split and the model.
        n_estimators: Number of boosting iterations.

    Returns:
        A ``(figure, text)`` tuple: the matplotlib figure with the three curves
        and a string reporting train and test accuracy.
    """
    # Build the figure without pyplot: figures created through ``plt`` stay in
    # pyplot's global registry until closed, which leaks memory when this
    # callback runs repeatedly in a server process.
    from matplotlib.figure import Figure

    # UI widgets may hand over floats; array sizes, KFold and the estimator
    # all require true integers.
    n_samples = int(n_samples)
    n_splits = int(n_splits)
    random_seed = int(random_seed)
    n_estimators = int(n_estimators)

    # Generate data (adapted from G. Ridgeway's gbm example)
    random_state = np.random.RandomState(random_seed)
    x1 = random_state.uniform(size=n_samples)
    x2 = random_state.uniform(size=n_samples)
    x3 = random_state.randint(0, 4, size=n_samples)

    p = expit(np.sin(3 * x1) - 4 * x2 + x3)
    y = random_state.binomial(1, p, size=n_samples)

    X = np.c_[x1, x2, x3].astype(np.float32)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=random_seed
    )

    # Fit classifier with out-of-bag estimates (subsample < 1.0 enables them)
    params = {
        "n_estimators": n_estimators,
        "max_depth": 3,
        "subsample": 0.5,
        "learning_rate": 0.01,
        "min_samples_leaf": 1,
        "random_state": random_seed,
    }
    clf = GradientBoostingClassifier(**params)
    clf.fit(X_train, y_train)

    train_acc = clf.score(X_train, y_train)
    test_acc = clf.score(X_test, y_test)
    text = f"Train set accuracy: {train_acc*100:.2f}%. Test set accuracy: {test_acc*100:.2f}%"

    # 1-based iteration numbers used as the x axis of every curve
    iterations = np.arange(n_estimators) + 1

    def heldout_score(model, X_heldout, y_heldout):
        """Return the deviance of ``model`` on held-out data after each stage."""
        score = np.zeros((n_estimators,), dtype=np.float64)
        for i, y_proba in enumerate(model.staged_predict_proba(X_heldout)):
            # Explicit labels keep log_loss from raising when a small
            # held-out fold happens to contain a single class.
            score[i] = 2 * log_loss(y_heldout, y_proba[:, 1], labels=[0, 1])
        return score

    def cv_estimate(n_folds):
        """Return the K-fold average held-out deviance per boosting stage."""
        cv = KFold(n_splits=n_folds)
        cv_clf = GradientBoostingClassifier(**params)
        val_scores = np.zeros((n_estimators,), dtype=np.float64)
        for train, test in cv.split(X_train, y_train):
            cv_clf.fit(X_train[train], y_train[train])
            val_scores += heldout_score(cv_clf, X_train[test], y_train[test])
        val_scores /= n_folds
        return val_scores

    # Per-iteration loss estimated by cross-validation on the training half
    cv_score = cv_estimate(n_splits)

    # Per-iteration loss on the held-out test half
    test_score = heldout_score(clf, X_test, y_test)

    # negative cumulative sum of oob improvements
    cumsum = -np.cumsum(clf.oob_improvement_)

    # min loss according to OOB
    oob_best_iter = iterations[np.argmin(cumsum)]

    # min loss according to test (normalize such that first loss is 0)
    test_score -= test_score[0]
    test_best_iter = iterations[np.argmin(test_score)]

    # min loss according to cv (normalize such that first loss is 0)
    cv_score -= cv_score[0]
    cv_best_iter = iterations[np.argmin(cv_score)]

    # color brew for the three curves
    oob_color = [channel / 256.0 for channel in (190, 174, 212)]
    test_color = [channel / 256.0 for channel in (127, 201, 127)]
    cv_color = [channel / 256.0 for channel in (253, 192, 134)]

    # line type for the three curves
    oob_line = "dashed"
    test_line = "solid"
    cv_line = "dashdot"

    # plot curves and vertical lines for best iterations
    fig = Figure(figsize=(8, 6))
    ax = fig.subplots()
    ax.plot(iterations, cumsum, label="OOB loss", color=oob_color, linestyle=oob_line)
    ax.plot(iterations, test_score, label="Test loss", color=test_color, linestyle=test_line)
    ax.plot(iterations, cv_score, label="CV loss", color=cv_color, linestyle=cv_line)
    ax.axvline(x=oob_best_iter, color=oob_color, linestyle=oob_line)
    ax.axvline(x=test_best_iter, color=test_color, linestyle=test_line)
    ax.axvline(x=cv_best_iter, color=cv_color, linestyle=cv_line)

    # add three vertical lines to xticks; query this axes directly rather than
    # pyplot's "current axes", which may belong to another request
    default_ticks = ax.get_xticks()
    xticks_pos = np.array(
        default_ticks.tolist() + [oob_best_iter, cv_best_iter, test_best_iter]
    )
    xticks_label = np.array([int(t) for t in default_ticks] + ["OOB", "CV", "Test"])
    ind = np.argsort(xticks_pos)
    xticks_pos = xticks_pos[ind]
    xticks_label = xticks_label[ind]
    ax.set_xticks(xticks_pos, xticks_label, rotation=90)

    ax.legend(loc="upper center")
    ax.set_ylabel("normalized loss")
    ax.set_xlabel("number of iterations")
    return fig, text


# Build the demo page: title and description, four sliders controlling the
# experiment, and a plot plus a text box showing the outcome.
with gr.Blocks(theme=theme) as demo:
    gr.Markdown('''
            <div>
            <h1 style='text-align: center'>Gradient Boosting Out-of-Bag estimates</h1>
            </div>
        ''')
    gr.Markdown(model_card)
    gr.Markdown("Author: <a href=\"https://huggingface.co/vumichien\">Vu Minh Chien</a>. Based on the example from <a href=\"https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_oob.html#sphx-glr-auto-examples-ensemble-plot-gradient-boosting-oob-py\">scikit-learn</a>")
    n_samples = gr.Slider(minimum=500, maximum=5000, step=500, value=500, label="Number of samples")
    n_splits = gr.Slider(minimum=2, maximum=10, step=1, value=3, label="Number of cross validation folds")
    random_seed = gr.Slider(minimum=0, maximum=2000, step=1, value=0, label="Random seed")
    n_estimators = gr.Slider(minimum=500, maximum=2000, step=100, value=500, label="Number of steps")

    with gr.Row():
        with gr.Column():
            plot = gr.Plot()
        with gr.Column():
            result = gr.Textbox(label="Results")

    # Every slider retrains with the full set of current slider values, so
    # register the same callback on each one instead of repeating the call.
    controls = [n_samples, n_splits, random_seed, n_estimators]
    for control in controls:
        control.change(fn=do_train, inputs=controls, outputs=[plot, result])

demo.launch()