dog committed on
Commit
d992c15
1 Parent(s): 21cc914

Upload . with huggingface_hub

Browse files
__pycache__/app.cpython-38.pyc ADDED
Binary file (4.82 kB). View file
 
app.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import datasets
4
+ import fuego
5
+ import gradio as gr
6
+ from datasets import load_dataset
7
+ from huggingface_hub import HfFolder, create_repo, delete_repo, login
8
+ from PIL import Image
9
+
10
+
11
# Turn off the `datasets` library caching mechanism for this process.
datasets.disable_caching()

# Authenticate against the Hub: prefer the HUGGING_FACE_HUB_TOKEN environment
# variable and fall back to the token stored by HfFolder.
login(token=os.getenv("HUGGING_FACE_HUB_TOKEN", HfFolder.get_token()), add_to_git_credential=True)

# Make sure every repository used by the demo exists and remember its full id.
labeled_samples_repo_id = create_repo("actlearn_labeled_samples", exist_ok=True, repo_type="dataset").repo_id
unlabled_samples_repo_id = create_repo("actlearn_unlabeled_samples", exist_ok=True, repo_type="dataset").repo_id
to_label_samples_repo_id = create_repo("actlearn_to_label_samples", exist_ok=True, repo_type="dataset").repo_id
test_dataset_repo_id = create_repo("actlearn_test_mnist", exist_ok=True, repo_type="dataset").repo_id
model_repo_id = create_repo("actlearn_mnist_model", exist_ok=True).repo_id


# Cursor into `imgs`: index of the next image to hand out for labeling.
idx = 0
try:
    data_to_label = load_dataset(to_label_samples_repo_id)
    imgs = data_to_label["train"]["image"]
except Exception:
    # Best effort: when the "to label" dataset cannot be loaded, start with
    # nothing to label. `Exception` (rather than a bare `except`) lets
    # KeyboardInterrupt and SystemExit propagate.
    imgs = None
    data_to_label = None
29
+
30
+
31
def get_image():
    """Return the next image awaiting a label and advance the shared cursor.

    Reads the module-level ``imgs`` sequence at position ``idx`` and then
    increments ``idx`` by one.

    Returns:
        The image at the current cursor position, or ``None`` when no images
        are loaded (``imgs`` is ``None``) or the cursor is already past the
        last image. The cursor is left untouched when ``None`` is returned.
    """
    global idx
    # Guard against an empty or exhausted batch instead of raising IndexError.
    if imgs is None or idx >= len(imgs):
        return None
    new_img = imgs[idx]
    idx += 1
    return new_img
38
+
39
+
40
# [image, label] pairs collected from the UI during the current labeling round.
labeled_data = []

# Markdown shown at the top of the demo page.
information = """# Active Learning Demo
This demo showcases Active Learning, which is great when labeling is expensive. In this demo, you will label images by choosing a digit (0-9).
How does this work?
* There is a large pool of unlabeled images
* A model is trained with the few labeled images
* We can then use the model to pick the images with the lowest confidence or with the lowest probability of corresponding to an image. These are the images for which the model is confused, so by improving them, the quality of the model can improve much more than queries for which the model was already doing well!
* In this UI, you will be provided a couple of images to label
* Once all the provided images are labeled, the model is retrained, and a new set of images is chosen!
"""

# Markdown shown once a labeling round is finished and retraining was started.
training_info = """## Model Retraining
There are new labeled images. The model is retraining. Follow progress in the "fuego" space that was spun up for you in your profile.
"""
55
+
56
with gr.Blocks() as demo:
    gr.Markdown(information)

    # The labeling widgets start visible only when images were loaded at
    # startup; otherwise only the "Reload" button is shown.
    has_images = imgs is not None
    img_to_label = gr.Image(shape=[28, 28], value=get_image(), visible=has_images)
    label_dropdown = gr.Dropdown(
        choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], interactive=True, value=0, visible=has_images
    )
    save_btn = gr.Button("Save label", visible=has_images)
    output_box = gr.Markdown(value=training_info, visible=False)
    reload_btn = gr.Button("Reload", visible=not has_images)

    def _waiting_ui(message, clear_image=False):
        """Build the updates that hide the labeling widgets and show `message`.

        Args:
            message: Markdown text to display in the output box.
            clear_image: When true, the image component's value is also reset
                to ``None``.

        Returns:
            A dict mapping each of the five UI components to its update.
        """
        if clear_image:
            image_update = gr.update(visible=False, value=None)
        else:
            image_update = gr.update(visible=False)
        return {
            img_to_label: image_update,
            label_dropdown: gr.update(visible=False),
            save_btn: gr.update(visible=False),
            output_box: gr.update(visible=True, value=message),
            reload_btn: gr.update(visible=True),
        }

    def save_data(img, label):
        """Record one label; once the batch is complete, upload it and retrain.

        Args:
            img: The image currently shown in the UI.
            label: The digit chosen in the dropdown.

        Returns:
            A dict of component updates: the next image while the batch is
            still in progress, or the "retraining" view once every image of
            the batch has been handed out.
        """
        global labeled_data
        global idx

        labeled_data.append([img, label])

        # Batch still in progress (or nothing loaded): just show the next image.
        if imgs is None or len(imgs) != idx:
            return {img_to_label: gr.update(value=get_image())}

        # Remove dataset of queries to label
        # datasets library does not allow pushing an empty dataset, so as a
        # workaround we just delete the repo
        delete_repo(repo_id=to_label_samples_repo_id, repo_type="dataset")
        create_repo(repo_id=to_label_samples_repo_id, repo_type="dataset")

        # Push to training dataset
        labeled_dataset = load_dataset(labeled_samples_repo_id)["train"]
        feature = datasets.Image(decode=False)
        for labeled_img, labeled_value in labeled_data:
            # Hack due to https://github.com/huggingface/datasets/issues/4796
            labeled_dataset = labeled_dataset.add_item(
                {"image": feature.encode_example(Image.fromarray(labeled_img)), "label": labeled_value}
            )
        labeled_dataset.push_to_hub(labeled_samples_repo_id)

        # Clean up data
        labeled_data = []
        idx = 0

        # Kick off the training script in the runner Space.
        fuego.run("training/run.py", "training/requirements.txt", space_id="actlearn-fuego-runner")

        # Update UI
        return _waiting_ui(training_info)

    def reload_data():
        """Look for a new batch of images to label and refresh the UI.

        Returns:
            A dict of component updates: the labeling view showing the first
            image when a non-empty batch was loaded, otherwise the
            "No more images to label" view.
        """
        global data_to_label
        global imgs
        global idx

        try:
            # See if there is new data to be labeled
            data_to_label = load_dataset(to_label_samples_repo_id)
            imgs = data_to_label["train"]["image"]
        except Exception:
            imgs = None
            data_to_label = None

        # An empty batch is treated like a missing one. Returning a bare None
        # here would not provide updates for the declared outputs.
        if imgs is None or len(imgs) == 0:
            imgs = None
            data_to_label = None
            return _waiting_ui("No more images to label", clear_image=True)

        idx = 0
        return {
            img_to_label: gr.update(visible=True, value=get_image()),
            label_dropdown: gr.update(visible=True),
            save_btn: gr.update(visible=True),
            output_box: gr.update(visible=False),
            reload_btn: gr.update(visible=False),
        }

    save_btn.click(
        save_data,
        inputs=[img_to_label, label_dropdown],
        outputs=[img_to_label, label_dropdown, save_btn, output_box, reload_btn],
    )

    reload_btn.click(reload_data, outputs=[img_to_label, label_dropdown, save_btn, output_box, reload_btn])


if __name__ == "__main__":
    demo.launch(debug=True)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio
2
+ huggingface_hub
3
+ datasets
4
+ fuego
training/requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ tensorflow
2
+ huggingface_hub
3
+ datasets
4
+ Pillow
training/run.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import tensorflow as tf
3
+ from datasets import load_dataset
4
+ from huggingface_hub import create_repo, from_pretrained_keras, push_to_hub_keras
5
+ from tensorflow import keras
6
+ from tensorflow.keras import layers
7
+
8
# Dataset repositories used by the training script; each call creates the
# repository when it is missing and keeps the resulting repo id.
labeled_samples_repo_id = create_repo(
    "actlearn_labeled_samples", exist_ok=True, repo_type="dataset"
).repo_id
unlabled_samples_repo_id = create_repo(
    "actlearn_unlabeled_samples", exist_ok=True, repo_type="dataset"
).repo_id
to_label_samples_repo_id = create_repo(
    "actlearn_to_label_samples", exist_ok=True, repo_type="dataset"
).repo_id
test_dataset_repo_id = create_repo(
    "actlearn_test_mnist", exist_ok=True, repo_type="dataset"
).repo_id

# Repository the trained model is pushed to and loaded from.
model_repo_id = create_repo("actlearn_mnist_model", exist_ok=True).repo_id
13
+
14
+
15
+ def to_numpy(examples):
16
+ examples["pixel_values"] = [np.array(image.convert("1")) for image in examples["image"]]
17
+ return examples
18
+
19
+
20
def preprocess():
    """Load the labeled and test datasets and turn them into model inputs.

    Returns:
        A tuple ``(x_train, y_train, x_test, y_test)`` where the ``x`` values
        are the ``pixel_values`` columns with a trailing axis added and the
        ``y`` values are the ``label`` columns one-hot encoded over 10 classes.
    """
    num_classes = 10

    def _load_split(repo_id, split):
        # Load one split, attach pixel arrays, and build (features, targets).
        dataset = load_dataset(repo_id)[split].map(to_numpy, batched=True)
        features = np.expand_dims(dataset["pixel_values"], -1)
        targets = keras.utils.to_categorical(dataset["label"], num_classes)
        return features, targets

    x_train, y_train = _load_split(labeled_samples_repo_id, "train")
    x_test, y_test = _load_split(test_dataset_repo_id, "test")
    return x_train, y_train, x_test, y_test
42
+
43
+
44
def train():
    """Fit a small convolutional classifier and push it to the model repo.

    The data comes from ``preprocess()``. After fitting, the test loss and
    accuracy are printed and the model is uploaded with ``push_to_hub_keras``.
    """
    x_train, y_train, x_test, y_test = preprocess()
    input_shape = (28, 28, 1)
    num_classes = 10

    # Two conv/pool stages followed by dropout and a softmax classifier.
    architecture = [
        keras.Input(shape=input_shape),
        layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Flatten(),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation="softmax"),
    ]
    model = keras.Sequential(architecture)

    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    model.fit(x_train, y_train, batch_size=128, epochs=4, validation_split=0.1)

    # evaluate() yields the loss first, then the compiled "accuracy" metric.
    test_loss, test_accuracy = model.evaluate(x_test, y_test, verbose=0)
    print("Test loss:", test_loss)
    print("Test accuracy:", test_accuracy)

    push_to_hub_keras(model, model_repo_id)
70
+
71
+
72
def find_samples_to_label(num_samples=5):
    """Pick the samples the model is least sure about and queue them for labeling.

    The model stored in ``model_repo_id`` predicts every sample of the
    unlabeled pool. The samples whose highest class probability is lowest are
    pushed to the "to label" dataset and removed from the unlabeled pool.

    Args:
        num_samples: Maximum number of samples to queue. Defaults to 5. When
            the pool holds fewer samples, all of them are selected.
    """
    loaded_model = from_pretrained_keras(model_repo_id)
    loaded_model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

    unlabeled_data = load_dataset(unlabled_samples_repo_id)["train"]
    pool_size = len(unlabeled_data)

    # Never ask for more samples than the pool holds: np.argpartition raises
    # when the requested position is out of bounds.
    num_to_select = min(num_samples, pool_size)
    if num_to_select <= 0:
        print("No unlabeled samples available to select from.")
        return

    processed_data = unlabeled_data.map(to_numpy, batched=True)["pixel_values"]
    processed_data = tf.expand_dims(processed_data, -1)

    # Get all predictions and score each sample by 1 - (top class probability),
    # so a higher score means a less confident prediction.
    preds = loaded_model.predict(processed_data)
    uncertainty = 1 - np.max(preds, axis=1)
    # Indices of the `num_to_select` highest uncertainty scores (unordered).
    idx_to_label = np.argpartition(uncertainty, -num_to_select)[-num_to_select:]

    # Upload samples to the dataset to label
    to_label_data = unlabeled_data.select(idx_to_label)
    to_label_data.push_to_hub(to_label_samples_repo_id)

    # Remove from the pool of samples. The lookup set is built once rather
    # than once per candidate index.
    selected = set(idx_to_label.tolist())
    remaining = [i for i in range(pool_size) if i not in selected]
    # NOTE(review): when every sample was selected, `remaining` is empty and
    # this pushes an empty dataset, which may not be supported - confirm.
    unlabeled_data = unlabeled_data.select(remaining)
    unlabeled_data.push_to_hub(unlabled_samples_repo_id)
94
+
95
+
96
def main():
    """Run one round: retrain the model, then choose new samples to label."""
    train()
    find_samples_to_label()


if __name__ == "__main__":
    main()