Spaces:
Runtime error
Runtime error
Upload . with huggingface_hub
Browse files- __pycache__/app.cpython-38.pyc +0 -0
- app.py +148 -0
- requirements.txt +4 -0
- training/requirements.txt +4 -0
- training/run.py +102 -0
__pycache__/app.cpython-38.pyc
ADDED
Binary file (4.82 kB). View file
|
|
app.py
ADDED
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
import datasets
|
4 |
+
import fuego
|
5 |
+
import gradio as gr
|
6 |
+
from datasets import load_dataset
|
7 |
+
from huggingface_hub import HfFolder, create_repo, delete_repo, login
|
8 |
+
from PIL import Image
|
9 |
+
|
10 |
+
|
11 |
+
# Turn off caching in the `datasets` library for this process.
datasets.disable_caching()

# Authenticate with the Hub: use the HUGGING_FACE_HUB_TOKEN environment
# variable when set, otherwise fall back to the locally stored token.
login(token=os.getenv("HUGGING_FACE_HUB_TOKEN", HfFolder.get_token()), add_to_git_credential=True)

# Make sure every repo used by the demo exists and keep its full repo id.
labeled_samples_repo_id = create_repo("actlearn_labeled_samples", exist_ok=True, repo_type="dataset").repo_id
unlabled_samples_repo_id = create_repo("actlearn_unlabeled_samples", exist_ok=True, repo_type="dataset").repo_id
to_label_samples_repo_id = create_repo("actlearn_to_label_samples", exist_ok=True, repo_type="dataset").repo_id
test_dataset_repo_id = create_repo("actlearn_test_mnist", exist_ok=True, repo_type="dataset").repo_id
model_repo_id = create_repo("actlearn_mnist_model", exist_ok=True).repo_id


# Position in `imgs` of the next image to show to the user.
idx = 0
try:
    data_to_label = load_dataset(to_label_samples_repo_id)
    imgs = data_to_label["train"]["image"]
except Exception:
    # The to-label dataset could not be loaded (for example because it is
    # empty), so start with nothing to label. `Exception` is used instead of
    # a bare `except` so KeyboardInterrupt/SystemExit still propagate, which
    # matches the handler in reload_data().
    imgs = None
    data_to_label = None
|
29 |
+
|
30 |
+
|
31 |
+
def get_image():
    """Return the next image to label and advance the global cursor.

    Reads the module-level ``imgs`` sequence and the module-level ``idx``
    cursor; ``idx`` is incremented only when an image is actually returned.

    Returns:
        The element of ``imgs`` at position ``idx``, or ``None`` when no
        images are loaded or every image has already been handed out.
    """
    global idx
    # Indexing past the end would raise IndexError. This also covers an empty
    # `imgs`, which matters because the UI calls this function while it is
    # being constructed.
    if imgs is None or idx >= len(imgs):
        return None
    new_img = imgs[idx]
    idx += 1
    return new_img
|
38 |
+
|
39 |
+
|
40 |
+
# [image, label] pairs collected from the user during the current round.
labeled_data = list()

# Markdown shown at the top of the demo page.
information = """# Active Learning Demo
This demo showcases Active Learning, which is great when labeling is expensive. In this demo, you will label images by choosing a digit (0-9).
How does this work?
* There is a large pool of unlabeled images
* A model is trained with the few labeled images
* We can then use the model to pick the images with the lowest confidence or with the lowest probability of corresponding to an image. These are the images for which the model is confused, so by improving them, the quality of the model can improve much more than queries for which the model was already doing well!
* In this UI, you will be provided a couple of images to label
* Once all the provided images are labeled, the model is retrained, and a new set of images is chosen!
"""

# Markdown shown once a full batch has been labeled and retraining started.
training_info = """## Model Retraining
There are new labeled images. The model is retraining. Follow progress in the "fuego" space that was spun up for you in your profile.
"""
|
55 |
+
|
56 |
+
with gr.Blocks() as demo:
    gr.Markdown(information)

    # The labeling widgets are shown only when there is at least one image to
    # label; otherwise only the reload button is visible. An empty list is
    # treated the same as "no dataset" so the first image lookup cannot fail.
    _has_images = imgs is not None and len(imgs) > 0

    img_to_label = gr.Image(
        shape=[28, 28], value=get_image() if _has_images else None, visible=_has_images
    )
    label_dropdown = gr.Dropdown(
        choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], interactive=True, value=0, visible=_has_images
    )
    save_btn = gr.Button("Save label", visible=_has_images)
    output_box = gr.Markdown(value=training_info, visible=False)
    reload_btn = gr.Button("Reload", visible=not _has_images)

    def _no_images_update():
        """Return the component updates for the "nothing to label" state."""
        return {
            img_to_label: gr.update(visible=False, value=None),
            label_dropdown: gr.update(visible=False),
            save_btn: gr.update(visible=False),
            output_box: gr.update(visible=True, value="No more images to label"),
            reload_btn: gr.update(visible=True),
        }

    def save_data(img, label):
        """Record one label; once the batch is complete, publish it and retrain.

        Args:
            img: Current value of the image component.
            label: Digit chosen in the dropdown.

        Returns:
            A dict of component updates: either the next image to label, or
            the "model is retraining" state after the last image of the batch.
        """
        global labeled_data
        global idx

        labeled_data.append([img, label])

        if imgs is not None and idx >= len(imgs):
            # Push to the training dataset first, so the labels are stored
            # before the queue of images to label is destroyed.
            labeled_dataset = load_dataset(labeled_samples_repo_id)["train"]
            feature = datasets.Image(decode=False)
            for labeled_img, labeled_value in labeled_data:
                # Hack due to https://github.com/huggingface/datasets/issues/4796
                labeled_dataset = labeled_dataset.add_item(
                    {"image": feature.encode_example(Image.fromarray(labeled_img)), "label": labeled_value}
                )
            labeled_dataset.push_to_hub(labeled_samples_repo_id)

            # Remove dataset of queries to label
            # datasets library does not allow pushing an empty dataset, so as a
            # workaround we just delete the repo
            delete_repo(repo_id=to_label_samples_repo_id, repo_type="dataset")
            create_repo(repo_id=to_label_samples_repo_id, repo_type="dataset")

            # Clean up data
            labeled_data = []
            idx = 0

            # Kick off retraining with the training script and its requirements.
            fuego.run("training/run.py", "training/requirements.txt", space_id="actlearn-fuego-runner")

            # Update UI
            return {
                img_to_label: gr.update(visible=False),
                label_dropdown: gr.update(visible=False),
                save_btn: gr.update(visible=False),
                output_box: gr.update(visible=True, value=training_info),
                reload_btn: gr.update(visible=True),
            }

        return {img_to_label: gr.update(value=get_image())}

    def reload_data():
        """Reload the to-label dataset and switch the UI to the matching state.

        Returns:
            A dict of component updates: the labeling widgets with the first
            image when images are available, otherwise the "No more images to
            label" state (also used when the dataset cannot be loaded).
        """
        global data_to_label
        global imgs
        global idx

        try:
            # See if there is new data to be labeled
            data_to_label = load_dataset(to_label_samples_repo_id)
            imgs = data_to_label["train"]["image"]
        except Exception:
            imgs = None
            data_to_label = None
            return _no_images_update()

        if len(imgs) == 0:
            # Return an explicit UI state instead of a bare None, because five
            # output components are wired to this handler.
            return _no_images_update()

        idx = 0
        return {
            img_to_label: gr.update(visible=True, value=get_image()),
            label_dropdown: gr.update(visible=True),
            save_btn: gr.update(visible=True),
            output_box: gr.update(visible=False),
            reload_btn: gr.update(visible=False),
        }

    save_btn.click(
        save_data,
        inputs=[img_to_label, label_dropdown],
        outputs=[img_to_label, label_dropdown, save_btn, output_box, reload_btn],
    )

    reload_btn.click(reload_data, outputs=[img_to_label, label_dropdown, save_btn, output_box, reload_btn])


if __name__ == "__main__":
    demo.launch(debug=True)
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio
|
2 |
+
huggingface_hub
|
3 |
+
datasets
|
4 |
+
fuego
|
training/requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
tensorflow
|
2 |
+
huggingface_hub
|
3 |
+
datasets
|
4 |
+
Pillow
|
training/run.py
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import tensorflow as tf
|
3 |
+
from datasets import load_dataset
|
4 |
+
from huggingface_hub import create_repo, from_pretrained_keras, push_to_hub_keras
|
5 |
+
from tensorflow import keras
|
6 |
+
from tensorflow.keras import layers
|
7 |
+
|
8 |
+
# Resolve the Hub repos used by training (created if they do not exist yet).
# The repo names are the same ones the labeling app uses.
labeled_samples_repo_id = create_repo(
    "actlearn_labeled_samples", repo_type="dataset", exist_ok=True
).repo_id
unlabled_samples_repo_id = create_repo(
    "actlearn_unlabeled_samples", repo_type="dataset", exist_ok=True
).repo_id
to_label_samples_repo_id = create_repo(
    "actlearn_to_label_samples", repo_type="dataset", exist_ok=True
).repo_id
test_dataset_repo_id = create_repo(
    "actlearn_test_mnist", repo_type="dataset", exist_ok=True
).repo_id
# No repo_type is passed for the model repo.
model_repo_id = create_repo("actlearn_mnist_model", exist_ok=True).repo_id
|
13 |
+
|
14 |
+
|
15 |
+
def to_numpy(examples):
    """Add a ``pixel_values`` entry holding every image as a NumPy array.

    Each item of ``examples["image"]`` is converted with ``convert("1")``
    and then wrapped in ``np.array``. The batch is modified in place.

    Args:
        examples: Batch mapping with an ``"image"`` entry.

    Returns:
        The same ``examples`` object, now with a ``"pixel_values"`` entry.
    """
    pixel_arrays = []
    for picture in examples["image"]:
        pixel_arrays.append(np.array(picture.convert("1")))
    examples["pixel_values"] = pixel_arrays
    return examples
|
18 |
+
|
19 |
+
|
20 |
+
def preprocess():
    """Load the labeled and test datasets and turn them into model inputs.

    Both datasets are mapped through ``to_numpy``; the pixel arrays get a
    trailing axis added and the labels are one-hot encoded over 10 classes.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)``.
    """
    num_classes = 10

    train_split = load_dataset(labeled_samples_repo_id)["train"].map(to_numpy, batched=True)
    test_split = load_dataset(test_dataset_repo_id)["test"].map(to_numpy, batched=True)

    # Append a trailing axis to the pixel arrays.
    x_train = np.expand_dims(train_split["pixel_values"], -1)
    x_test = np.expand_dims(test_split["pixel_values"], -1)

    # One-hot encode the integer labels.
    y_train = keras.utils.to_categorical(train_split["label"], num_classes)
    y_test = keras.utils.to_categorical(test_split["label"], num_classes)

    return x_train, y_train, x_test, y_test
|
42 |
+
|
43 |
+
|
44 |
+
def train():
    """Train a small CNN on the labeled samples and push it to the Hub.

    The data comes from ``preprocess()``. The model is fitted for 4 epochs
    with 10% of the training data held out for validation, evaluated on the
    test set (loss and accuracy are printed), and uploaded to
    ``model_repo_id``.
    """
    num_classes = 10
    input_shape = (28, 28, 1)
    x_train, y_train, x_test, y_test = preprocess()

    # Two conv/pool stages followed by dropout and a softmax classifier.
    layer_stack = [
        keras.Input(shape=input_shape),
        layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Flatten(),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation="softmax"),
    ]
    model = keras.Sequential(layer_stack)

    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    model.fit(x_train, y_train, batch_size=128, epochs=4, validation_split=0.1)

    # evaluate() yields the loss followed by the compiled metric (accuracy).
    score = model.evaluate(x_test, y_test, verbose=0)
    test_loss, test_accuracy = score[0], score[1]
    print("Test loss:", test_loss)
    print("Test accuracy:", test_accuracy)

    push_to_hub_keras(model, model_repo_id)
|
70 |
+
|
71 |
+
|
72 |
+
def find_samples_to_label():
    """Queue the unlabeled samples the model is least confident about.

    Loads the model from ``model_repo_id``, predicts on the whole unlabeled
    pool, selects up to 5 samples whose top predicted probability is lowest,
    pushes them to ``to_label_samples_repo_id`` and pushes the remaining pool
    back to ``unlabled_samples_repo_id``.

    Returns:
        None. Returns early, without pushing anything, when the pool is empty.
    """
    num_to_label = 5

    loaded_model = from_pretrained_keras(model_repo_id)
    loaded_model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

    unlabeled_data = load_dataset(unlabled_samples_repo_id)["train"]
    pool_size = len(unlabeled_data)
    if pool_size == 0:
        # Nothing to rank; argpartition below needs at least one element.
        print("No unlabeled samples left; nothing to queue for labeling.")
        return

    processed_data = unlabeled_data.map(to_numpy, batched=True)
    processed_data = processed_data["pixel_values"]
    processed_data = tf.expand_dims(processed_data, -1)

    # Get all predictions, then score each sample by how far its best class
    # probability is from 1: a higher score means a less confident model.
    preds = loaded_model.predict(processed_data)
    uncertainty = 1 - np.max(preds, axis=1)

    # Clamp to the pool size: argpartition raises when asked for more
    # elements than the array holds.
    k = min(num_to_label, pool_size)
    idx_to_label = np.argpartition(uncertainty, -k)[-k:]

    # Upload samples to the dataset to label
    to_label_data = unlabeled_data.select(idx_to_label)
    to_label_data.push_to_hub(to_label_samples_repo_id)

    # Remove from the pool of samples. The lookup set is built once rather
    # than once per pool index.
    chosen = set(idx_to_label.tolist())
    remaining = [i for i in range(pool_size) if i not in chosen]
    # NOTE(review): when every sample was chosen, `remaining` is empty and
    # pushing an empty dataset may be rejected by the datasets library.
    # Confirm the desired behaviour for that case.
    unlabeled_data = unlabeled_data.select(remaining)
    unlabeled_data.push_to_hub(unlabled_samples_repo_id)
|
94 |
+
|
95 |
+
|
96 |
+
def main():
    """Retrain the model, then pick the next samples to be labeled."""
    train()
    find_samples_to_label()


# Run the full training round only when executed as a script.
if __name__ == "__main__":
    main()
|