---
language: en
license: apache-2.0
datasets:
- bookcorpus
- wikipedia
---

# DistilBERT base model (uncased) for Interactive Fiction

[`distilbert-base-uncased`](https://huggingface.co/distilbert-base-uncased) finetuned on a dataset of Interactive Fiction commands.

Details on the datasets can be found [here](https://github.com/aporporato/jericho-corpora).

The resulting model scored an accuracy of 0.976253 on the WordNet task test set.

## How to use the discriminator in `transformers`

```python
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer

discriminator = TFAutoModelForSequenceClassification.from_pretrained("Aureliano/distilbert-base-uncased-if")
tokenizer = AutoTokenizer.from_pretrained("Aureliano/distilbert-base-uncased-if")

text = "get lamp"
encoded_input = tokenizer(text, return_tensors='tf')
output = discriminator(encoded_input)
prediction = tf.nn.softmax(output["logits"][0], -1)
label = discriminator.config.id2label[tf.math.argmax(prediction).numpy()]
print(text, ":", label)  # take.v.04 -> "get into one's hands, take physically"
```
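The same discriminator and tokenizer can also score several commands in one batched call. A minimal sketch (the command list is made up for illustration; the snippet reuses the `discriminator` and `tokenizer` loaded above):

```python
# Hypothetical batch of Interactive Fiction commands.
commands = ["look", "take sword", "drop lantern"]

# Pad the batch to a common length and classify all commands in a single forward pass.
encoded_batch = tokenizer(commands, padding=True, return_tensors="tf")
logits = discriminator(encoded_batch)["logits"]
probabilities = tf.nn.softmax(logits, axis=-1)

for command, probs in zip(commands, probabilities):
    predicted_id = int(tf.math.argmax(probs).numpy())
    print(command, ":", discriminator.config.id2label[predicted_id])
```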

## How to use the discriminator in `transformers` on a custom dataset

(Heavily based on: https://github.com/huggingface/notebooks/blob/master/examples/text_classification-tf.ipynb)

```python
import math
import numpy as np

import tensorflow as tf
from datasets import load_metric, Dataset, DatasetDict
from transformers import TFAutoModel, TFAutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, create_optimizer
from transformers.keras_callbacks import KerasMetricCallback

# This example shows how this model can be used;
# you should fine-tune it on your own corpus of IF commands, larger than this toy dataset.
dict_train = {
    "idx": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18",
            "19", "20"],
    "sentence": ["e", "get pen", "drop book", "x paper", "i", "south", "get paper", "drop the pen", "x book",
                 "inventory", "n", "get the book", "drop paper", "look at Pen", "inv", "g", "s", "get sandwich",
                 "drop sandwich", "x sandwich", "agin"],
    "label": ["travel.v.01", "take.v.04", "drop.v.01", "examine.v.02", "inventory.v.01", "travel.v.01", "take.v.04",
              "drop.v.01", "examine.v.02", "inventory.v.01", "travel.v.01", "take.v.04", "drop.v.01", "examine.v.02",
              "inventory.v.01", "repeat.v.01", "travel.v.01", "take.v.04", "drop.v.01", "examine.v.02", "repeat.v.01"]
}
dict_val = {
    "idx": ["0", "1", "2", "3", "4", "5"],
    "sentence": ["w", "get shield", "drop sword", "x spikes", "i", "repeat"],
    "label": ["travel.v.01", "take.v.04", "drop.v.01", "examine.v.02", "inventory.v.01", "repeat.v.01"]
}

raw_train_dataset = Dataset.from_dict(dict_train)
raw_val_dataset = Dataset.from_dict(dict_val)
raw_dataset = DatasetDict()
raw_dataset["train"] = raw_train_dataset
raw_dataset["val"] = raw_val_dataset
raw_dataset = raw_dataset.class_encode_column("label")
print(raw_dataset)
print(raw_dataset["train"].features)
print(raw_dataset["val"].features)
print(raw_dataset["train"][1])

label2id = {}
id2label = {}
for i, l in enumerate(raw_dataset["train"].features["label"].names):
    label2id[l] = i
    id2label[i] = l

discriminator = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased",
                                                                     label2id=label2id,
                                                                     id2label=id2label)
discriminator.distilbert = TFAutoModel.from_pretrained("Aureliano/distilbert-base-uncased-if")
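# Note: the two lines above build a fresh classification head sized for the new label set
# on top of "distilbert-base-uncased" and then swap in the Interactive Fiction encoder
# "Aureliano/distilbert-base-uncased-if". Only the encoder is pre-trained; the head starts
# from random weights and is learned during fit() below.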
tokenizer = AutoTokenizer.from_pretrained("Aureliano/distilbert-base-uncased-if")

tokenize_function = lambda example: tokenizer(example["sentence"], truncation=True)

pre_tokenizer_columns = set(raw_dataset["train"].features)
encoded_dataset = raw_dataset.map(tokenize_function, batched=True)
tokenizer_columns = list(set(encoded_dataset["train"].features) - pre_tokenizer_columns)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

batch_size = len(encoded_dataset["train"])  # the toy training set fits in a single batch
tf_train_dataset = encoded_dataset["train"].to_tf_dataset(
    columns=tokenizer_columns,
    label_cols=["labels"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator
)
tf_validation_dataset = encoded_dataset["val"].to_tf_dataset(
    columns=tokenizer_columns,
    label_cols=["labels"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator
)

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
num_epochs = 20
batches_per_epoch = math.ceil(len(encoded_dataset["train"]) / batch_size)
total_train_steps = int(batches_per_epoch * num_epochs)

optimizer, schedule = create_optimizer(
    init_lr=2e-5, num_warmup_steps=total_train_steps // 5, num_train_steps=total_train_steps
)
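# Note: create_optimizer returns an AdamW-style optimizer paired with a linear learning-rate
# schedule; here roughly 20% of the training steps (total_train_steps // 5) are used as
# warmup before the rate decays.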

metric = load_metric("accuracy")


def compute_metrics(eval_predictions):
    logits, labels = eval_predictions
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_dataset)
callbacks = [metric_callback]

discriminator.compile(optimizer=optimizer, loss=loss, metrics=["sparse_categorical_accuracy"])
discriminator.fit(
    tf_train_dataset,
    epochs=num_epochs,
    validation_data=tf_validation_dataset,
    callbacks=callbacks
)

print("Evaluate on test data")
results = discriminator.evaluate(tf_validation_dataset)
print("test loss, test acc:", results)

text = "i"
encoded_input = tokenizer(text, return_tensors='tf')
output = discriminator(encoded_input)
prediction = tf.nn.softmax(output["logits"][0], -1)
label = id2label[tf.math.argmax(prediction).numpy()]
print("\n", text, ":", label,
      "\n")  # ideally 'inventory.v.01' (-> "make or include in an itemized record or report"), but probably only with a better finetuning dataset

text = "get lamp"
encoded_input = tokenizer(text, return_tensors='tf')
output = discriminator(encoded_input)
prediction = tf.nn.softmax(output["logits"][0], -1)
label = id2label[tf.math.argmax(prediction).numpy()]
print("\n", text, ":", label,
      "\n")  # ideally 'take.v.04' (-> "get into one's hands, take physically"), but probably only with a better finetuning dataset

text = "w"
encoded_input = tokenizer(text, return_tensors='tf')
output = discriminator(encoded_input)
prediction = tf.nn.softmax(output["logits"][0], -1)
label = id2label[tf.math.argmax(prediction).numpy()]
print("\n", text, ":", label,
      "\n")  # ideally 'travel.v.01' (-> "change location; move, travel, or proceed, also metaphorically"), but probably only with a better finetuning dataset
```
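If the fine-tuned discriminator is to be reused later, it can be exported and reloaded with the standard `save_pretrained`/`from_pretrained` round trip. A minimal sketch (the directory name is arbitrary):

```python
# Hypothetical output directory for the fine-tuned weights.
save_dir = "distilbert-base-uncased-if-finetuned"

discriminator.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

# Reload later; the label mapping passed at construction time is stored in the saved config.
reloaded = TFAutoModelForSequenceClassification.from_pretrained(save_dir)
print(reloaded.config.id2label)
```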

## How to use in a Rasa pipeline

The model can be integrated into a Rasa pipeline through a [`LanguageModelFeaturizer`](https://rasa.com/docs/rasa/components#languagemodelfeaturizer):

```yaml
recipe: default.v1
language: en

pipeline:
  # See https://rasa.com/docs/rasa/tuning-your-model for more information.
  ...
  - name: "WhitespaceTokenizer"
  ...
  - name: LanguageModelFeaturizer
    model_name: "distilbert"
    model_weights: "Aureliano/distilbert-base-uncased-if"
  ...
```
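The `...` lines stand for the rest of the pipeline configuration. `LanguageModelFeaturizer` requires tokens and only produces dense features, so a tokenizer (such as the `WhitespaceTokenizer` above) must come before it and an intent classifier (for example Rasa's `DIETClassifier`) should come after it to consume the features.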