Aureliano committed on
Commit
74478a0
1 Parent(s): c15a48c

Info updated

Files changed (1)
  1. README.md +133 -37
README.md CHANGED
@@ -1,94 +1,190 @@
---
- license: apache-2.0
---
## How to use the discriminator in `transformers`
```python
```
 
## How to use the discriminator in `transformers` on a custom dataset
(Heavily based on: https://github.com/huggingface/notebooks/blob/master/examples/text_classification-tf.ipynb)

```python
import math

import tensorflow as tf
- from datasets import Dataset, ClassLabel, Features, Value
- from transformers import TFAutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, create_optimizer

# This example shows how this model can be used:
- # you should finetune the model on your specific corpus of commands, bigger than this
dict_train = {
- "idx": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15"],
- "sentence": ["e", "get pen", "drop book", "x paper", "i", "south", "get paper", "drop pen", "x book", "inventory",
- "n", "get book", "drop paper", "examine Pen", "inv", "w"],
- "label": ["v01835496", "v01214265", "v01977701", "v02131279", "v02472495", "v01835496", "v01214265", "v01977701",
- "v02131279", "v02472495", "v01835496", "v01214265", "v01977701", "v02131279", "v02472495", "v01835496"]
}
 
- num_labels = len(set(dict_train["label"]))
- features = Features({'idx': Value('uint32'), 'sentence': Value('string'),
- 'label': ClassLabel(names=list(set(dict_train["label"])))})
-
- raw_train_dataset = Dataset.from_dict(dict_train, features=features)
-
- discriminator = TFAutoModelForSequenceClassification.from_pretrained("Aureliano/distilbert-base-uncased-if", num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained("Aureliano/distilbert-base-uncased-if")
 
tokenize_function = lambda example: tokenizer(example["sentence"], truncation=True)

- pre_tokenizer_columns = set(raw_train_dataset.features)
- train_dataset = raw_train_dataset.map(tokenize_function, batched=True)
- tokenizer_columns = list(set(train_dataset.features) - pre_tokenizer_columns)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

- batch_size = 16
- tf_train_dataset = train_dataset.to_tf_dataset(
    columns=tokenizer_columns,
    label_cols=["labels"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator
)
 
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
- num_epochs = 100
- batches_per_epoch = math.ceil(len(train_dataset) / batch_size)
total_train_steps = int(batches_per_epoch * num_epochs)

optimizer, schedule = create_optimizer(
- init_lr=1e-6, num_warmup_steps=1, num_train_steps=total_train_steps
)

- discriminator.compile(optimizer=optimizer, loss=loss)
  discriminator.fit(
    tf_train_dataset,
- epochs=num_epochs
)

text = "get lamp"
encoded_input = tokenizer(text, return_tensors='tf')
output = discriminator(encoded_input)
prediction = tf.nn.softmax(output["logits"][0], -1)
- label = dict_train["label"][tf.math.argmax(prediction)]
- print(text, ":", label)
- # ideally [v01214265 -> take.v.04 -> "get into one's hands, take physically"], but probably only with a better dataset
 
```

## How to use in a Rasa pipeline
- The model can be integrated in a Rasa pipeline through a [`LanguageModelFeaturizer`](https://rasa.com/docs/rasa/components#languagemodelfeaturizer)
```yaml
recipe: default.v1
language: en

pipeline:
- # See https://rasa.com/docs/rasa/tuning-your-model for more information.
- ...
- - name: "WhitespaceTokenizer"
...
- - name: LanguageModelFeaturizer
- model_name: "distilbert"
- model_weights: "Aureliano/distilbert-base-uncased-if"
- ...
```
 
---
+ language: en
+ license: apache-2.0
+ datasets:
+ - bookcorpus
+ - wikipedia
---
+
+ # DistilBERT base model (uncased) for Interactive Fiction
+
+ [`distilbert-base-uncased`](https://huggingface.co/distilbert-base-uncased) finetuned on a dataset of Interactive
+ Fiction commands.
+
+ Details on the datasets can be found [here](https://github.com/aporporato/jericho-corpora).
+
+ The resulting model scored an accuracy of 0.976253 on the WordNet task test set.
+
## How to use the discriminator in `transformers`
+
```python
+ import tensorflow as tf
+ from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
+
+ discriminator = TFAutoModelForSequenceClassification.from_pretrained("Aureliano/distilbert-base-uncased-if")
+ tokenizer = AutoTokenizer.from_pretrained("Aureliano/distilbert-base-uncased-if")
+
+ text = "get lamp"
+ encoded_input = tokenizer(text, return_tensors='tf')
+ output = discriminator(encoded_input)
+ prediction = tf.nn.softmax(output["logits"][0], -1)
+ label = discriminator.config.id2label[tf.math.argmax(prediction).numpy()]
+ print(text, ":", label)  # take.v.04 -> "get into one's hands, take physically"
+
```
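The labels are WordNet verb synset names. As a minimal sketch (not part of the original card, and assuming `nltk` with the WordNet corpus is available), a predicted synset name can be mapped back to its gloss like this:

```python
# Sketch only: look up the gloss of a synset name such as "take.v.04" with nltk.
# Assumes `pip install nltk` and a one-off `nltk.download("wordnet")`.
from nltk.corpus import wordnet as wn

# The card pairs take.v.04 with "get into one's hands, take physically".
print(wn.synset("take.v.04").definition())
```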
 
## How to use the discriminator in `transformers` on a custom dataset
+
(Heavily based on: https://github.com/huggingface/notebooks/blob/master/examples/text_classification-tf.ipynb)

```python
import math
+ import numpy as np

import tensorflow as tf
+ from datasets import load_metric, Dataset, DatasetDict
+ from transformers import TFAutoModel, TFAutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, create_optimizer
+ from transformers.keras_callbacks import KerasMetricCallback

# This example shows how this model can be used:
+ # you should finetune the model on your specific corpus of commands, bigger than this
dict_train = {
+     "idx": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18",
+     "19", "20"],
+     "sentence": ["e", "get pen", "drop book", "x paper", "i", "south", "get paper", "drop the pen", "x book",
+     "inventory", "n", "get the book", "drop paper", "look at Pen", "inv", "g", "s", "get sandwich",
+     "drop sandwich", "x sandwich", "agin"],
+     "label": ["travel.v.01", "take.v.04", "drop.v.01", "examine.v.02", "inventory.v.01", "travel.v.01", "take.v.04",
+     "drop.v.01", "examine.v.02", "inventory.v.01", "travel.v.01", "take.v.04", "drop.v.01", "examine.v.02",
+     "inventory.v.01", "repeat.v.01", "travel.v.01", "take.v.04", "drop.v.01", "examine.v.02", "repeat.v.01"]
+ }
+ dict_val = {
+     "idx": ["0", "1", "2", "3", "4", "5"],
+     "sentence": ["w", "get shield", "drop sword", "x spikes", "i", "repeat"],
+     "label": ["travel.v.01", "take.v.04", "drop.v.01", "examine.v.02", "inventory.v.01", "repeat.v.01"]
}

+ raw_train_dataset = Dataset.from_dict(dict_train)
+ raw_val_dataset = Dataset.from_dict(dict_val)
+ raw_dataset = DatasetDict()
+ raw_dataset["train"] = raw_train_dataset
+ raw_dataset["val"] = raw_val_dataset
+ raw_dataset = raw_dataset.class_encode_column("label")
+ print(raw_dataset)
+ print(raw_dataset["train"].features)
+ print(raw_dataset["val"].features)
+ print(raw_dataset["train"][1])
+ label2id = {}
+ id2label = {}
+ for i, l in enumerate(raw_dataset["train"].features["label"].names):
+     label2id[l] = i
+     id2label[i] = l
+
+ discriminator = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased",
+     label2id=label2id,
+     id2label=id2label)
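+ # swap the freshly initialised encoder for the DistilBERT weights finetuned on Interactive Fiction commands;
+ # only the classification head remains randomly initialised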
+ discriminator.distilbert = TFAutoModel.from_pretrained("Aureliano/distilbert-base-uncased-if")
tokenizer = AutoTokenizer.from_pretrained("Aureliano/distilbert-base-uncased-if")

tokenize_function = lambda example: tokenizer(example["sentence"], truncation=True)

+ pre_tokenizer_columns = set(raw_dataset["train"].features)
+ encoded_dataset = raw_dataset.map(tokenize_function, batched=True)
+ tokenizer_columns = list(set(encoded_dataset["train"].features) - pre_tokenizer_columns)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

+ batch_size = len(encoded_dataset["train"])
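+ # the toy training split above is small enough to be a single batch; a real corpus would use a fixed batch size (e.g. 16)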
+ tf_train_dataset = encoded_dataset["train"].to_tf_dataset(
    columns=tokenizer_columns,
    label_cols=["labels"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator
)
+ tf_validation_dataset = encoded_dataset["val"].to_tf_dataset(
+     columns=tokenizer_columns,
+     label_cols=["labels"],
+     shuffle=False,
+     batch_size=batch_size,
+     collate_fn=data_collator
+ )

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+ num_epochs = 20
+ batches_per_epoch = math.ceil(len(encoded_dataset["train"]) / batch_size)
total_train_steps = int(batches_per_epoch * num_epochs)

optimizer, schedule = create_optimizer(
+     init_lr=2e-5, num_warmup_steps=total_train_steps // 5, num_train_steps=total_train_steps
)

+ metric = load_metric("accuracy")
+
+
+ def compute_metrics(eval_predictions):
+     logits, labels = eval_predictions
+     predictions = np.argmax(logits, axis=-1)
+     return metric.compute(predictions=predictions, references=labels)
+
+
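+ # KerasMetricCallback runs compute_metrics on the validation set at the end of every epoch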
+ metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_dataset)
+ callbacks = [metric_callback]
+
+ discriminator.compile(optimizer=optimizer, loss=loss, metrics=["sparse_categorical_accuracy"])
discriminator.fit(
    tf_train_dataset,
+     epochs=num_epochs,
+     validation_data=tf_validation_dataset,
+     callbacks=callbacks
)

+ print("Evaluate on test data")
+ results = discriminator.evaluate(tf_validation_dataset)
+ print("test loss, test acc:", results)
+
+ text = "i"
+ encoded_input = tokenizer(text, return_tensors='tf')
+ output = discriminator(encoded_input)
+ prediction = tf.nn.softmax(output["logits"][0], -1)
+ label = id2label[tf.math.argmax(prediction).numpy()]
+ print("\n", text, ":", label,
+       "\n")  # ideally 'inventory.v.01' (-> "make or include in an itemized record or report"), but probably only with a better finetuning dataset
+
text = "get lamp"
encoded_input = tokenizer(text, return_tensors='tf')
output = discriminator(encoded_input)
prediction = tf.nn.softmax(output["logits"][0], -1)
+ label = id2label[tf.math.argmax(prediction).numpy()]
+ print("\n", text, ":", label,
+       "\n")  # ideally 'take.v.04' (-> "get into one's hands, take physically"), but probably only with a better finetuning dataset
+
+ text = "w"
+ encoded_input = tokenizer(text, return_tensors='tf')
+ output = discriminator(encoded_input)
+ prediction = tf.nn.softmax(output["logits"][0], -1)
+ label = id2label[tf.math.argmax(prediction).numpy()]
+ print("\n", text, ":", label,
+       "\n")  # ideally 'travel.v.01' (-> "change location; move, travel, or proceed, also metaphorically"), but probably only with a better finetuning dataset

```
 
## How to use in a Rasa pipeline
+
+ The model can be integrated in a Rasa pipeline through
+ a [`LanguageModelFeaturizer`](https://rasa.com/docs/rasa/components#languagemodelfeaturizer)
+
```yaml
recipe: default.v1
language: en

pipeline:
+ # See https://rasa.com/docs/rasa/tuning-your-model for more information.
+ ...
+ - name: "WhitespaceTokenizer"
+ ...
+ - name: LanguageModelFeaturizer
+   model_name: "distilbert"
+   model_weights: "Aureliano/distilbert-base-uncased-if"
...
```