Create sequence_classification.py
Browse files- sequence_classification.py +180 -0
sequence_classification.py
ADDED
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
"""
|
3 |
+
We encourage you to log in to your Hugging Face account so you can upload and share your model with the community. When prompted, enter your token to log in:
|
4 |
+
"""
|
5 |
+
|
6 |
+
"""## Load IMDb dataset
|
7 |
+
|
8 |
+
Start by loading the IMDb dataset from the 🤗 Datasets library:
|
9 |
+
"""
|
10 |
+
|
11 |
+
from datasets import load_dataset
|
12 |
+
|
13 |
+
# The loaded object is indexed by split name further down ("train" / "test").
imdb = load_dataset("imdb")
|
14 |
+
|
15 |
+
"""Then take a look at an example:"""
|
16 |
+
|
17 |
+
# A bare expression only displays its value in a notebook cell; print it so the
# sample record is also shown when this file is run as a script.
print(imdb["test"][0])
|
18 |
+
|
19 |
+
"""There are two fields in this dataset:
|
20 |
+
|
21 |
+
- `text`: the movie review text.
|
22 |
+
- `label`: a value that is either `0` for a negative review or `1` for a positive review.
|
23 |
+
|
24 |
+
## Preprocess
|
25 |
+
|
26 |
+
The next step is to load a DistilBERT tokenizer to preprocess the `text` field:
|
27 |
+
"""
|
28 |
+
|
29 |
+
from transformers import AutoTokenizer
|
30 |
+
|
31 |
+
# Uses the same checkpoint name as the classification model loaded later in
# this file.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
|
32 |
+
|
33 |
+
"""Create a preprocessing function to tokenize `text` and truncate sequences to be no longer than DistilBERT's maximum input length:"""
|
34 |
+
|
35 |
+
def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Truncation is enabled (``truncation=True``); padding is not requested
    here.

    Args:
        examples: Mapping with a ``"text"`` entry holding the review text(s).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    review_texts = examples["text"]
    return tokenizer(review_texts, truncation=True)
|
37 |
+
|
38 |
+
"""To apply the preprocessing function over the entire dataset, use 🤗 Datasets [map](https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.map) function. You can speed up `map` by setting `batched=True` to process multiple elements of the dataset at once:"""
|
39 |
+
|
40 |
+
# batched=True makes `map` hand several examples per call to
# preprocess_function instead of one at a time.
tokenized_imdb = imdb.map(preprocess_function, batched=True)
|
41 |
+
|
42 |
+
"""Now create a batch of examples using [DataCollatorWithPadding](https://huggingface.co/docs/transformers/main/en/main_classes/data_collator#transformers.DataCollatorWithPadding). It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length."""
|
43 |
+
|
44 |
+
from transformers import DataCollatorWithPadding
|
45 |
+
|
46 |
+
# Pads each batch to its longest sequence at collation time (dynamic padding).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
47 |
+
|
48 |
+
"""## Evaluate
|
49 |
+
|
50 |
+
Including a metric during training is often helpful for evaluating your model's performance. You can quickly load an evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric):
|
51 |
+
"""
|
52 |
+
|
53 |
+
import evaluate
|
54 |
+
|
55 |
+
# Metric object consumed by compute_metrics() below.
accuracy = evaluate.load("accuracy")
|
56 |
+
|
57 |
+
"""Then create a function that passes your predictions and labels to [compute](https://huggingface.co/docs/evaluate/main/en/package_reference/main_classes#evaluate.EvaluationModule.compute) to calculate the accuracy:"""
|
58 |
+
|
59 |
+
import numpy as np
|
60 |
+
|
61 |
+
|
62 |
+
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. The predicted class of each
            example is taken as the argmax of ``predictions`` along axis 1.

    Returns:
        The result of ``accuracy.compute`` for the predicted class ids
        against ``labels``.
    """
    class_scores, references = eval_pred
    predicted_ids = np.argmax(class_scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)
|
66 |
+
|
67 |
+
"""Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training.
|
68 |
+
|
69 |
+
## Train
|
70 |
+
|
71 |
+
Before you start training your model, create a map of the expected ids to their labels with `id2label` and `label2id`:
|
72 |
+
"""
|
73 |
+
|
74 |
+
# Class id -> label name, plus the inverse mapping derived from it so the two
# cannot drift apart.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {name: class_id for class_id, name in id2label.items()}
|
76 |
+
|
77 |
+
"""<Tip>
|
78 |
+
|
79 |
+
If you aren't familiar with finetuning a model with the [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer), take a look at the basic tutorial [here](https://huggingface.co/docs/transformers/main/en/training#train-with-pytorch-trainer)!
|
80 |
+
|
81 |
+
</Tip>
|
82 |
+
|
83 |
+
You're ready to start training your model now! Load DistilBERT with [AutoModelForSequenceClassification](https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModelForSequenceClassification) along with the number of expected labels, and the label mappings:
|
84 |
+
"""
|
85 |
+
|
86 |
+
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
|
87 |
+
|
88 |
+
# Load DistilBERT for sequence classification; the number of labels follows
# the two-entry id2label map, and both label maps are passed along to the model.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
|
91 |
+
|
92 |
+
"""At this point, only three steps remain:
|
93 |
+
|
94 |
+
1. Define your training hyperparameters in [TrainingArguments](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments). The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) will evaluate the accuracy and save the training checkpoint.
|
95 |
+
2. Pass the training arguments to [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) along with the model, dataset, tokenizer, data collator, and `compute_metrics` function.
|
96 |
+
3. Call [train()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.train) to finetune your model.
|
97 |
+
"""
|
98 |
+
|
99 |
+
# Hyperparameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    # Output location and Hub upload.
    output_dir="my_tc_model",
    push_to_hub=True,
    # Optimisation settings.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint.
    # NOTE(review): recent transformers releases spell the first option
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
|
111 |
+
|
112 |
+
# Wire the model, its training arguments, the tokenized splits and the
# batching/metric helpers into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    # Train on the "train" split, evaluate on the "test" split.
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    # Batching and scoring helpers defined earlier in this file.
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
|
121 |
+
|
122 |
+
# Run fine-tuning with the arguments configured above (2 epochs, with
# evaluation and checkpointing set to "epoch").
trainer.train()
|
123 |
+
|
124 |
+
"""<Tip>
|
125 |
+
|
126 |
+
[Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) applies dynamic padding by default when you pass `tokenizer` to it. In this case, you don't need to specify a data collator explicitly.
|
127 |
+
|
128 |
+
</Tip>
|
129 |
+
|
130 |
+
Once training is completed, share your model to the Hub with the [push_to_hub()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.push_to_hub) method so everyone can use your model:
|
131 |
+
"""
|
132 |
+
|
133 |
+
# Upload the trained model to the Hugging Face Hub; per the note at the top of
# this file, this requires being logged in.
trainer.push_to_hub()
|
134 |
+
|
135 |
+
"""<Tip>
|
136 |
+
|
137 |
+
For a more in-depth example of how to finetune a model for text classification, take a look at the corresponding
|
138 |
+
[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb)
|
139 |
+
or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb).
|
140 |
+
|
141 |
+
</Tip>
|
142 |
+
|
143 |
+
## Inference
|
144 |
+
|
145 |
+
Great, now that you've finetuned a model, you can use it for inference!
|
146 |
+
|
147 |
+
Grab some text you'd like to run inference on:
|
148 |
+
"""
|
149 |
+
|
150 |
+
# Sample review used by both inference paths below (the pipeline call and the
# manual tokenizer + model forward pass).
text = "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three."
|
151 |
+
|
152 |
+
"""The simplest way to try out your finetuned model for inference is to use it in a [pipeline()](https://huggingface.co/docs/transformers/main/en/main_classes/pipelines#transformers.pipeline). Instantiate a `pipeline` for sentiment analysis with your model, and pass your text to it:"""
|
153 |
+
|
154 |
+
from transformers import pipeline
|
155 |
+
|
156 |
+
# Build a sentiment-analysis pipeline and print its prediction for `text`.
# NOTE(review): this loads the Hub checkpoint "stevhliu/my_awesome_model", not
# the "my_tc_model" output of the training run above — confirm that is intended.
classifier = pipeline(task="sentiment-analysis", model="stevhliu/my_awesome_model")
print(classifier(text))
|
158 |
+
|
159 |
+
"""You can also manually replicate the results of the `pipeline` if you'd like:
|
160 |
+
|
161 |
+
Tokenize the text and return PyTorch tensors:
|
162 |
+
"""
|
163 |
+
|
164 |
+
from transformers import AutoTokenizer
|
165 |
+
|
166 |
+
# Rebind `tokenizer` to the one published with the inference checkpoint, then
# encode `text` as PyTorch tensors (return_tensors="pt").
tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_model")
inputs = tokenizer(text, return_tensors="pt")
|
168 |
+
|
169 |
+
"""Pass your inputs to the model and return the `logits`:"""
|
170 |
+
|
171 |
+
from transformers import AutoModelForSequenceClassification
|
172 |
+
|
173 |
+
# Rebind `model` to the inference checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    "stevhliu/my_awesome_model"
)
# Gradients are not needed for inference, so run the forward pass under
# torch.no_grad() and keep only the logits.
with torch.no_grad():
    logits = model(**inputs).logits
|
176 |
+
|
177 |
+
"""Get the class with the highest probability, and use the model's `id2label` mapping to convert it to a text label:"""
|
178 |
+
|
179 |
+
# Take the index of the largest logit as a plain int, then print the label
# name that the model config's id2label map assigns to it.
predicted_class_id = int(torch.argmax(logits))
print(model.config.id2label[predicted_class_id])
|