muellerzr (HF staff) committed
Commit c995e38
1 Parent(s): 6685014

Core scripts work 1:1

scripts/core_example_multigpu.py ADDED
@@ -0,0 +1,151 @@
+ # coding=utf-8
+ # Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import os
+
+ import evaluate
+ import torch
+ import torch.distributed as torch_distributed
+ import transformers
+ from datasets import load_dataset
+ from torch.nn.parallel import DistributedDataParallel
+ from torch.optim import AdamW
+ from torch.utils.data import DataLoader
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup
+
+ from accelerate.utils import set_seed
+
+ transformers.logging.set_verbosity_error()
+
+
+ def get_dataloaders(batch_size: int = 16):
+     """
+     Creates a set of `DataLoader`s for the `glue` dataset,
+     using "bert-base-cased" as the tokenizer.
+
+     Args:
+         batch_size (`int`, *optional*):
+             The batch size for the train and validation DataLoaders.
+     """
+     tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+     datasets = load_dataset("glue", "mrpc")
+
+     def tokenize_function(examples):
+         # max_length=None => use the model max length (it's actually the default)
+         outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
+         return outputs
+
+     tokenized_datasets = datasets.map(
+         tokenize_function,
+         batched=True,
+         remove_columns=["idx", "sentence1", "sentence2"],
+     )
+     # Rename the 'label' column to 'labels', the argument name expected by transformers models
+     tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
+
+     def collate_fn(examples):
+         return tokenizer.pad(
+             examples,
+             padding="longest",
+             max_length=None,
+             pad_to_multiple_of=8,
+             return_tensors="pt",
+         )
+
+     train_dataloader = DataLoader(
+         tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size, drop_last=True
+     )
+     eval_dataloader = DataLoader(
+         tokenized_datasets["validation"],
+         shuffle=False,
+         collate_fn=collate_fn,
+         batch_size=32,
+         drop_last=False,
+     )
+
+     return train_dataloader, eval_dataloader
+
+
+ def training_function():
+     # Initialize the default process group and pin this process to its local GPU
+     torch_distributed.init_process_group(backend="nccl")
+     num_processes = torch_distributed.get_world_size()
+     process_index = torch_distributed.get_rank()
+     local_process_index = int(os.environ.get("LOCAL_RANK", -1))
+     device = torch.device("cuda", local_process_index)
+     torch.cuda.set_device(device)
+     config = {"lr": 2e-5, "num_epochs": 3, "seed": 42}
+     seed = int(config["seed"])
+     # NOTE: `nlp_example.py` uses a per-process batch size of 16 when more than one GPU is in use;
+     # confirm whether 32 is the intended value here.
+     batch_size = 32
+     config["batch_size"] = batch_size
+     metric = evaluate.load("glue", "mrpc")
+
+     set_seed(seed, device_specific=False)
+     train_dataloader, eval_dataloader = get_dataloaders(batch_size)
+     model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True).to(device)
+     model = DistributedDataParallel(
+         model, device_ids=[local_process_index], output_device=local_process_index
+     )
+
+     optimizer = AdamW(params=model.parameters(), lr=config["lr"])
+     lr_scheduler = get_linear_schedule_with_warmup(
+         optimizer=optimizer,
+         num_warmup_steps=0,
+         num_training_steps=(len(train_dataloader) * config["num_epochs"]),
+     )
+
+     current_step = 0
+     for epoch in range(config["num_epochs"]):
+         model.train()
+         total_loss = 0
+         for _, batch in enumerate(train_dataloader):
+             batch = batch.to(device)
+             outputs = model(**batch)
+             loss = outputs.loss
+             total_loss += loss.detach().cpu().float()
+             current_step += 1
+             loss.backward()
+             optimizer.step()
+             lr_scheduler.step()
+             optimizer.zero_grad()
+
+         model.eval()
+         for step, batch in enumerate(eval_dataloader):
+             # Move the batch to this process's GPU
+             batch = batch.to(device)
+             with torch.no_grad():
+                 outputs = model(**batch)
+             predictions = outputs.logits.argmax(dim=-1)
+             metric.add_batch(
+                 predictions=predictions,
+                 references=batch["labels"],
+             )
+
+         eval_metric = metric.compute()
+         # Only report results from the main process
+         if process_index == 0:
+             print(
+                 f"epoch {epoch}: {eval_metric}\n"
+                 f"train_loss: {total_loss.item() / len(train_dataloader)}"
+             )
+
+
+ def main():
+     training_function()
+
+
+ if __name__ == "__main__":
+     main()
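
One difference from the Accelerate version worth flagging for the 1:1 comparison: the `DataLoader`s above use no `DistributedSampler`, so every rank iterates over the full training set, whereas the dataloaders prepared by `Accelerator` in `scripts/nlp_example.py` are sharded across processes. A minimal sketch of how sharding could be added inside `get_dataloaders` (a hypothetical change, not part of this commit):

    from torch.utils.data import DataLoader, DistributedSampler

    # Give each process a distinct shard of the training set, mirroring what
    # `accelerator.prepare(train_dataloader)` does in nlp_example.py.
    train_sampler = DistributedSampler(tokenized_datasets["train"], shuffle=True, drop_last=True)
    train_dataloader = DataLoader(
        tokenized_datasets["train"],
        sampler=train_sampler,  # replaces shuffle=True; shuffling is handled by the sampler
        collate_fn=collate_fn,
        batch_size=batch_size,
        drop_last=True,
    )
    # Remember to call train_sampler.set_epoch(epoch) at the start of every epoch
    # so each epoch sees a different shuffle.
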
scripts/core_example_single_gpu.py ADDED
@@ -0,0 +1,138 @@
+ # coding=utf-8
+ # Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import evaluate
+ import torch
+ import transformers
+ from datasets import load_dataset
+ from torch.optim import AdamW
+ from torch.utils.data import DataLoader
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup
+
+ from accelerate.utils import set_seed
+
+ transformers.logging.set_verbosity_error()
+
+
+ def get_dataloaders(batch_size: int = 16):
+     """
+     Creates a set of `DataLoader`s for the `glue` dataset,
+     using "bert-base-cased" as the tokenizer.
+
+     Args:
+         batch_size (`int`, *optional*):
+             The batch size for the train and validation DataLoaders.
+     """
+     tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+     datasets = load_dataset("glue", "mrpc")
+
+     def tokenize_function(examples):
+         # max_length=None => use the model max length (it's actually the default)
+         outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
+         return outputs
+
+     tokenized_datasets = datasets.map(
+         tokenize_function,
+         batched=True,
+         remove_columns=["idx", "sentence1", "sentence2"],
+     )
+     # Rename the 'label' column to 'labels', the argument name expected by transformers models
+     tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
+
+     def collate_fn(examples):
+         return tokenizer.pad(
+             examples,
+             padding="longest",
+             max_length=None,
+             pad_to_multiple_of=8,
+             return_tensors="pt",
+         )
+
+     train_dataloader = DataLoader(
+         tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size, drop_last=True
+     )
+     eval_dataloader = DataLoader(
+         tokenized_datasets["validation"],
+         shuffle=False,
+         collate_fn=collate_fn,
+         batch_size=32,
+         drop_last=False,
+     )
+
+     return train_dataloader, eval_dataloader
+
+
+ def training_function():
+     config = {"lr": 2e-5, "num_epochs": 3, "seed": 42}
+     seed = int(config["seed"])
+     batch_size = 32
+     config["batch_size"] = batch_size
+     metric = evaluate.load("glue", "mrpc")
+
+     set_seed(seed, device_specific=False)
+     train_dataloader, eval_dataloader = get_dataloaders(batch_size)
+     model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)
+     model.cuda()
+
+     optimizer = AdamW(params=model.parameters(), lr=config["lr"])
+     lr_scheduler = get_linear_schedule_with_warmup(
+         optimizer=optimizer,
+         num_warmup_steps=0,
+         num_training_steps=(len(train_dataloader) * config["num_epochs"]),
+     )
+
+     current_step = 0
+     for epoch in range(config["num_epochs"]):
+         model.train()
+         total_loss = 0
+         for _, batch in enumerate(train_dataloader):
+             batch = batch.to("cuda")
+             outputs = model(**batch)
+             loss = outputs.loss
+             total_loss += loss.detach().cpu().float()
+             current_step += 1
+             loss.backward()
+             optimizer.step()
+             lr_scheduler.step()
+             optimizer.zero_grad()
+
+         model.eval()
+         for step, batch in enumerate(eval_dataloader):
+             # Move the batch to the GPU
+             batch = batch.to("cuda")
+             with torch.no_grad():
+                 outputs = model(**batch)
+             predictions = outputs.logits.argmax(dim=-1)
+             metric.add_batch(
+                 predictions=predictions,
+                 references=batch["labels"],
+             )
+
+         eval_metric = metric.compute()
+
+         print(f"epoch {epoch}:", eval_metric)
+         print("train_loss: ", total_loss.item() / len(train_dataloader))
+
+
+ def main():
+     training_function()
+
+
+ if __name__ == "__main__":
+     main()
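
Note that `scripts/nlp_example.py` below runs with `mixed_precision="fp16"`, while the two core scripts above train in full precision. If a strict 1:1 comparison is wanted, fp16 could be added to the raw training loop with `torch.cuda.amp`; a hypothetical sketch (not part of this commit), shown for the single-GPU loop:

    scaler = torch.cuda.amp.GradScaler()

    for _, batch in enumerate(train_dataloader):
        batch = batch.to("cuda")
        # Run the forward pass in fp16 where it is safe to do so
        with torch.cuda.amp.autocast():
            outputs = model(**batch)
            loss = outputs.loss
        total_loss += loss.detach().cpu().float()
        current_step += 1
        # Scale the loss to avoid fp16 gradient underflow; the scaler unscales before stepping
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        lr_scheduler.step()
        optimizer.zero_grad()
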
scripts/nlp_example.py ADDED
@@ -0,0 +1,184 @@
+ # coding=utf-8
+ # Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import evaluate
+ import torch
+ from datasets import load_dataset
+ from torch.optim import AdamW
+ from torch.utils.data import DataLoader
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup
+
+ from accelerate import Accelerator, DistributedType
+ from accelerate.utils import set_seed
+
+
+ def get_dataloaders(accelerator: Accelerator, batch_size: int = 16):
+     """
+     Creates a set of `DataLoader`s for the `glue` dataset,
+     using "bert-base-cased" as the tokenizer.
+
+     Args:
+         accelerator (`Accelerator`):
+             An `Accelerator` object
+         batch_size (`int`, *optional*):
+             The batch size for the train and validation DataLoaders.
+     """
+     tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+     datasets = load_dataset("glue", "mrpc")
+
+     def tokenize_function(examples):
+         # max_length=None => use the model max length (it's actually the default)
+         outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
+         return outputs
+
+     # Apply the method we just defined to all the examples in all the splits of the dataset,
+     # starting with the main process first:
+     with accelerator.main_process_first():
+         tokenized_datasets = datasets.map(
+             tokenize_function,
+             batched=True,
+             remove_columns=["idx", "sentence1", "sentence2"],
+         )
+
+     # We also rename the 'label' column to 'labels', which is the name the models of the
+     # transformers library expect for the labels
+     tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
+
+     def collate_fn(examples):
+         # On TPU it's best to pad everything to the same length or training will be very slow.
+         max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
+         # When using mixed precision we want round multiples of 8/16
+         if accelerator.mixed_precision != "no":
+             pad_to_multiple_of = 8
+         else:
+             pad_to_multiple_of = None
+
+         return tokenizer.pad(
+             examples,
+             padding="longest",
+             max_length=max_length,
+             pad_to_multiple_of=pad_to_multiple_of,
+             return_tensors="pt",
+         )
+
+     # Instantiate dataloaders.
+     train_dataloader = DataLoader(
+         tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size, drop_last=True
+     )
+     eval_dataloader = DataLoader(
+         tokenized_datasets["validation"],
+         shuffle=False,
+         collate_fn=collate_fn,
+         batch_size=32,
+         drop_last=(accelerator.mixed_precision == "fp8"),
+     )
+
+     return train_dataloader, eval_dataloader
+
+
+ def training_function(config):
+     # Initialize accelerator
+     accelerator = Accelerator(
+         mixed_precision="fp16",
+         log_with="aim",
+         project_dir="aim_logs",
+     )
+     # Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
+     lr = config["lr"]
+     num_epochs = int(config["num_epochs"])
+     seed = int(config["seed"])
+     batch_size = 16 if accelerator.num_processes > 1 else 32
+     config["batch_size"] = batch_size
+     metric = evaluate.load("glue", "mrpc")
+
+     set_seed(seed, device_specific=True)
+     train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)
+     model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", return_dict=True)
+     # Scale the learning rate by the number of processes
+     lr = lr * accelerator.num_processes
+
+     optimizer = AdamW(params=model.parameters(), lr=lr)
+     lr_scheduler = get_linear_schedule_with_warmup(
+         optimizer=optimizer,
+         num_warmup_steps=0,
+         num_training_steps=(len(train_dataloader) * num_epochs),
+     )
+
+     model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
+         model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
+     )
+
+     accelerator.init_trackers(f"{accelerator.num_processes}_gpus", config)
+
+     current_step = 0
+     for epoch in range(num_epochs):
+         model.train()
+         total_loss = 0
+         for _, batch in enumerate(train_dataloader):
+             lr = lr_scheduler.get_last_lr()[0]
+             outputs = model(**batch)
+             loss = outputs.loss
+             batch_loss = accelerator.gather(loss).detach().mean().cpu().float()
+             total_loss += batch_loss
+             current_step += 1
+             accelerator.log(
+                 {
+                     "batch_loss": batch_loss,
+                     "learning_rate": lr,
+                 },
+                 step=current_step,
+                 log_kwargs={"aim": {"epoch": epoch}},
+             )
+             accelerator.backward(loss)
+             optimizer.step()
+             lr_scheduler.step()
+             optimizer.zero_grad()
+
+         model.eval()
+         for step, batch in enumerate(eval_dataloader):
+             # We could avoid this line since we set the accelerator with `device_placement=True`.
+             batch.to(accelerator.device)
+             with torch.no_grad():
+                 outputs = model(**batch)
+             predictions = outputs.logits.argmax(dim=-1)
+             predictions, references = accelerator.gather_for_metrics((predictions, batch["labels"]))
+             metric.add_batch(
+                 predictions=predictions,
+                 references=references,
+             )
+
+         eval_metric = metric.compute()
+
+         # Use accelerator.print to print only on the main process.
+         accelerator.print(f"epoch {epoch}:", eval_metric)
+
+         accelerator.log(
+             {
+                 "accuracy": eval_metric["accuracy"],
+                 "f1": eval_metric["f1"],
+                 "train_loss": total_loss.item() / len(train_dataloader),
+             },
+             log_kwargs={"aim": {"epoch": epoch}},
+         )
+     accelerator.end_training()
+
+
+ def main():
+     config = {"lr": 2e-5, "num_epochs": 3, "seed": 42}
+     training_function(config)
+
+
+ if __name__ == "__main__":
+     main()
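
For reference, one possible way to launch the three scripts for a side-by-side check (assuming a machine with 2 GPUs; adjust the process counts to the hardware):

    python scripts/core_example_single_gpu.py
    torchrun --nproc_per_node=2 scripts/core_example_multigpu.py
    accelerate launch --num_processes=2 scripts/nlp_example.py

The `aim` tracker used in `nlp_example.py` (`log_with="aim"`) also expects the `aim` package to be installed in the environment.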