muellerzr committed
Commit d091751
1 Parent(s): 52eaca3

Move zero_grad
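Moving `optimizer.zero_grad()` from the top of the loop to the end is equivalent per iteration in standard PyTorch, since gradients only have to be cleared before the next `backward()` call. A minimal sketch of the ordering every sample below adopts (`loss_function` and the prepared objects are the samples' own names):

<pre>
model.train()
for batch in train_dataloader:
    inputs, targets = batch
    outputs = model(inputs)
    loss = loss_function(outputs, targets)
    accelerator.backward(loss)  # backward pass routed through Accelerate
    optimizer.step()            # apply the accumulated gradients
    scheduler.step()            # then advance the learning-rate schedule
    optimizer.zero_grad()       # clear gradients only after stepping
</pre>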

code_samples/base/accelerate CHANGED
@@ -7,11 +7,11 @@ train_dataloader, model, optimizer, scheduler = accelerator.prepare(
 
 model.train()
 for batch in train_dataloader:
-    optimizer.zero_grad()
     inputs, targets = batch
     outputs = model(inputs)
     loss = loss_function(outputs, targets)
     accelerator.backward(loss)
     optimizer.step()
     scheduler.step()
+    optimizer.zero_grad()
 </pre>

code_samples/base/basic CHANGED
@@ -7,7 +7,6 @@
 +)
 
 for batch in dataloader:
-    optimizer.zero_grad()
     inputs, targets = batch
     - inputs = inputs.to(device)
     - targets = targets.to(device)
@@ -16,7 +15,8 @@ for batch in dataloader:
     - loss.backward()
     + accelerator.backward(loss)
     optimizer.step()
-    scheduler.step()</pre>
+    scheduler.step()
+    optimizer.zero_grad()</pre>
 ##
 Everything around `accelerate` occurs with the `Accelerator` class. To use it, first make an object.
 Then call `.prepare` passing in the PyTorch objects that you would normally train with. This will
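That explanation is truncated by the diff's hunk context; as a minimal sketch of the setup it describes, using the public `accelerate` API (the training objects are placeholders for whatever you would normally build):

<pre>
from accelerate import Accelerator

accelerator = Accelerator()
# Hand .prepare the usual PyTorch objects; it wraps them for the
# current device and any distributed configuration.
dataloader, model, optimizer, scheduler = accelerator.prepare(
    dataloader, model, optimizer, scheduler
)
</pre>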

code_samples/base/calculating_metrics CHANGED
@@ -11,7 +11,6 @@ import evaluate
 +)
 metric = evaluate.load("accuracy")
 for batch in train_dataloader:
-    optimizer.zero_grad()
     inputs, targets = batch
     - inputs = inputs.to(device)
     - targets = targets.to(device)
@@ -20,6 +19,7 @@ for batch in train_dataloader:
     loss.backward()
     optimizer.step()
     scheduler.step()
+    optimizer.zero_grad()
 
 model.eval()
 for batch in eval_dataloader:
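The evaluation half of this sample is cut off by the hunk context. As a hedged sketch of how such a loop typically finishes with the `evaluate` API and Accelerate's `gather_for_metrics` (assumes `torch` is imported; `targets` is the sample's own name):

<pre>
model.eval()
for batch in eval_dataloader:
    inputs, targets = batch
    with torch.no_grad():
        outputs = model(inputs)
    predictions = outputs.argmax(dim=-1)
    # Gather across processes, dropping samples the distributed
    # sampler duplicated to pad the final batch.
    predictions, references = accelerator.gather_for_metrics((predictions, targets))
    metric.add_batch(predictions=predictions, references=references)
print(metric.compute())
</pre>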

code_samples/base/checkpointing CHANGED
@@ -7,13 +7,13 @@ dataloader, model, optimizer, scheduler = accelerator.prepare(
 )
 
 for batch in dataloader:
-    optimizer.zero_grad()
     inputs, targets = batch
     outputs = model(inputs)
     loss = loss_function(outputs, targets)
     accelerator.backward(loss)
     optimizer.step()
     scheduler.step()
+    optimizer.zero_grad()
 +accelerator.save_state("checkpoint_dir")
 +accelerator.load_state("checkpoint_dir")</pre>
 ##
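A hedged usage sketch of the two calls this sample adds; `train_one_epoch` and the checkpoint paths are hypothetical placeholders:

<pre>
for epoch in range(num_epochs):
    train_one_epoch()  # hypothetical helper standing in for the loop above
    # Persist model, optimizer, scheduler, and RNG states together.
    accelerator.save_state(f"checkpoints/epoch_{epoch}")

# To resume, restore everything from the chosen directory:
accelerator.load_state("checkpoints/epoch_2")
</pre>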

code_samples/base/experiment_tracking CHANGED
@@ -9,7 +9,6 @@ train_dataloader, model, optimizer, scheduler = accelerator.prepare(
 +accelerator.init_trackers()
 model.train()
 for batch in train_dataloader:
-    optimizer.zero_grad()
     inputs, targets = batch
     outputs = model(inputs)
     loss = loss_function(outputs, targets)
@@ -17,6 +16,7 @@ for batch in train_dataloader:
     accelerator.backward(loss)
     optimizer.step()
     scheduler.step()
+    optimizer.zero_grad()
 +accelerator.end_training()
 </pre>
 ##
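The two tracking calls are standard `Accelerator` methods. A hedged end-to-end sketch, assuming a recent `accelerate` version with a TensorBoard backend; the project name, `loss`, and `step` are placeholders:

<pre>
from accelerate import Accelerator

accelerator = Accelerator(log_with="tensorboard", project_dir="runs")
accelerator.init_trackers("my_project")  # project name is a placeholder

# ...inside the training loop, log whatever scalars you track:
accelerator.log({"train_loss": loss.item()}, step=step)

# ...after training, flush and close every tracker:
accelerator.end_training()
</pre>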

code_samples/base/gradient_accumulation CHANGED
@@ -10,13 +10,13 @@ dataloader, model, optimizer, scheduler = accelerator.prepare(
 
 for batch in dataloader:
 + with accelerator.accumulate(model):
-      optimizer.zero_grad()
       inputs, targets = batch
       outputs = model(inputs)
       loss = loss_function(outputs, targets)
       accelerator.backward(loss)
       optimizer.step()
-      scheduler.step()</pre>
+      scheduler.step()
+      optimizer.zero_grad()</pre>
 
 ##
 When performing gradient accumulation in a distributed setup, there are many opportunities for efficiency mistakes
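A hedged sketch of the full pattern this sample builds toward, using the real `accumulate` context manager; `gradient_accumulation_steps=2` is an arbitrary illustrative value:

<pre>
accelerator = Accelerator(gradient_accumulation_steps=2)

for batch in dataloader:
    # Under accumulate(), gradient synchronization happens only on
    # accumulation boundaries, and the prepared optimizer skips its
    # step (and zero_grad) on the in-between batches.
    with accelerator.accumulate(model):
        inputs, targets = batch
        outputs = model(inputs)
        loss = loss_function(outputs, targets)
        accelerator.backward(loss)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
</pre>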

code_samples/base/initial CHANGED
@@ -1,6 +1,5 @@
 <pre>
 for batch in dataloader:
-    optimizer.zero_grad()
     inputs, targets = batch
     inputs = inputs.to(device)
     targets = targets.to(device)
@@ -8,4 +7,5 @@ for batch in dataloader:
     loss = loss_function(outputs, targets)
     loss.backward()
     optimizer.step()
-    scheduler.step()</pre>
+    scheduler.step()
+    optimizer.zero_grad()</pre>

code_samples/base/initial_with_metrics CHANGED
@@ -2,7 +2,6 @@
 import evaluate
 metric = evaluate.load("accuracy")
 for batch in train_dataloader:
-    optimizer.zero_grad()
     inputs, targets = batch
     inputs = inputs.to(device)
     targets = targets.to(device)
@@ -11,6 +10,7 @@ for batch in train_dataloader:
     loss.backward()
     optimizer.step()
     scheduler.step()
+    optimizer.zero_grad()
 
 model.eval()
 for batch in eval_dataloader: