pere committed
Commit af7221f
Parent: 073e1d8

Update run_mlm_flax.py

Files changed (1)
  1. run_mlm_flax.py +8 -7
run_mlm_flax.py CHANGED
@@ -679,11 +679,13 @@ def main():
     # Store some constant
     num_epochs = int(training_args.num_train_epochs)
 
-    # Take into account all hosts and all devices for proper global batch size scaling
-    global_device_count = jax.device_count() * jax.process_count()
-    train_batch_size = training_args.per_device_train_batch_size * global_device_count
+    # Use local_device_count for per-process batch size
+    local_device_count = jax.local_device_count()
+
+    # Each process handles per_device_train_batch_size * local_device_count
+    train_batch_size = training_args.per_device_train_batch_size * local_device_count
     per_device_eval_batch_size = training_args.per_device_eval_batch_size
-    eval_batch_size = per_device_eval_batch_size * global_device_count
+    eval_batch_size = per_device_eval_batch_size * local_device_count
 
     num_train_steps = len(tokenized_datasets["train"]) // train_batch_size * num_epochs
 
@@ -816,9 +818,8 @@ def main():
     #train_samples_idx = np.random.permutation(np.arange(num_train_samples))
 
     train_samples_idx = np.random.permutation(train_samples_idx)
-    # Split the training indices across processes
-    train_samples_idx = np.array_split(train_samples_idx, jax.process_count())[jax.process_index()]
-    train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size)
+    # Split the training indices across processes train_samples_idx = np.array_split(train_samples_idx, jax.process_count())[jax.process_index()]
+    train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size, drop_last=True)
 
     # Gather the indexes for creating the batch and do a training step
     for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1)):
 
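For context on the first hunk: in JAX, jax.device_count() already reports the global device total across all processes, so the removed expression jax.device_count() * jax.process_count() counts each process's devices twice on multi-host setups. The new code instead scales the batch by jax.local_device_count(), the devices the current process feeds. A minimal sketch of how the three counts relate (the batch-size value below is hypothetical, not taken from this repo):

import jax

local_devices = jax.local_device_count()  # devices this process feeds
processes = jax.process_count()           # number of host processes
global_devices = jax.device_count()       # total across ALL processes

# On typical homogeneous hosts (e.g. TPU pod slices):
#   global_devices == local_devices * processes
# so multiplying jax.device_count() by jax.process_count()
# brings the process count in twice.

per_device_train_batch_size = 32  # hypothetical value, for illustration only

global_batch_size = per_device_train_batch_size * global_devices  # all hosts
local_batch_size = per_device_train_batch_size * local_devices    # this host

# Holds on homogeneous setups, and trivially on a single host.
assert global_batch_size == local_batch_size * processes

On a single host the two conventions coincide, since local_device_count() equals device_count() and process_count() is 1.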
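The second hunk passes drop_last=True to generate_batch_splits. Note that in the new version the sharding call np.array_split(...) now shares a line with its leading comment, so as written it is commented out and each process batches over the full index permutation. The helper below is a sketch of what a generate_batch_splits accepting a drop_last flag might look like, modeled on the upstream Flax MLM example in transformers; it is an assumption, not code from this commit:

import math
import numpy as np

def generate_batch_splits(samples_idx: np.ndarray, batch_size: int, drop_last: bool = True):
    """Split an index array into batches of batch_size.

    With drop_last=True the trailing partial batch is discarded, so every
    batch divides evenly across the local devices.
    """
    num_samples = len(samples_idx)
    if drop_last:
        samples_to_remove = num_samples % batch_size
        if samples_to_remove != 0:
            samples_idx = samples_idx[:-samples_to_remove]
        sections_split = num_samples // batch_size
        return samples_idx.reshape((sections_split, batch_size))
    sections_split = math.ceil(num_samples / batch_size)
    return np.array_split(samples_idx, sections_split)

# Example: 10 shuffled indices, batch size 4 -> two full batches, 2 indices dropped.
train_samples_idx = np.random.permutation(np.arange(10))
train_batch_idx = generate_batch_splits(train_samples_idx, 4)
assert train_batch_idx.shape == (2, 4)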