pere committed on
Commit a8c4f2a
1 Parent(s): 2c614c4

Update run_mlm_flax.py

Files changed (1)
  1. run_mlm_flax.py +13 -1
run_mlm_flax.py CHANGED
@@ -687,7 +687,18 @@ def main():
     per_device_eval_batch_size = training_args.per_device_eval_batch_size
     eval_batch_size = per_device_eval_batch_size * local_device_count
 
+    # Calculate Global Batch Sizes
+    global_train_batch_size = train_batch_size * jax.process_count()
+    global_eval_batch_size = eval_batch_size * jax.process_count()
+
+    # Log Batch Sizes
+    logger.info(f"Per-process train batch size: {train_batch_size}")
+    logger.info(f"Global train batch size: {global_train_batch_size}")
+    logger.info(f"Per-process eval batch size: {per_device_eval_batch_size}")
+    logger.info(f"Global eval batch size: {global_eval_batch_size}")
+
     num_train_steps = (len(tokenized_datasets["train"]) // (train_batch_size * jax.process_count())) * num_epochs
+    logger.info(f"Number of training steps: {num_train_steps}")
 
     # Create learning rate schedule
     warmup_fn = optax.linear_schedule(
@@ -817,7 +828,8 @@ def main():
 
         train_samples_idx = np.arange(num_train_samples)
         train_samples_idx = np.random.permutation(train_samples_idx)
-        # Split the training indices across processes train_samples_idx = np.array_split(train_samples_idx, jax.process_count())[jax.process_index()]
+        # Split the training indices across processes
+        train_samples_idx = np.array_split(train_samples_idx, jax.process_count())[jax.process_index()]
         train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size, drop_last=True)
 
         # Gather the indexes for creating the batch and do a training step
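
For context on the first hunk: a per-process batch covers only the devices attached to the local host, so the global batch size is the per-device size multiplied first by the local device count and then by the process (host) count. A minimal sketch of that arithmetic, with assumed counts (the batch size of 16, the 8-device host, and the 4-host setup below are hypothetical, not values from this commit):

import jax

# Assumed setup (hypothetical): 4 hosts, each with 8 local accelerators.
per_device_eval_batch_size = 16  # would come from training_args

# Per-process size: devices on this host only, e.g. 16 * 8 = 128.
eval_batch_size = per_device_eval_batch_size * jax.local_device_count()

# Global size: all hosts in the run, e.g. 128 * 4 = 512.
global_eval_batch_size = eval_batch_size * jax.process_count()

# On a single-host, single-device run both counts are 1 and all sizes coincide.
print(eval_batch_size, global_eval_batch_size)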
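
The sharding pattern in the second hunk gives each JAX process a disjoint slice of the shuffled index array; the slices stay disjoint only if every process applies the same permutation, i.e. shares the NumPy RNG seed. Below is a standalone NumPy sketch of the effect, simulating the processes in a loop; process_count, num_train_samples, and the inlined batch splitting are stand-ins for jax.process_count(), the tokenized dataset length, and the script's generate_batch_splits helper:

import numpy as np

# Hypothetical run: 4 processes, 50 samples, per-process batch size 4.
process_count = 4
num_train_samples = 50
train_batch_size = 4

# Every process must shuffle with the same seed, or the shards would overlap.
rng = np.random.default_rng(0)
train_samples_idx = rng.permutation(np.arange(num_train_samples))

for process_index in range(process_count):
    # Mirrors np.array_split(...)[jax.process_index()] from the diff:
    # each process keeps only its own shard of the shuffled indices.
    shard = np.array_split(train_samples_idx, process_count)[process_index]

    # Stand-in for generate_batch_splits(shard, train_batch_size, drop_last=True):
    # drop the trailing partial batch, then cut the rest into full batches.
    num_full = len(shard) // train_batch_size * train_batch_size
    batches = np.split(shard[:num_full], num_full // train_batch_size) if num_full else []

    print(f"process {process_index}: {len(shard)} samples -> {len(batches)} batches")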