Saving weights and logs of step 2000

Files changed (5) hide show

flax_model.msgpack CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bfe1b36176c2200bf1451814194ad46ad5a6cde61cfaece83b99f3b30a8b8634
 size 891548548

 version https://git-lfs.github.com/spec/v1
+oid sha256:02c8aedd34c528d3a7806d216941cc23732a751a8d687f8bf1db06eb1e1e75a3
 size 891548548

run_t5.sh CHANGED Viewed

@@ -25,7 +25,7 @@ mkdir -p "${MODEL_DIR}/runs"
     --logging_steps="50" \
     --save_steps="2000" \
     --eval_steps="10000000" \
-    --resume_from_checkpoint="${MODEL_DIR}/ckpt-16000" \
     --warmup_steps="3413" \
     --push_to_hub

     --logging_steps="50" \
     --save_steps="2000" \
     --eval_steps="10000000" \
+    --resume_from_checkpoint="${MODEL_DIR}/ckpt-18000" \
     --warmup_steps="3413" \
     --push_to_hub

run_t5_mlm_flax_custom_dataset.py CHANGED Viewed

@@ -580,7 +580,7 @@ if __name__ == "__main__":
         train, val = train_val_files()
-        load_grouped = False
         if not load_grouped:
             datasets = load_dataset('json', data_files={'train': train, 'validation': val})
@@ -899,8 +899,8 @@ if __name__ == "__main__":
         for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1)):
             cur_step = epoch * (num_train_samples // train_batch_size) + step
             # skip to the step from which we are resuming
-            if cur_step < resume_step:
-                continue
             samples = [tokenized_datasets["train"][int(idx)] for idx in batch_idx]
             model_inputs = data_collator(samples)

         train, val = train_val_files()
+        load_grouped = True
         if not load_grouped:
             datasets = load_dataset('json', data_files={'train': train, 'validation': val})
         for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1)):
             cur_step = epoch * (num_train_samples // train_batch_size) + step
             # skip to the step from which we are resuming
+#            if cur_step < resume_step:
+#                continue
             samples = [tokenized_datasets["train"][int(idx)] for idx in batch_idx]
             model_inputs = data_collator(samples)

runs/Jul11_12-53-41_t1v-n-0e7426e8-w-0/events.out.tfevents.1626008983.t1v-n-0e7426e8-w-0.161493.3.v2 CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ba46ea33a14787fe4fc81fe586551875621a7f46301e0a680643702851773220
-size 300067

 version https://git-lfs.github.com/spec/v1
+oid sha256:03ddeed93b5615c1239be282f05bf781971c8a799be72c9bebc4de1d596fbd63
+size 585827

runs/Jul11_17-06-36_t1v-n-0e7426e8-w-0/events.out.tfevents.1626023202.t1v-n-0e7426e8-w-0.178001.3.v2 ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:0b89824cdb72fe97627209c68074b163e725d00349a36ed38b233e7d579e1b92
+size 296685