frankmorales2020 committed on
Commit
a12a12b
·
verified ·
1 Parent(s): 9464de5

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +10 -15
README.md CHANGED
@@ -68,21 +68,15 @@ args = TrainingArguments(
68
 
69
  output_dir="/content/gdrive/MyDrive/model/NEW-Meta-Llama-3-8B-MEDAL-flash-attention-2-cosine-evaldata",
70
 
71
- #num_train_epochs=3, # number of training epochs
72
- num_train_epochs=1, # number of training epochs for POC
73
  per_device_train_batch_size=2, # batch size per device during training
74
- #2
75
  gradient_accumulation_steps=8, # number of steps before performing a backward/update pass
76
  gradient_checkpointing=True, # use gradient checkpointing to save memory
77
- #gradient_checkpointing_kwargs={"use_reentrant": True},
78
- optim="adamw_torch_fused", # use fused adamw optimizer
79
-
80
- #ELECTRA is trained with Adam optimizer with learning
81
- #rate of 0.00002 and with batch size of 16
82
-
83
- #trainer = Trainer(model=model, args=training_args, train_dataset=ds, optimizers=(adam_bnb_optim, None))
84
- logging_steps=200, # log every 10 steps
85
- #save_strategy="epoch", # save checkpoint every epoch
86
 
87
  learning_rate=2e-4, # learning rate, based on QLoRA paper # i used in the first model
88
  bf16=True, # use bfloat16 precision
@@ -95,15 +89,16 @@ args = TrainingArguments(
95
 
96
  push_to_hub=True, # push model to hub
97
  report_to="tensorboard", # report metrics to tensorboard
 
98
  gradient_checkpointing_kwargs={"use_reentrant": True},
99
 
100
  load_best_model_at_end=True,
101
  logging_dir="/content/gdrive/MyDrive/model/NEW-Meta-Llama-3-8B-MEDAL-flash-attention-2-cosine-evaldata/logs",
102
 
103
- evaluation_strategy="steps", # Evaluate at step intervals
104
  eval_steps=200, # Evaluate every 50 steps
105
- save_strategy="steps", # Save checkpoints at step intervals
106
- save_steps=200, # Save every 50 steps (aligned with eval_steps)
107
  metric_for_best_model = "loss",
108
  ]
109
  )
 
68
 
69
  output_dir="/content/gdrive/MyDrive/model/NEW-Meta-Llama-3-8B-MEDAL-flash-attention-2-cosine-evaldata",
70
 
71
+ #num_train_epochs=3, # number of training epochs
72
+ num_train_epochs=1, # number of training epochs for POC
73
  per_device_train_batch_size=2, # batch size per device during training
74
+
75
  gradient_accumulation_steps=8, # number of steps before performing a backward/update pass
76
  gradient_checkpointing=True, # use gradient checkpointing to save memory
77
+
78
+ optim="adamw_torch_fused", # use fused adamw optimizer
79
+ logging_steps=200, # log every 200 steps
 
 
 
 
 
 
80
 
81
  learning_rate=2e-4, # learning rate, based on QLoRA paper # i used in the first model
82
  bf16=True, # use bfloat16 precision
 
89
 
90
  push_to_hub=True, # push model to hub
91
  report_to="tensorboard", # report metrics to tensorboard
92
+
93
  gradient_checkpointing_kwargs={"use_reentrant": True},
94
 
95
  load_best_model_at_end=True,
96
  logging_dir="/content/gdrive/MyDrive/model/NEW-Meta-Llama-3-8B-MEDAL-flash-attention-2-cosine-evaldata/logs",
97
 
98
+ evaluation_strategy="steps", # Evaluate at step intervals
99
  eval_steps=200, # Evaluate every 50 steps
100
+ save_strategy="steps", # Save checkpoints at step intervals
101
+ save_steps=200, # Save every 200 steps (aligned with eval_steps)
102
  metric_for_best_model = "loss",
103
  ]
104
  )