cache_dir: ./cache
ddp_find_unused_parameters: false
ddp_timeout: 30000
device_map: auto
do_eval: true
do_train: true
eval_steps: 500
evaluation_strategy: steps
fp16: true
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: hllj/sft-zephyr-7b-beta-v2
hub_strategy: every_save
learning_rate: 5.0e-05
log_level: info
logging_first_step: true
logging_steps: 10
logging_strategy: steps
lora_alpha: 128
lora_dropout: 0.05
lora_r: 256
lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
lr_scheduler_type: cosine
max_seq_length: 512
model_name_or_path: hllj/zephyr-7b-beta-vi-math
model_type: auto
num_train_epochs: 2
output_dir: outputs-sft-zephyr-beta-v2
overwrite_output_dir: true
per_device_eval_batch_size: 4
per_device_train_batch_size: 4
preprocessing_num_workers: 4
push_to_hub: true
report_to: wandb
run_name: sft-zephyr-7b-beta-v2
save_steps: 500
save_strategy: steps
save_total_limit: 13
seed: 42
torch_dtype: float16
train_file_dir: datasets/finetune
use_peft: true
warmup_ratio: 0.05
weight_decay: 0.05
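The keys above follow the argument names used by common supervised fine-tuning scripts built on transformers + peft (LoRA on the q/k/v/o attention projections, fp16 training, cosine schedule with 5% warmup). The actual training script is not shown here, so the snippet below is only a minimal sketch of how these keys could be mapped onto `LoraConfig` and `TrainingArguments`; the file name `config_sft.yaml` and the loader itself are assumptions for illustration, not the repository's real entry point.

```python
# Minimal sketch (assumption): map the YAML config above onto Hugging Face
# TrainingArguments + peft LoraConfig. "config_sft.yaml" is a hypothetical
# path; this is not the repository's actual training script.
import yaml
import torch
from transformers import AutoModelForCausalLM, TrainingArguments
from peft import LoraConfig, get_peft_model

with open("config_sft.yaml") as f:  # hypothetical path to the config above
    cfg = yaml.safe_load(f)

# LoRA adapter settings: r=256, alpha=128, dropout=0.05 on q/k/v/o projections.
lora_config = LoraConfig(
    r=cfg["lora_r"],
    lora_alpha=cfg["lora_alpha"],
    lora_dropout=cfg["lora_dropout"],
    target_modules=cfg["lora_target_modules"],
    task_type="CAUSAL_LM",
)

# Optimizer, schedule, evaluation, and logging settings from the same file.
training_args = TrainingArguments(
    output_dir=cfg["output_dir"],
    num_train_epochs=cfg["num_train_epochs"],
    per_device_train_batch_size=cfg["per_device_train_batch_size"],
    per_device_eval_batch_size=cfg["per_device_eval_batch_size"],
    gradient_accumulation_steps=cfg["gradient_accumulation_steps"],
    learning_rate=cfg["learning_rate"],
    lr_scheduler_type=cfg["lr_scheduler_type"],
    warmup_ratio=cfg["warmup_ratio"],
    weight_decay=cfg["weight_decay"],
    fp16=cfg["fp16"],
    gradient_checkpointing=cfg["gradient_checkpointing"],
    evaluation_strategy=cfg["evaluation_strategy"],
    eval_steps=cfg["eval_steps"],
    save_strategy=cfg["save_strategy"],
    save_steps=cfg["save_steps"],
    save_total_limit=cfg["save_total_limit"],
    logging_steps=cfg["logging_steps"],
    seed=cfg["seed"],
    report_to=cfg["report_to"],
    run_name=cfg["run_name"],
)

# Base model in float16, wrapped with the LoRA adapters.
model = AutoModelForCausalLM.from_pretrained(
    cfg["model_name_or_path"],
    torch_dtype=torch.float16,
    device_map=cfg["device_map"],
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
```

With `per_device_train_batch_size: 4` and `gradient_accumulation_steps: 1`, the effective batch size is 4 per GPU (times the number of GPUs under DDP); checkpoints are written every 500 steps and pushed to the `hllj/sft-zephyr-7b-beta-v2` Hub repo on every save.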