base_model: meta-llama/Llama-2-7b-chat-hf
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
load_in_8bit: false
load_in_4bit: true
strict: false

datasets:
  - path: ../upsampled_train.json
    ds_type: json
    type: alpaca
    split: train
test_datasets:
  - path: ../val.json
    ds_type: json
    type: alpaca
    split: train
load_best_model_at_end: false
early_stopping_patience:
dataset_prepared_path:
val_set_size: 0
output_dir: ./qlora-out-llama2-balance-1st

adapter: qlora
lora_model_dir:

sequence_len: 4096
sample_packing: false
pad_to_sequence_len: false

lora_r: 32
lora_alpha: 16
lora_dropout: 0.15
lora_target_linear: true
lora_fan_in_fan_out:
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project: fine-tune-sal
wandb_entity:
wandb_watch:
wandb_run_id:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 1
num_epochs: 5
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0001

train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 10
eval_steps: 0.05
eval_table_size:
eval_table_max_new_tokens: 128
save_steps:
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"
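
# Note: both JSON datasets above are loaded with `type: alpaca`, so each record
# is expected to follow the standard Alpaca schema (instruction / input / output).
# A minimal sketch of one record in ../upsampled_train.json; the field names are
# the Alpaca format, but the example content is an illustrative assumption:
#
#   {
#     "instruction": "Classify the sentiment of the following review.",
#     "input": "The battery life is far better than advertised.",
#     "output": "positive"
#   }
#
# Training is typically started by pointing axolotl's trainer at this file,
# e.g. `accelerate launch -m axolotl.cli.train this_config.yml` (the invocation
# documented for axolotl; adjust to the CLI of your installed version).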