{
  "experiment_key": "base",
  "save_safetensors": true,
  "max_shard_size": "10GB",
  "local_rank": 0,
  "use_gradient_checkpointing": true,
  "trainer_key": "lm",
  "force_fp32": false,
  "force_fp16": false,
  "from_gptq": false,
  "huggingface_hub_token": null,
  "single_gpu": null,
  "master_port": 9994,
  "deepspeed_stage": null,
  "deepspeed_config_path": null,
  "fsdp_strategy": "",
  "fsdp_offload": true,
  "seed": 42,
  "stabilize": false,
  "norm_fp32": false,
  "path_to_env_file": "./.env",
  "prepare_dataset": true,
  "lora_hub_model_id": null,
  "lora_model_local_path": null,
  "fused_model_local_path": null,
  "fuse_after_training": false,
  "quantization_dataset_id": null,
  "quantization_max_samples": 1024,
  "quantized_model_path": "./quantized_model/",
  "quantized_hub_model_id": null,
  "quantized_hub_private_repo": true,
  "dataset_key": "soda",
  "train_local_path_to_data": "./train.jsonl",
  "eval_local_path_to_data": null,
  "shuffle": true,
  "max_eval_samples": 1000,
  "add_eval_to_train_if_no_path": false,
  "tokenizer_name_or_path": null,
  "tokenizer_use_fast": null,
  "tokenizer_padding_side": null,
  "collator_key": "lm",
  "max_length": 2048,
  "model_name_or_path": "bn22/Mistral-7B-Instruct-v0.1-sharded",
  "push_to_hub_bos_add_bos_token": false,
  "use_flash_attention_2": false,
  "trust_remote_code": false,
  "device_map": null,
  "prepare_model_for_kbit_training": true,
  "offload_folder": null,
  "load_in_8bit": false,
  "load_in_4bit": true,
  "llm_int8_threshold": 6.0,
  "llm_int8_has_fp16_weight": true,
  "bnb_4bit_use_double_quant": true,
  "bnb_4bit_quant_type": "nf4",
  "bnb_quantize_after_model_init": false,
  "gptq_bits": 4,
  "gptq_group_size": 128,
  "gptq_disable_exllama": true,
  "apply_lora": true,
  "lora_rank": 8,
  "lora_alpha": 32,
  "lora_dropout": 0.1,
  "raw_lora_target_modules": "all",
  "output_dir": "./outputs/",
  "per_device_train_batch_size": 2,
  "do_eval": false,
  "per_device_eval_batch_size": null,
  "gradient_accumulation_steps": 2,
  "eval_accumulation_steps": null,
  "eval_delay": 0,
  "eval_steps": 1000,
  "warmup_steps": 5,
  "max_steps": 100,
  "num_train_epochs": 1,
  "learning_rate": 0.0002,
  "max_grad_norm": 1.0,
  "weight_decay": 0.001,
  "label_smoothing_factor": 0.0,
  "logging_steps": 1,
  "save_steps": 10,
  "save_total_limit": 1,
  "optim": "paged_adamw_8bit",
  "push_to_hub": true,
  "hub_model_id": "TachyHealthResearch/Thealth-Mistral",
  "hub_private_repo": false,
  "neftune_noise_alpha": null,
  "project_name": null,
  "report_to_wandb": false,
  "wandb_api_key": null,
  "wandb_project": null,
  "wandb_entity": null
}