# Fine-tuning configuration (keys follow the Axolotl config schema).
# Keys written with no value (`key:`) parse as null and are kept in that
# form on purpose, matching the convention used throughout this file.

# Model and tokenizer
base_model: /media/renfroe/llms/SmolLM-360M/
model_type: LlamaForCausalLM
tokenizer_type: GPT2Tokenizer

seed: 122887

load_in_8bit: false
load_in_4bit: false
strict: false

max_steps: 0
resume_from_checkpoint:

# Training data: a mix of question/answer JSON files (custom prompt format
# declared inline under `type:`) and sharegpt-format conversation files.
datasets:
  - path: /home/renfroe/Desktop/sqa_tiny-llama_dataset/Dynamic_Optimization_Methods_with_Applications_sqa_answers_only.json
    type:
      field_instruction: question
      field_output: answer
      format: "<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"
      no_input_format: "<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"
  - path: /home/renfroe/Dev/tinyllama-models/dataset/open_hermes_top_tech.json
    type: sharegpt
  - path: /home/renfroe/Desktop/sqa_tiny-llama_dataset/hermes_prior_knowledge_question_expansion_with_answers.json
    type:
      field_instruction: question
      field_output: answer
      format: "<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"
      no_input_format: "<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"
  # NOTE(review): this entry is identical to the previous one (same path and
  # settings). Presumably intentional up-weighting of this file; if it is a
  # copy/paste leftover, delete one of the two entries. TODO confirm.
  - path: /home/renfroe/Desktop/sqa_tiny-llama_dataset/hermes_prior_knowledge_question_expansion_with_answers.json
    type:
      field_instruction: question
      field_output: answer
      format: "<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"
      no_input_format: "<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"
  - path: /home/renfroe/Desktop/sqa_tiny-llama_dataset/or-farm_sharegpt.json
    type: sharegpt

dataset_prepared_path:
val_set_size: 0.2
output_dir: ./SmolLM-Ora

auto_resume_from_checkpoints: false

sequence_len: 2048
sample_packing: true

chat_template: chatml

# Weights & Biases logging
wandb_project: SmolLM-Ora
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

# Batch size, schedule and optimizer
gradient_accumulation_steps: 1
micro_batch_size: 10
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: linear
weight_decay: 0.0000001
learning_rate: 0.0001
lr_scheduler_kwargs:
  # num_cycles: 3

train_on_inputs: false
group_by_length: false

# Numeric precision
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
# `resume_from_checkpoint` used to be repeated here with the same empty
# value. Duplicate mapping keys are invalid YAML (most parsers silently keep
# the last one), so the key is now declared only once, near the top.
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
eval_sample_packing: false

warmup_steps: 50
evals_per_epoch: 4
eval_table_size:
saves_per_epoch: 4

debug:
deepspeed:
fsdp:
fsdp_config:

# All three special tokens are mapped to the same "<|endoftext|>" string.
special_tokens:
  bos_token: "<|endoftext|>"
  eos_token: "<|endoftext|>"
  pad_token: "<|endoftext|>"