Part of the Mistral-SEC-Models collection (8 items).
This model was trained for a single epoch on 1.6B tokens of SEC (U.S. Securities and Exchange Commission) filings data, Llama-Pro style: as in the Llama-Pro paper, only the newly added transformer blocks were trained, while every original layer was kept frozen.
It is intended for use on SEC filings data.
Training ran on an AWS p5.48xlarge instance (8 × 80 GB H100 GPUs).
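As a rough illustration of what this Llama-Pro-style setup does, here is a minimal sketch with plain transformers (not the actual training script); the indices in NEW_BLOCK_IDS are hypothetical and do not describe the real layout of the expanded checkpoint:

import torch
from transformers import AutoModelForCausalLM

# Hypothetical indices of the newly inserted decoder blocks; the real
# arcee-ai/Mistral-7B-Instruct-v0.2-expanded checkpoint defines its own layout.
NEW_BLOCK_IDS = {8, 17, 26, 35}

model = AutoModelForCausalLM.from_pretrained(
    "arcee-ai/Mistral-7B-Instruct-v0.2-expanded",
    torch_dtype=torch.bfloat16,
)

# Freeze every parameter first ...
for param in model.parameters():
    param.requires_grad = False

# ... then unfreeze only the newly added blocks, as in Llama-Pro.
for idx, layer in enumerate(model.model.layers):
    if idx in NEW_BLOCK_IDS:
        for param in layer.parameters():
            param.requires_grad = True

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters: {trainable / 1e6:.1f}M")

The full YAML configuration used for the actual run follows.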
# Model arguments
model_name_or_path: arcee-ai/Mistral-7B-Instruct-v0.2-expanded
model_revision: main
torch_dtype: bfloat16
# Data training arguments
dataset_mixer:
  arcee-ai/sec-data-full: 1.0
dataset_splits:
- train
preprocessing_num_workers: 12
# SFT trainer config
bf16: true
do_eval: False
evaluation_strategy: "no"
gradient_accumulation_steps: 32
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: False
hub_model_id: arcee-ai/mistral-instruct-v2-sec-expanded
hub_strategy: every_save
learning_rate: 2.0e-05
log_level: info
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: cosine
max_seq_length: 2048
max_steps: -1
num_train_epochs: 1
output_dir: data/mistral-instruct-v2-sec-expanded-new
overwrite_output_dir: true
per_device_eval_batch_size: 1
per_device_train_batch_size: 16
push_to_hub: true
remove_unused_columns: true
report_to:
- wandb
save_strategy: "steps"
save_steps: 100
save_total_limit: 1
seed: 42
warmup_ratio: 0.01
# ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=8 scripts/run_cpt.py recipes/gpt2-nl/cpt/config_full.yaml
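With per_device_train_batch_size: 16, gradient_accumulation_steps: 32 and 8 GPUs, the effective global batch size is 16 × 32 × 8 = 4096 sequences of up to max_seq_length 2048 tokens per optimizer step.

Once the run has pushed the checkpoint to the hub_model_id above, it loads like any Mistral-Instruct model. A minimal inference sketch, assuming the repository keeps the Mistral-Instruct chat template (the prompt is purely illustrative):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "arcee-ai/mistral-instruct-v2-sec-expanded"  # hub_model_id from the config above

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)

# Illustrative SEC-style question; not taken from the training data.
messages = [{"role": "user", "content": "Summarize the typical risk factors section of a 10-K filing."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

output_ids = model.generate(input_ids, max_new_tokens=256)
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))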