|
--- |
|
license: apache-2.0 |
|
base_model: allura-org/TQ2.5-14B-Neon-v1 |
|
language: |
|
- en |
|
library_name: transformers |
|
pipeline_tag: text-generation |
|
tags: |
|
- llama-cpp |
|
- gguf-my-repo |
|
--- |
|
|
|
# Triangle104/TQ2.5-14B-Neon-v1-Q4_K_S-GGUF |
|
This model was converted to GGUF format from [`allura-org/TQ2.5-14B-Neon-v1`](https://huggingface.co/allura-org/TQ2.5-14B-Neon-v1) using llama.cpp via ggml.ai's [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space. |
|
Refer to the [original model card](https://huggingface.co/allura-org/TQ2.5-14B-Neon-v1) for more details on the model. |
|
|
|
--- |
|
Model details: |
|
- |
|
RP finetune of Supernova-Medius. Turned out surprisingly nice on its own, I honestly made it only as merge fuel, but it impressed me and Prodeus enough to release it separately (history repeats, I guess; Sugarquill also started out this way). Quite interesting prose, definitely quite distinct from Supernova or EVA for that matter. Instruction following is decent as well. Not really much to say about this one, just a decent RP model, tbh. Euryale-inspired I guess. |
|
|
|
Model was trained by Auri. |
|
|
|
Training notes |
|
|
|
Model was trained on a dataset consisting of 77M tokens of synthetic RP and short story gen data. Training took around 2 hours on an 8xH100 SXM node. Training config was more or less reused from Sugarquill, and it worked fairly well again. Had the node crash after finishing the training and merging in the LoRA, so I had to merge it with MergeKit on a separate node; otherwise everything was smooth. |
|
|
|
Huge thanks to Retis Labs for sponsoring this run! |
|
|
|
Format |
|
|
|
Model responds to ChatML instruct formatting, exactly like its base model. |
|
|
|
<|im_start|>system |
|
{system message}<|im_end|> |
|
<|im_start|>user |
|
{user message}<|im_end|> |
|
<|im_start|>assistant |
|
{response}<|im_end|> |
|
|
|
Recommended Samplers |
|
|
|
My classic stable Qwen setup works quite well: |
|
|
|
Temperature - 0.8 |
|
Min-P - 0.05 |
|
Top-A - 0.3 |
|
Repetition Penalty - 1.03 |
|
|
|
Training config |
|
See Axolotl config |
|
|
|
axolotl version 0.6.0 |
|
|
|
# Model |
|
base_model: arcee-ai/SuperNova-Medius |
|
strict: false |
|
|
|
# Liger Kernels (optimization) |
|
plugins: |
|
- axolotl.integrations.liger.LigerPlugin |
|
liger_rope: true |
|
liger_rms_norm: true |
|
liger_swiglu: true |
|
liger_fused_linear_cross_entropy: true |
|
|
|
# Output and HuggingFace |
|
output_dir: /workspace/axolotl/TQ-2.5-14B-Neon |
|
hub_model_id: allura-org/TQ-2.5-14B-Neon-LoRA |
|
hf_use_auth_token: true |
|
hub_strategy: "all_checkpoints" |
|
|
|
# WandB |
|
wandb_project: allura-org |
|
wandb_entity: |
|
wandb_name: TQ-2.5-14B-Neon-1 |
|
|
|
# Data |
|
chat_template: chatml |
|
#train_on_inputs: false |
|
group_by_length: false |
|
datasets: |
|
- path: allura-org/neon-41k |
|
type: chat_template |
|
field_messages: conversations |
|
message_field_role: from |
|
message_field_content: value |
|
|
|
## Evaluation |
|
val_set_size: 0.01 |
|
evals_per_epoch: 4 |
|
eval_table_size: |
|
eval_max_new_tokens: 128 |
|
|
|
# Technical aspects |
|
sequence_len: 16384 |
|
save_safetensors: true |
|
saves_per_epoch: 2 |
|
logging_steps: 1 |
|
special_tokens: |
|
|
|
# Quantization |
|
bf16: auto |
|
fp16: |
|
tf32: false |
|
## For LoRA |
|
load_in_8bit: false |
|
load_in_4bit: false |
|
|
|
# LoRA |
|
peft_use_rslora: true |
|
peft_use_dora: false # better but slower |
|
adapter: lora # lora or qlora |
|
lora_model_dir: |
|
lora_r: 64 # 64 is optimal for most trains on instruct |
|
lora_alpha: 32 |
|
lora_dropout: 0.1 |
|
lora_target_linear: true |
|
lora_fan_in_fan_out: |
|
lora_target_modules: |
|
# - embed_tokens |
|
# - lm_head |
|
|
|
#loraplus_lr_ratio: 8 # works to converge faster but is kinda cancer bc makes model unstable |
|
#loraplus_lr_embedding: |
|
|
|
# Training hyperparameters |
|
# max_steps: |
|
num_epochs: 2 |
|
|
|
# Anti Overfit and Stability |
|
weight_decay: 0.01 |
|
max_grad_norm: 1.0 |
|
|
|
## Learning Rate |
|
warmup_ratio: 0.05 |
|
learning_rate: 0.00003 |
|
lr_scheduler: cosine |
|
#lr_scheduler_kwargs: |
|
# min_lr: 0.0000024 |
|
optimizer: paged_ademamix_8bit # usually adamw_torch or paged_adamw_8bit |
|
|
|
## Batch Size |
|
gradient_accumulation_steps: 4 # More effective batch size - stabler train, usually. MBS also speeds it up. |
|
micro_batch_size: 4 # Batch size per gpu = micro_batch_size * gradient_accumulation_steps |
|
eval_batch_size: 1 |
|
|
|
# Optimizations |
|
pad_to_sequence_len: true |
|
sample_packing: true |
|
eval_sample_packing: false |
|
flash_attention: true |
|
xformers_attention: |
|
gradient_checkpointing: "unsloth" |
|
gradient_checkpointing_kwargs: |
|
use_reentrant: true |
|
local_rank: |
|
deepspeed: /workspace/axolotl/deepspeed_configs/zero3_bf16.json # Only use with multi gpu # _bf16_cpuoffload_all |
|
# fsdp: |
|
# - full_shard |
|
# - auto_wrap |
|
# fsdp_config: |
|
# fsdp_limit_all_gathers: true |
|
# fsdp_sync_module_states: true |
|
# fsdp_offload_params: true |
|
# fsdp_use_orig_params: false |
|
# fsdp_cpu_ram_efficient_loading: true |
|
# fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP |
|
# fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer |
|
# fsdp_state_dict_type: FULL_STATE_DICT |
|
# fsdp_sharding_strategy: FULL_SHARD |
|
# Misc |
|
early_stopping_patience: |
|
debug: |
|
|
|
--- |
|
## Use with llama.cpp |
|
Install llama.cpp through brew (works on Mac and Linux) |
|
|
|
```bash |
|
brew install llama.cpp |
|
|
|
``` |
|
Invoke the llama.cpp server or the CLI. |
|
|
|
### CLI: |
|
```bash |
|
llama-cli --hf-repo Triangle104/TQ2.5-14B-Neon-v1-Q4_K_S-GGUF --hf-file tq2.5-14b-neon-v1-q4_k_s.gguf -p "The meaning to life and the universe is" |
|
``` |
|
|
|
### Server: |
|
```bash |
|
llama-server --hf-repo Triangle104/TQ2.5-14B-Neon-v1-Q4_K_S-GGUF --hf-file tq2.5-14b-neon-v1-q4_k_s.gguf -c 2048 |
|
``` |
|
|
|
Note: You can also use this checkpoint directly through the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the Llama.cpp repo as well. |
|
|
|
Step 1: Clone llama.cpp from GitHub. |
|
``` |
|
git clone https://github.com/ggerganov/llama.cpp |
|
``` |
|
|
|
Step 2: Move into the llama.cpp folder and build it with the `LLAMA_CURL=1` flag along with other hardware-specific flags (e.g., `LLAMA_CUDA=1` for Nvidia GPUs on Linux). |
|
``` |
|
cd llama.cpp && LLAMA_CURL=1 make |
|
``` |
|
|
|
Step 3: Run inference through the main binary. |
|
``` |
|
./llama-cli --hf-repo Triangle104/TQ2.5-14B-Neon-v1-Q4_K_S-GGUF --hf-file tq2.5-14b-neon-v1-q4_k_s.gguf -p "The meaning to life and the universe is" |
|
``` |
|
or |
|
``` |
|
./llama-server --hf-repo Triangle104/TQ2.5-14B-Neon-v1-Q4_K_S-GGUF --hf-file tq2.5-14b-neon-v1-q4_k_s.gguf -c 2048 |
|
``` |
|
|