Triangle104 committed on
Commit
28630c7
·
verified ·
1 Parent(s): 2ea6558

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +160 -0
README.md CHANGED
@@ -14,6 +14,166 @@ tags:
14
  This model was converted to GGUF format from [`allura-org/TQ2.5-14B-Neon-v1`](https://huggingface.co/allura-org/TQ2.5-14B-Neon-v1) using llama.cpp via ggml.ai's [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space.
15
  Refer to the [original model card](https://huggingface.co/allura-org/TQ2.5-14B-Neon-v1) for more details on the model.
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  ## Use with llama.cpp
18
  Install llama.cpp through brew (works on Mac and Linux)
19
 
 
14
  This model was converted to GGUF format from [`allura-org/TQ2.5-14B-Neon-v1`](https://huggingface.co/allura-org/TQ2.5-14B-Neon-v1) using llama.cpp via ggml.ai's [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space.
15
  Refer to the [original model card](https://huggingface.co/allura-org/TQ2.5-14B-Neon-v1) for more details on the model.
16
 
17
+ ---
18
+ Model details:
19
+ -
20
+ RP finetune of Supernova-Medius. Turned out surprisingly nice on its own, I honestly made it only as merge fuel, but it impressed me and Prodeus enough to release it separately (history repeats I guess, Sugarquill also started out this way). Quite interesting prose, definitely quite distinct from Supernova or EVA for that matter. Instruction following is decent as well. Not really much to say about this one, just a decent RP model, tbh. Euryale-inspired I guess.
21
+
22
+ Model was trained by Auri.
23
+
24
+ Training notes
25
+
26
+ Model was trained on a dataset consisting of 77M tokens of synthetic RP and short story gen data. Training took around 2 hours on an 8xH100 SXM node. Training config was more or less reused from Sugarquill, and it worked fairly well again. Had the node crash after finishing the training and merging in the LoRA, so I had to merge it with MergeKit on a separate node; otherwise everything was smooth.
27
+
28
+ Huge thanks to Retis Labs for sponsoring this run!
29
+
30
+ Format
31
+
32
+ Model responds to ChatML instruct formatting, exactly like its base model.
33
+
34
+ <|im_start|>system
35
+ {system message}<|im_end|>
36
+ <|im_start|>user
37
+ {user message}<|im_end|>
38
+ <|im_start|>assistant
39
+ {response}<|im_end|>
40
+
41
+ Recommended Samplers
42
+
43
+ My classic stable Qwen setup works quite well:
44
+
45
+ Temperature - 0.8
46
+ Min-P - 0.05
47
+ Top-A - 0.3
48
+ Repetition Penalty - 1.03
49
+
50
+ Training config
51
+ See Axolotl config
52
+
53
+ axolotl version 0.6.0
54
+
55
+ # Model
56
+ base_model: arcee-ai/SuperNova-Medius
57
+ strict: false
58
+
59
+ # Liger Kernels (optimization)
60
+ plugins:
61
+ - axolotl.integrations.liger.LigerPlugin
62
+ liger_rope: true
63
+ liger_rms_norm: true
64
+ liger_swiglu: true
65
+ liger_fused_linear_cross_entropy: true
66
+
67
+ # Output and HuggingFace
68
+ output_dir: /workspace/axolotl/TQ-2.5-14B-Neon
69
+ hub_model_id: allura-org/TQ-2.5-14B-Neon-LoRA
70
+ hf_use_auth_token: true
71
+ hub_strategy: "all_checkpoints"
72
+
73
+ # WandB
74
+ wandb_project: allura-org
75
+ wandb_entity:
76
+ wandb_name: TQ-2.5-14B-Neon-1
77
+
78
+ # Data
79
+ chat_template: chatml
80
+ #train_on_inputs: false
81
+ group_by_length: false
82
+ datasets:
83
+ - path: allura-org/neon-41k
84
+ type: chat_template
85
+ field_messages: conversations
86
+ message_field_role: from
87
+ message_field_content: value
88
+
89
+ ## Evaluation
90
+ val_set_size: 0.01
91
+ evals_per_epoch: 4
92
+ eval_table_size:
93
+ eval_max_new_tokens: 128
94
+
95
+ # Technical aspects
96
+ sequence_len: 16384
97
+ save_safetensors: true
98
+ saves_per_epoch: 2
99
+ logging_steps: 1
100
+ special_tokens:
101
+
102
+ # Quantization
103
+ bf16: auto
104
+ fp16:
105
+ tf32: false
106
+ ## For LoRA
107
+ load_in_8bit: false
108
+ load_in_4bit: false
109
+
110
+ # LoRA
111
+ peft_use_rslora: true
112
+ peft_use_dora: false # better but slower
113
+ adapter: lora # lora or qlora
114
+ lora_model_dir:
115
+ lora_r: 64 # 64 is optimal for most trains on instruct
116
+ lora_alpha: 32
117
+ lora_dropout: 0.1
118
+ lora_target_linear: true
119
+ lora_fan_in_fan_out:
120
+ lora_target_modules:
121
+ # - embed_tokens
122
+ # - lm_head
123
+
124
+ #loraplus_lr_ratio: 8 # works to converge faster but is kinda cancer bc makes model unstable
125
+ #loraplus_lr_embedding:
126
+
127
+ # Training hyperparameters
128
+ # max_steps:
129
+ num_epochs: 2
130
+
131
+ # Anti Overfit and Stability
132
+ weight_decay: 0.01
133
+ max_grad_norm: 1.0
134
+
135
+ ## Learning Rate
136
+ warmup_ratio: 0.05
137
+ learning_rate: 0.00003
138
+ lr_scheduler: cosine
139
+ #lr_scheduler_kwargs:
140
+ # min_lr: 0.0000024
141
+ optimizer: paged_ademamix_8bit # usually adamw_torch or paged_adamw_8bit
142
+
143
+ ## Batch Size
144
+ gradient_accumulation_steps: 4 # More effective batch size - stabler train, usually. MBS also speeds it up.
145
+ micro_batch_size: 4 # Batch size per gpu = micro_batch_size * gradient_accumulation_steps
146
+ eval_batch_size: 1
147
+
148
+ # Optimizations
149
+ pad_to_sequence_len: true
150
+ sample_packing: true
151
+ eval_sample_packing: false
152
+ flash_attention: true
153
+ xformers_attention:
154
+ gradient_checkpointing: "unsloth"
155
+ gradient_checkpointing_kwargs:
156
+ use_reentrant: true
157
+ local_rank:
158
+ deepspeed: /workspace/axolotl/deepspeed_configs/zero3_bf16.json # Only use with multi gpu # _bf16_cpuoffload_all
159
+ # fsdp:
160
+ # - full_shard
161
+ # - auto_wrap
162
+ # fsdp_config:
163
+ # fsdp_limit_all_gathers: true
164
+ # fsdp_sync_module_states: true
165
+ # fsdp_offload_params: true
166
+ # fsdp_use_orig_params: false
167
+ # fsdp_cpu_ram_efficient_loading: true
168
+ # fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
169
+ # fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
170
+ # fsdp_state_dict_type: FULL_STATE_DICT
171
+ # fsdp_sharding_strategy: FULL_SHARD
172
+ # Misc
173
+ early_stopping_patience:
174
+ debug:
175
+
176
+ ---
177
  ## Use with llama.cpp
178
  Install llama.cpp through brew (works on Mac and Linux)
179