base_model: Qwen/Qwen2.5-1.5B-Instruct
gate_mode: hidden
dtype: bfloat16
experts_per_token: 2
experts:
  - source_model: Qwen/Qwen2.5-1.5B-Instruct
    positive_prompts:
      - "chat assistant"
      - "chat"
      - "assistant"
  - source_model: Qwen/Qwen2.5-1.5B
    positive_prompts:
      - "writing"
      - "text writing"
      - "text editing"
      - "text analysis"
      - "text enhancing"
  - source_model: Qwen/Qwen2.5-Math-1.5B-Instruct
    positive_prompts:
      - "math"
      - "math expert"
      - "calculating"
      - "math problem solving"
      - "logic"
  - source_model: Qwen/Qwen2.5-Coder-1.5B-Instruct
    positive_prompts:
      - "coding"
      - "coder"
      - "Python coder"
      - "Java coder"
      - "JS coder"
      - "HTML/CSS coder"
      - "code refactoring"
      - "code review"
      - "code enhancing"
      - "rewrite code"
      - "optimize code"
shared_experts:
  - source_model: Qwen/Qwen2.5-1.5B-Instruct
    positive_prompts: # required for the Qwen MoE architecture when gate_mode is "hidden"; not allowed otherwise
      - "chat assistant"
    # optional, but recommended:
    residual_scale: 0.1 # downweight the shared expert's output so it does not overpower the routed experts
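
After running this config through mergekit's `mergekit-moe` script, the merged checkpoint loads like any other causal LM in Transformers. The sketch below is illustrative only: the output directory `./Qwen2.5-MoE-4x1.5B` and the example prompt are placeholders, not values taken from the config above.

```python
# Minimal usage sketch for the merged MoE (assumes the merge was written to a
# hypothetical local directory "./Qwen2.5-MoE-4x1.5B").
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "./Qwen2.5-MoE-4x1.5B"  # placeholder: wherever mergekit-moe wrote its output
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,  # matches the dtype declared in the config
    device_map="auto",
)

# A math-flavored prompt should be routed mostly to the Qwen2.5-Math expert.
messages = [{"role": "user", "content": "What is 17 * 23? Show your reasoning."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

output_ids = model.generate(input_ids, max_new_tokens=256)
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))
```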