---
# mergekit Mixture-of-Experts merge config.
# Builds a Qwen2.5-1.5B MoE from four domain experts; with gate_mode "hidden",
# the router is calibrated from hidden-state embeddings of each expert's
# positive_prompts, so prompt wording directly affects routing quality.
base_model: Qwen/Qwen2.5-1.5B-Instruct
gate_mode: hidden
dtype: bfloat16
experts_per_token: 2

experts:
  # General chat / assistant expert.
  - source_model: Qwen/Qwen2.5-1.5B-Instruct
    positive_prompts:
      - "chat assistant"
      - "chat"
      - "assistant"
  # Writing and text-editing expert (base model, non-instruct).
  - source_model: Qwen/Qwen2.5-1.5B
    positive_prompts:
      - "writing"
      - "text writing"
      - "text editing"
      - "text analysis"
      - "text enhancing"
  # Math and logic expert.
  - source_model: Qwen/Qwen2.5-Math-1.5B-Instruct
    positive_prompts:
      - "math"
      - "math expert"
      - "calculating"
      - "math problem solving"
      - "logic"
  # Coding expert.
  - source_model: Qwen/Qwen2.5-Coder-1.5B-Instruct
    positive_prompts:
      - "coding"
      - "coder"
      - "Python coder"
      - "Java coder"
      - "JS coder"
      - "HTML/CSS coder"
      - "code refactor"
      - "code review"
      - "code enhancing"
      - "rewrite code"
      - "optimize code"

shared_experts:
  - source_model: Qwen/Qwen2.5-1.5B-Instruct
    positive_prompts:  # required by Qwen MoE for "hidden" gate mode, otherwise not allowed
      - "chat assistant"
    # Optional, but recommended: downweight the shared expert's output so it
    # does not overpower the routed experts ("overcooking" the model).
    residual_scale: 0.1