# QwenMoe-A1.5B-IT / mergekit_moe_config.yml
base_model: Qwen/Qwen2.5-1.5B-Instruct
gate_mode: hidden # initialize each router from hidden-state representations of the positive prompts
dtype: bfloat16
experts_per_token: 2 # top-k routing: two experts are active for each token
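
# With gate_mode "hidden", mergekit runs each expert's positive_prompts through the
# base model and uses the resulting hidden states to initialize the gate, so inputs
# resembling those prompts are preferentially routed to that expert.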
experts:
- source_model: Qwen/Qwen2.5-1.5B-Instruct
positive_prompts:
- "chat assistant"
- "chat"
- "assistant"
- source_model: Qwen/Qwen2.5-1.5B
positive_prompts:
- "writing"
- "text writing"
- "text editing"
- "text analysis"
- "text enchancing"
- source_model: Qwen/Qwen2.5-Math-1.5B-Instruct
positive_prompts:
- "math"
- "math expert"
- "calculating"
- "math problem resolving"
- "logics"
- source_model: Qwen/Qwen2.5-Coder-1.5B-Instruct
positive_prompts:
- "coding"
- "coder"
- "Python coder"
- "Java coder"
- "JS coder"
- "HTML/CSS coder"
- "code refactor"
- "code review"
- "code enchancing"
- "rewrite code"
- "optimize code"
shared_experts:
- source_model: Qwen/Qwen2.5-1.5B-Instruct
positive_prompts: # required by Qwen MoE when gate_mode is "hidden"; not allowed otherwise
- "chat assistant"
# optional, but recommended:
residual_scale: 0.1 # down-weight the shared expert's output to prevent it from overcooking the model
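
# To run the merge (a minimal sketch; the output path is illustrative and CLI
# options may vary across mergekit versions):
#   pip install mergekit
#   mergekit-moe mergekit_moe_config.yml ./QwenMoe-A1.5B-IT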