base_model: Qwen/Qwen2.5-1.5B-Instruct
gate_mode: hidden
dtype: bfloat16
experts_per_token: 2
experts:
  - source_model: Qwen/Qwen2.5-1.5B-Instruct
    positive_prompts:
      - "chat assistant"
      - "chat"
      - "assistant"
  - source_model: Qwen/Qwen2.5-1.5B
    positive_prompts:
      - "writing"
      - "text writing"
      - "text editing"
      - "text analysis"
      - "text enhancing"
  - source_model: Qwen/Qwen2.5-Math-1.5B-Instruct
    positive_prompts:
      - "math"
      - "math expert"
      - "calculating"
      - "math problem solving"
      - "logic"
  - source_model: Qwen/Qwen2.5-Coder-1.5B-Instruct
    positive_prompts:
      - "coding"
      - "coder"
      - "Python coder"
      - "Java coder"
      - "JS coder"
      - "HTML/CSS coder"
      - "code refactoring"
      - "code review"
      - "code enhancing"
      - "rewrite code"
      - "optimize code"
shared_experts:
  - source_model: Qwen/Qwen2.5-1.5B-Instruct
    positive_prompts:  # required by the Qwen MoE architecture when gate_mode is "hidden"; not allowed otherwise
      - "chat assistant"
    # optional, but recommended:
    residual_scale: 0.1  # down-weight the shared expert's output to avoid overcooking the model
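
To turn this config into an actual model, save it to disk and pass it to mergekit's `mergekit-moe` entry point, then load the result like any other causal LM. Below is a minimal sketch, assuming `mergekit` and `transformers` are installed and enough memory is available for four 1.5B experts; the file names `moe-config.yaml` and `./qwen2.5-moe` are placeholders, not part of the config above.

```python
# Sketch: run the mergekit-moe merge and smoke-test the resulting model.
# Assumes `pip install mergekit transformers`; paths are placeholders.
import subprocess

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

CONFIG = "moe-config.yaml"   # the YAML above, saved to disk
OUT_DIR = "./qwen2.5-moe"    # where mergekit writes the merged model

# With gate_mode: hidden, mergekit-moe embeds each expert's
# positive_prompts using the base model's hidden states and uses those
# vectors to initialize the per-layer routers.
subprocess.run(["mergekit-moe", CONFIG, OUT_DIR], check=True)

# Load the merged model. With experts_per_token: 2, each token is
# routed to the two experts whose prompt vectors best match it.
tokenizer = AutoTokenizer.from_pretrained(OUT_DIR)
model = AutoModelForCausalLM.from_pretrained(OUT_DIR, torch_dtype=torch.bfloat16)

messages = [{"role": "user", "content": "Refactor this: def f(x): return x*2"}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
)
output = model.generate(inputs, max_new_tokens=128)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```

The output directory holds a standard Qwen MoE checkpoint, so it should load with stock `transformers` and no `trust_remote_code`.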