base_model: Qwen/Qwen2.5-1.5B-Instruct
gate_mode: hidden
dtype: bfloat16
experts_per_token: 2
experts:
  - source_model: Qwen/Qwen2.5-1.5B-Instruct
    positive_prompts:
      - "chat assistant"
      - "chat"
      - "assistant"
  - source_model: Qwen/Qwen2.5-1.5B
    positive_prompts:
      - "writing"
      - "text writing"
      - "text editing"
      - "text analysis"
      - "text enhancing"
  - source_model: Qwen/Qwen2.5-Math-1.5B-Instruct
    positive_prompts:
      - "math"
      - "math expert"
      - "calculating"
      - "math problem solving"
      - "logic"
  - source_model: Qwen/Qwen2.5-Coder-1.5B-Instruct
    positive_prompts:
      - "coding"
      - "coder"
      - "Python coder"
      - "Java coder"
      - "JS coder"
      - "HTML/CSS coder"
      - "code refactoring"
      - "code review"
      - "code enhancing"
      - "rewrite code"
      - "optimize code"
shared_experts:
  - source_model: Qwen/Qwen2.5-1.5B-Instruct
    positive_prompts: # required for the Qwen MoE architecture when gate_mode is "hidden"; not allowed otherwise
      - "chat assistant"
    # optional, but recommended:
    residual_scale: 0.1 # downweight the shared expert's output so it does not overpower the routed experts
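
After running this config through mergekit's `mergekit-moe` script, the merged checkpoint loads like any other causal LM in Transformers. The sketch below is illustrative only: the output directory `./Qwen2.5-MoE-4x1.5B` and the example prompt are placeholders, not values taken from the config above.

```python
# Minimal usage sketch for the merged MoE (assumes the merge was written to a
# hypothetical local directory "./Qwen2.5-MoE-4x1.5B").
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "./Qwen2.5-MoE-4x1.5B"  # placeholder: wherever mergekit-moe wrote its output
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,  # matches the dtype declared in the config
    device_map="auto",
)

# A math-flavored prompt should be routed mostly to the Qwen2.5-Math expert.
messages = [{"role": "user", "content": "What is 17 * 23? Show your reasoning."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

output_ids = model.generate(input_ids, max_new_tokens=256)
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))
```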