Failed to convert weights into HuggingFace format
#11 opened by hrezaei
I managed to convert 7B_200B_1 into the transformers format, but for 7B_200B_4 an error like this occurs: KeyError: 'layers.29.attention.wq.weight'
The command I run is:
python /src/transformers/models/llama/convert_llama_weights_to_hf.py --input_dir ~/multi-token-prediction/7B_200B_4 --model_size 7B --output_dir ~/llama-multi-token/7B_200B_4
And the stack trace is:
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.
Saving a LlamaTokenizerFast to ~/llama-multi-token/7B_200B_4.
1 32 32 4096
Fetching all parameters from the checkpoint at ~/multi-token-prediction/7B_200B_4.
Traceback (most recent call last):
File "src/transformers/models/llama/convert_llama_weights_to_hf.py", line 415, in <module>
main()
File "src/transformers/models/llama/convert_llama_weights_to_hf.py", line 403, in main
write_model(
File "src/transformers/models/llama/convert_llama_weights_to_hf.py", line 175, in write_model
loaded[f"layers.{layer_i}.attention.wq.weight"], n_heads=n_heads
KeyError: 'layers.29.attention.wq.weight'
Any workaround would be appreciated.
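For reference, dumping the checkpoint keys shows which parameter names are actually present and which ones the stock converter cannot find. A minimal sketch; it assumes the weights follow the usual consolidated.*.pth shard naming from the Llama releases, so adjust the glob if the files are named differently:

import torch
from pathlib import Path

ckpt_dir = Path.home() / "multi-token-prediction" / "7B_200B_4"

# Assumption: the weights are stored as consolidated.*.pth shards, as in the
# standard Llama releases.
for shard in sorted(ckpt_dir.glob("consolidated.*.pth")):
    state_dict = torch.load(shard, map_location="cpu")
    for key, tensor in sorted(state_dict.items()):
        print(key, tuple(tensor.shape))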
Updating convert_llama_weights_to_hf.py as shown below
for layer_i in range(n_layers):
    filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin"
    if num_shards == 1:
        # Unsharded
        # For 7B_200B_4, keys layers.0-28 are present as usual; the remaining
        # layers 29-31 are stored under extra_heads.0-2, so the hard-coded 29
        # is the index of the first extra-head layer.
        if layer_i >= 29:
            state_dict = {
                f"model.layers.{layer_i}.self_attn.q_proj.weight": permute(
                    loaded[f"extra_heads.{layer_i - 29}.attention.wq.weight"], n_heads=n_heads
                ),
                f"model.layers.{layer_i}.self_attn.k_proj.weight": permute(
                    loaded[f"extra_heads.{layer_i - 29}.attention.wk.weight"],
                    n_heads=num_key_value_heads,
                    dim1=key_value_dim,
                ),
                f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"extra_heads.{layer_i - 29}.attention.wv.weight"],
                f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"extra_heads.{layer_i - 29}.attention.wo.weight"],
                f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"extra_heads.{layer_i - 29}.feed_forward.w1.weight"],
                f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"extra_heads.{layer_i - 29}.feed_forward.w2.weight"],
                f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"extra_heads.{layer_i - 29}.feed_forward.w3.weight"],
                f"model.layers.{layer_i}.input_layernorm.weight": loaded[
                    f"extra_heads.{layer_i - 29}.attention_norm.weight"
                ],
                f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[
                    f"extra_heads.{layer_i - 29}.ffn_norm.weight"
                ],
            }
        else:
            state_dict = {
                f"model.layers.{layer_i}.self_attn.q_proj.weight": permute(
                    loaded[f"layers.{layer_i}.attention.wq.weight"], n_heads=n_heads
                ),
                f"model.layers.{layer_i}.self_attn.k_proj.weight": permute(
                    loaded[f"layers.{layer_i}.attention.wk.weight"],
                    n_heads=num_key_value_heads,
                    dim1=key_value_dim,
                ),
                f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"],
                f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"],
                f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w1.weight"],
                f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w2.weight"],
                f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w3.weight"],
                f"model.layers.{layer_i}.input_layernorm.weight": loaded[
                    f"layers.{layer_i}.attention_norm.weight"
                ],
                f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[
                    f"layers.{layer_i}.ffn_norm.weight"
                ],
            }
can fix the conversion error. We may also need to update model.py so that the HuggingFace model actually loads and uses the extra-head weights:
extra_heads.0.attention.wq.weight
extra_heads.0.attention.wk.weight
extra_heads.0.attention.wv.weight
extra_heads.0.attention.wo.weight
extra_heads.0.feed_forward.w1.weight
extra_heads.0.feed_forward.w2.weight
extra_heads.0.feed_forward.w3.weight
extra_heads.0.attention_norm.weight
extra_heads.0.ffn_norm.weight
extra_heads.1.attention.wq.weight
extra_heads.1.attention.wk.weight
extra_heads.1.attention.wv.weight
extra_heads.1.attention.wo.weight
extra_heads.1.feed_forward.w1.weight
extra_heads.1.feed_forward.w2.weight
extra_heads.1.feed_forward.w3.weight
extra_heads.1.attention_norm.weight
extra_heads.1.ffn_norm.weight
extra_heads.2.attention.wq.weight
extra_heads.2.attention.wk.weight
extra_heads.2.attention.wv.weight
extra_heads.2.attention.wo.weight
extra_heads.2.feed_forward.w1.weight
extra_heads.2.feed_forward.w2.weight
extra_heads.2.feed_forward.w3.weight
extra_heads.2.attention_norm.weight
extra_heads.2.ffn_norm.weight
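Until model.py handles the extra heads properly, a quick sanity check is to confirm that the patched conversion produced all the layers. This is only a load check, not a claim that the stock LlamaForCausalLM reproduces the parallel multi-token heads; layers 29-31 will simply sit in the stack as ordinary decoder layers. A minimal sketch, assuming the same --output_dir as in the conversion command above:

import os
from transformers import LlamaForCausalLM

# Assumption: same --output_dir as used in the conversion command above.
out_dir = os.path.expanduser("~/llama-multi-token/7B_200B_4")
model, loading_info = LlamaForCausalLM.from_pretrained(
    out_dir, output_loading_info=True
)

# For 7B_200B_4 the config should report 32 hidden layers: 29 converted from
# the original layers.* keys plus 3 converted from extra_heads.*.
print("num_hidden_layers:", model.config.num_hidden_layers)

# Ideally both lists are empty (or contain only non-persistent buffers),
# meaning every converted tensor found a matching parameter.
print("missing keys:", loading_info["missing_keys"])
print("unexpected keys:", loading_info["unexpected_keys"])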