Inference error: The size of tensor a (616) must match the size of tensor b (1231) at non-singleton dimension 3

#2
by mikeytrw - opened

I'm trying to run this on my 3090. The model loads just fine, but I get the error below whether I use the pipeline or the transformers implementation directly:

RuntimeError: The size of tensor a (616) must match the size of tensor b (1231) at non-singleton dimension 3
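For reference, the setup is roughly the following (a minimal sketch; the checkpoint id and test image are placeholders, and the failing call is the same model.generate shown in the traceback below):

```python
import torch
import requests
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

# Placeholder checkpoint id; substitute the repo this discussion belongs to.
model_id = "llava-hf/bakLlava-v1-hf"

model = LlavaForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.float16
).to(0)  # single RTX 3090
processor = AutoProcessor.from_pretrained(model_id)

prompt = "USER: <image>\nWhat is shown in this image? ASSISTANT:"
# Any test image works; the reported tensor sizes change with the image.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
raw_image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(prompt, raw_image, return_tensors="pt").to(0, torch.float16)
output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
print(processor.decode(output[0][2:], skip_special_tokens=True))
```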

Llava Hugging Face org

Hi @mikeytrw, we recently fixed a similar issue in transformers. Can you try transformers==4.36.0? pip install -U transformers

Hey, thanks for the quick response. I'm using transformers 4.36.0.dev0
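A quick way to confirm which transformers build the notebook kernel is actually importing (useful when a dev install and a pip install coexist):

```python
import transformers

print(transformers.__version__)  # e.g. "4.36.0.dev0"
print(transformers.__file__)     # shows which installation is being picked up
```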

Here is the full output; the tensor sizes change if I change the input image:


RuntimeError Traceback (most recent call last)
Cell In[7], line 7
5 inputs = processor(prompt, raw_image, return_tensors='pt').to(0, torch.float16)
6 #inputs = processor(prompt, return_tensors='pt').to(0, torch.float16)
----> 7 output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
8 print(processor.decode(output[0][2:], skip_special_tokens=True))

File ~/.local/lib/python3.10/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)

File ~/.local/lib/python3.10/site-packages/transformers/generation/utils.py:1718, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
1701 return self.assisted_decoding(
1702 input_ids,
1703 assistant_model=assistant_model,
(...)
1714 **model_kwargs,
1715 )
1716 if generation_mode == GenerationMode.GREEDY_SEARCH:
1717 # 11. run greedy search
-> 1718 return self.greedy_search(
1719 input_ids,
1720 logits_processor=logits_processor,
1721 stopping_criteria=stopping_criteria,
1722 pad_token_id=generation_config.pad_token_id,
1723 eos_token_id=generation_config.eos_token_id,
1724 output_scores=generation_config.output_scores,
1725 return_dict_in_generate=generation_config.return_dict_in_generate,
1726 synced_gpus=synced_gpus,
1727 streamer=streamer,
1728 **model_kwargs,
1729 )
1731 elif generation_mode == GenerationMode.CONTRASTIVE_SEARCH:
1732 if not model_kwargs["use_cache"]:

File ~/.local/lib/python3.10/site-packages/transformers/generation/utils.py:2579, in GenerationMixin.greedy_search(self, input_ids, logits_processor, stopping_criteria, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus, streamer, **model_kwargs)
2576 model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
2578 # forward pass to get next token
-> 2579 outputs = self(
2580 **model_inputs,
2581 return_dict=True,
2582 output_attentions=output_attentions,
2583 output_hidden_states=output_hidden_states,
2584 )
2586 if synced_gpus and this_peer_finished:
2587 continue # don't waste resources running the code we don't need

File ~/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
-> 1518 return self._call_impl(*args, **kwargs)

File ~/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
1522 # If we don't have any hooks, we want to skip the rest of the logic in
1523 # this function, and just call forward.
1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527 return forward_call(*args, **kwargs)
1529 try:
1530 result = None

File ~/.local/lib/python3.10/site-packages/transformers/models/llava/modeling_llava.py:433, in LlavaForConditionalGeneration.forward(self, input_ids, pixel_values, attention_mask, position_ids, past_key_values, inputs_embeds, vision_feature_layer, vision_feature_select_strategy, labels, use_cache, output_attentions, output_hidden_states, return_dict)
430 attention_mask = torch.cat((attention_mask, extended_attention_mask), dim=1)
431 position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
--> 433 outputs = self.language_model(
434 attention_mask=attention_mask,
435 position_ids=position_ids,
436 past_key_values=past_key_values,
437 inputs_embeds=inputs_embeds,
438 use_cache=use_cache,
439 output_attentions=output_attentions,
440 output_hidden_states=output_hidden_states,
441 return_dict=return_dict,
442 )
444 logits = outputs[0]
446 loss = None

File ~/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
-> 1518 return self._call_impl(*args, **kwargs)

File ~/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
1522 # If we don't have any hooks, we want to skip the rest of the logic in
1523 # this function, and just call forward.
1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527 return forward_call(*args, **kwargs)
1529 try:
1530 result = None

File ~/.local/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py:1046, in MistralForCausalLM.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)
1043 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1045 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-> 1046 outputs = self.model(
1047 input_ids=input_ids,
1048 attention_mask=attention_mask,
1049 position_ids=position_ids,
1050 past_key_values=past_key_values,
1051 inputs_embeds=inputs_embeds,
1052 use_cache=use_cache,
1053 output_attentions=output_attentions,
1054 output_hidden_states=output_hidden_states,
1055 return_dict=return_dict,
1056 )
1058 hidden_states = outputs[0]
1059 logits = self.lm_head(hidden_states)

File ~/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
-> 1518 return self._call_impl(*args, **kwargs)

File ~/.local/lib/python3.10/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
1522 # If we don't have any hooks, we want to skip the rest of the logic in
1523 # this function, and just call forward.
1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527 return forward_call(*args, **kwargs)
1529 try:
1530 result = None

File ~/.local/lib/python3.10/site-packages/transformers/models/mistral/modeling_mistral.py:894, in MistralModel.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)
891 attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
892 else:
893 # 4d mask is passed through the layers
--> 894 attention_mask = _prepare_4d_causal_attention_mask(
895 attention_mask,
896 (batch_size, seq_length),
897 inputs_embeds,
898 past_key_values_length,
899 sliding_window=self.config.sliding_window,
900 )
902 hidden_states = inputs_embeds
904 if self.gradient_checkpointing and self.training:

File ~/.local/lib/python3.10/site-packages/transformers/modeling_attn_mask_utils.py:217, in _prepare_4d_causal_attention_mask(attention_mask, input_shape, inputs_embeds, past_key_values_length, sliding_window)
215 # 4d mask is passed through the layers
216 if attention_mask is not None:
--> 217 attention_mask = attn_mask_converter.to_4d(
218 attention_mask, input_shape[-1], key_value_length=key_value_length, dtype=inputs_embeds.dtype
219 )
220 else:
221 attention_mask = attn_mask_converter.to_causal_4d(
222 input_shape[0], input_shape[-1], key_value_length, dtype=inputs_embeds.dtype, device=inputs_embeds.device
223 )

File ~/.local/lib/python3.10/site-packages/transformers/modeling_attn_mask_utils.py:136, in AttentionMaskConverter.to_4d(self, attention_mask_2d, query_length, dtype, key_value_length)
132 expanded_attn_mask = self._expand_mask(attention_mask_2d, dtype, tgt_len=input_shape[-1]).to(
133 attention_mask_2d.device
134 )
135 if causal_4d_mask is not None:
--> 136 expanded_attn_mask = causal_4d_mask.masked_fill(expanded_attn_mask.bool(), torch.finfo(dtype).min)
138 # expanded_attn_mask + causal_4d_mask can cause some overflow
139 expanded_4d_mask = expanded_attn_mask

RuntimeError: The size of tensor a (595) must match the size of tensor b (1189) at non-singleton dimension 3
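The last frame is where the shapes clash: the causal 4D mask and the expanded attention mask disagree at dimension 3 (595 vs. 1189 in this run). A minimal standalone illustration of that failure mode, using the numbers from the traceback (the real masks are built by transformers' attention-mask utilities, so these shapes are only illustrative):

```python
import torch

# Shapes copied from the traceback; in the real code they come out of
# AttentionMaskConverter.to_4d. Because dimension 3 of the two masks differs,
# masked_fill cannot broadcast one onto the other and raises.
causal_4d_mask = torch.zeros(1, 1, 595, 595, dtype=torch.float16)
expanded_attn_mask = torch.zeros(1, 1, 595, 1189, dtype=torch.float16)

causal_4d_mask.masked_fill(expanded_attn_mask.bool(), torch.finfo(torch.float16).min)
# RuntimeError: The size of tensor a (595) must match the size of tensor b (1189)
# at non-singleton dimension 3
```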

Llava Hugging Face org

The attention mask seems to be wrong, but I think we fixed this on main.

Llava Hugging Face org

@mikeytrw this should have been fixed recently in a PR by @ArthurZ. Can you try uninstalling transformers and re-installing it? pip uninstall transformers && pip install -U transformers

It's working now, thanks!

ArthurZ changed discussion status to closed
