Patching an HF bug that produces an incorrect cache length when only inputs_embeds are passed to the model (#19)
- Patching an HF bug that produces an incorrect cache length when only inputs_embeds are passed to the model (775f6527d3cfd402c46b03c5fbf355b4f262b705)
Co-authored-by: Tomer Ronen <tomer-nv@users.noreply.huggingface.co>
- modeling_decilm.py +45 -1
modeling_decilm.py
CHANGED
@@ -25,7 +25,7 @@ import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from transformers import GenerationConfig
-from transformers.generation.utils import GenerationMixin,
+from transformers.generation.utils import NEED_SETUP_CACHE_CLASSES_MAPPING, GenerationMixin, GenerateOutput
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import (
     add_start_docstrings,
@@ -1311,6 +1311,50 @@ class DeciLMForCausalLM(DeciLMPreTrainedModel, GenerationMixin):
         )
         return model_inputs
 
+    def _maybe_initialize_input_ids_for_generation(
+            self,
+            inputs: Optional[torch.Tensor] = None,
+            bos_token_id: Optional[torch.Tensor] = None,
+            model_kwargs: Optional[dict[str, torch.Tensor]] = None,
+    ) -> torch.LongTensor:
+        """
+        Patching hf bug that creates wrong cache length if only inputs_embeds are passed to the model
+        """
+        input_ids = super()._maybe_initialize_input_ids_for_generation(
+            inputs=inputs, bos_token_id=bos_token_id, model_kwargs=model_kwargs)
+        if (
+                "inputs_embeds" in model_kwargs
+                and input_ids is not None
+                and input_ids.shape[1] == 0
+        ):
+            batch_size, input_sequence_length = model_kwargs["inputs_embeds"].shape[:2]
+            input_ids = torch.zeros((batch_size, input_sequence_length), dtype=torch.long, device=self.device)
+        return input_ids
+
+    def generate(
+            self,
+            inputs: Optional[torch.Tensor] = None,
+            *args,
+            **kwargs,
+    ) -> Union[GenerateOutput, torch.LongTensor]:
+        """
+        Patching hf bug that creates wrong cache length if only inputs_embeds are passed to the model
+        """
+        only_passed_inputs_embeds = (
+                "inputs_embeds" in kwargs and
+                "input_ids" not in kwargs and
+                inputs is None
+        )
+        if only_passed_inputs_embeds:
+            input_sequence_length = kwargs["inputs_embeds"].shape[1]
+
+        generation_output = super().generate(inputs=inputs, *args, **kwargs)
+
+        if only_passed_inputs_embeds and isinstance(generation_output, torch.Tensor):
+            generation_output = generation_output[:, input_sequence_length:]
+
+        return generation_output
+
 
 @add_start_docstrings(
     """
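
In practice, the patch lets callers generate from inputs_embeds alone: the _maybe_initialize_input_ids_for_generation override pads the otherwise empty placeholder input_ids to the embedding sequence length so the KV cache is sized correctly, and the generate override strips those placeholder positions from the returned ids. A minimal usage sketch, assuming a DeciLM checkpoint loaded with trust_remote_code=True (the checkpoint path below is a placeholder, not part of the commit):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "path/to/decilm-checkpoint"  # placeholder: any checkpoint that ships this modeling_decilm.py
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.bfloat16)

# Build prompt embeddings manually and pass them without input_ids.
prompt_ids = tokenizer("Hello, world!", return_tensors="pt").input_ids
inputs_embeds = model.get_input_embeddings()(prompt_ids)

# With the patch, passing only inputs_embeds yields a correctly sized cache,
# and the returned tensor contains only the newly generated token ids.
output_ids = model.generate(inputs_embeds=inputs_embeds, max_new_tokens=20)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))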