Crystalcareai committed · Commit 969568c · 1 Parent(s): 7b223b3
Update modeling_quiet.py

modeling_quiet.py CHANGED (+121 -113)
@@ -1024,16 +1024,14 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
         # Update the attention mask
         if attention_mask is not None:
             attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)
-        else:
-            attention_mask = torch.ones((batch_size, seq_len)).to(input_ids.device)

         # Generate the continuation
         continuation_length = self.n_ahead - 2
         new_key_values = past_key_values

         # Initialize next_token_id with a default value
         next_token_id = torch.zeros(batch_size, dtype=torch.long).to(input_ids.device)

         start_time = time.time()
         for continuation_idx in range(continuation_length):
             outputs = self.model(
@@ -1059,106 +1057,79 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
             next_token_id = torch.argmax(next_token_logits, dim=-1)

             # Append the generated token to the input sequence
-            input_ids = torch.cat([input_ids, next_token_id.unsqueeze(-1).to(input_ids.device)], dim=-1)
+            # input_ids = torch.cat([input_ids, next_token_id.unsqueeze(-1).to(input_ids.device)], dim=-1)
             seq_len += 1

             # Update the attention mask
+            if attention_mask is not None:
+                attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)

         # Append the end thought token to the input sequence
+        end_thought_token_id = self.tokenizer.convert_tokens_to_ids("<|endthought|>")
+        input_ids = torch.cat([input_ids, torch.tensor([[end_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1)
+        seq_len += 1

+        # Update the attention mask
+        if attention_mask is not None:
             attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1)

-        # position_ids = position_ids.unsqueeze(0).expand(batch_size, -1)
-
-        # past_key_values = None
-        # hidden_states = None
-        # all_hidden_states = ()
-
-        # for _ in range(max_length - seq_len):
-        #     logits = self.infer(
-        #         input_ids=input_ids,
-        #         attention_mask=attention_mask,
-        #         position_ids=position_ids,
-        #         past_key_values=past_key_values,
-        #         inputs_embeds=hidden_states,
-        #         use_cache=True,
-        #         output_attentions=False,
-        #         output_hidden_states=False,
-        #         return_dict=False,
-        #     )
-
-        #     next_token_logits = logits[:, -1, :] / temperature
-        #     next_token_id = torch.argmax(next_token_logits, dim=-1)
-
-        #     input_ids = torch.cat([input_ids, next_token_id.unsqueeze(-1)], dim=-1)
-        #     attention_mask = torch.cat([attention_mask, torch.ones((batch_size, 1), device=attention_mask.device)], dim=-1)
-        #     position_ids = torch.cat([position_ids, (position_ids[:, -1] + 1).unsqueeze(-1)], dim=-1)
-
-        #     all_hidden_states = all_hidden_states + (hidden_states,)
-
-        #     return input_ids, all_hidden_states
+        # Get the hidden states before and after the thought
+        outputs_before = self.model(
+            input_ids=original_input_ids,
+            attention_mask=original_attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states_before = outputs_before[0][:, -1:, :]
+
+        # two new tokens: last continuation token and end thought token
+        outputs_after = self.model(
+            input_ids=torch.cat([next_token_id.unsqueeze(-1).to(input_ids.device), torch.tensor([[end_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1),
+            attention_mask=torch.cat([attention_mask[:, -1:], torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1),
+            position_ids=position_ids,
+            past_key_values=new_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states_after = outputs_after[0][:, -1:, :]
+
+        # Apply the talk head to get the mixing weight
+        mixing_weight = self.talk_head[0](torch.cat([hidden_states_before, hidden_states_after], dim=-1))
+
+        # Apply the mixing weight to the hidden states
+        mixed_hidden_states = (1 - mixing_weight) * hidden_states_before + mixing_weight * hidden_states_after
+
+        # Apply the language model head to get the final logits
+        logits = self.lm_head(mixed_hidden_states)
+        return logits
+
+    @torch.no_grad()
+    def generate(
+        self,
+        input_ids: torch.LongTensor = torch.LongTensor(),
+        attention_mask: Optional[torch.Tensor] = None,
+        max_new_tokens: Optional[int] = None,
+        temperature: float = 1.1,
+        **kwargs,
+    ):
+        if isinstance(input_ids, str):
+            input_ids = self.tokenizer(input_ids, return_tensors="pt").input_ids
+
+        if attention_mask is None:
+            # Create a default attention mask if not provided
+            attention_mask = torch.ones_like(input_ids)
+
+        from .generate import generate
+        return generate(self, input_ids, attention_mask=attention_mask, max_new_tokens=max_new_tokens, temperature=temperature, **kwargs)
+
     @add_start_docstrings_to_model_forward(QUIET_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
     def forward(
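For orientation, the core of the new logic in this hunk is a learned interpolation: a small "talk head" looks at the model's last hidden state computed without the thought and the one computed after the thought plus the end-thought token, and the resulting weight blends the two states before the language-model head produces logits. The sketch below illustrates only that mixing step; the sizes, the sigmoid talk head, and the toy vocabulary are assumptions made for the example, not the repository's actual modules.

import torch
import torch.nn as nn

# Toy dimensions for illustration; the real model uses its config's hidden size and vocabulary.
hidden_size, vocab_size = 16, 32

# Assumed stand-in for self.talk_head[0]: maps concatenated (before, after) states to a weight in [0, 1].
talk_head = nn.Sequential(nn.Linear(2 * hidden_size, 1), nn.Sigmoid())
lm_head = nn.Linear(hidden_size, vocab_size, bias=False)  # stand-in for self.lm_head

# Last hidden state without the thought vs. after the thought and end-thought token.
hidden_states_before = torch.randn(1, 1, hidden_size)
hidden_states_after = torch.randn(1, 1, hidden_size)

# Mirror the added lines above: weight the two states, then project to logits.
mixing_weight = talk_head(torch.cat([hidden_states_before, hidden_states_after], dim=-1))
mixed_hidden_states = (1 - mixing_weight) * hidden_states_before + mixing_weight * hidden_states_after
logits = lm_head(mixed_hidden_states)
print(logits.shape)  # torch.Size([1, 1, 32])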
@@ -1891,16 +1862,12 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
                 torch.cuda.empty_cache()


-        return
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
+        return CausalLMOutputWithPast(
+            loss=loss if loss is not None else None,
+            logits=(rm_logits if self.n_ahead > 1 else logits) if not self.output_logits_at_the_end else logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
         )


@@ -1908,18 +1875,59 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
     def prepare_inputs_for_generation(
         self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
     ):
+        # Omit tokens covered by past_key_values
+        if past_key_values is not None:
+            if isinstance(past_key_values, Cache):
+                cache_length = past_key_values.get_seq_length()
+                past_length = past_key_values.seen_tokens
+                max_cache_length = past_key_values.get_max_length()
+            else:
+                cache_length = past_length = past_key_values[0][0].shape[2]
+                max_cache_length = None
+
+            # Keep only the unprocessed tokens:
+            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+            # some of the inputs are exclusively passed as part of the cache (e.g. when passing inputs_embeds as
+            # input)
+            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
+                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
+            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
+            # input_ids based on the past_length.
+            elif past_length < input_ids.shape[1]:
+                input_ids = input_ids[:, past_length:]
+            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
+
+            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
+            if (
+                max_cache_length is not None
+                and attention_mask is not None
+                and cache_length + input_ids.shape[1] > max_cache_length
+            ):
+                attention_mask = attention_mask[:, -max_cache_length:]
+
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+            }
+        )
+        return model_inputs

     @staticmethod
     def _reorder_cache(past_key_values, beam_idx):
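Taken together, the changes in this commit route generation through the new generate() wrapper, which builds a default attention mask when none is given and dispatches to the repository's generate helper (from .generate import generate), with prepare_inputs_for_generation() handling cache-aware input slicing. A hypothetical end-to-end usage sketch follows; the checkpoint id is a placeholder, and trust_remote_code=True is assumed because QuietForCausalLM ships as custom code with the repository.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder repository id; substitute the checkpoint that actually ships modeling_quiet.py.
repo_id = "your-org/your-quiet-checkpoint"

tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)

inputs = tokenizer("The capital of France is", return_tensors="pt")
with torch.no_grad():
    output = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=32,
        temperature=1.1,  # default used by the new wrapper
    )
# `output` is whatever the repository's generate helper returns (token ids in the simplest case);
# decode with the tokenizer accordingly.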