Add stream output support

#21
Files changed (1)
  1. modeling_minicpm.py +64 -8
modeling_minicpm.py CHANGED
@@ -22,12 +22,14 @@ import math
 import warnings
 from typing import List, Optional, Tuple, Union, Dict
 
+from threading import Thread
 import torch
 import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
+from transformers import TextIteratorStreamer
 from transformers.activations import ACT2FN
 from transformers.cache_utils import Cache, DynamicCache
 from transformers.modeling_attn_mask_utils import (
@@ -1248,6 +1250,9 @@ class MiniCPM3ForCausalLM(MiniCPM3PreTrainedModel):
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
 
+        # List of terminator tokens used to indicate the end of a sequence or conversation.
+        self.terminators = ['</s>', '<|im_end|>']
+
         # Initialize weights and apply final processing
         self.post_init()
 
@@ -1426,11 +1431,52 @@ class MiniCPM3ForCausalLM(MiniCPM3PreTrainedModel):
                 tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
             )
         return reordered_past
+
+    # Internal function to handle streaming of generated text using TextIteratorStreamer.
+    def _decode_stream(self, input_ids, tokenizer, **kwargs):
+        # Convert terminators to token IDs
+        terminators_ids = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators]
+        # Initialize TextIteratorStreamer for handling streaming output
+        streamer = TextIteratorStreamer(tokenizer=tokenizer, skip_prompt=True, skip_special_tokens=True)
+        # Set up generation parameters, including input IDs, eos token IDs, and streamer
+        generation_kwargs = {
+            'input_ids': input_ids,
+            'eos_token_id': terminators_ids,
+            'streamer': streamer
+        }
+        generation_kwargs.update(kwargs)
+        # Run the generation task in a separate thread to enable streaming output
+        thread = Thread(target=self.generate, kwargs=generation_kwargs)
+        thread.start()
+        # Return the streamer instance for later access to streamed text
+        return streamer
+
 
     @torch.inference_mode()
-    def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user",
-             max_length: int = 4096, num_beams=1, do_sample=True, top_p=0.8, temperature=0.3, logits_processor=None,
-             **kwargs):
+    def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", max_length: int = 4096, num_beams=1,
+             do_sample=True, logits_processor=None, stream=False, top_p=0.8, temperature=0.3, **kwargs):
+        """
+        Main function for handling dialogue generation based on the input query and history.
+
+        Parameters:
+        - tokenizer: Tokenizer instance used for encoding and decoding.
+        - query: The user input query string.
+        - history: Dialogue history, a list of dictionaries where each dictionary contains role and content.
+        - role: The current role, default is "user".
+        - max_length: Maximum length of the generated text.
+        - num_beams: Number of beams for beam search.
+        - do_sample: Whether to use sampling for generation.
+        - logits_processor: Function for processing logits (if any).
+        - stream: Whether to use streaming output.
+        - top_p: Nucleus sampling parameter.
+        - temperature: Temperature parameter for generation.
+        - **kwargs: Additional arguments for generation.
+
+        Returns:
+        - If stream is True, returns a generator that yields the generated text incrementally.
+        - If stream is False, returns the complete generated response string.
+        """
+
         if history is None:
             history = []
         if logits_processor:
@@ -1443,12 +1489,22 @@ class MiniCPM3ForCausalLM(MiniCPM3PreTrainedModel):
         history.append({"role": role, "content": query})
         history_str = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=True)
         inputs = tokenizer(history_str, return_tensors='pt').to(self.device)
-        outputs = self.generate(**inputs, **gen_kwargs)
-        outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1]
-        response = tokenizer.decode(outputs)
-        history.append({"role": "assistant", "content": response})
-        return response, history
 
+        if stream:
+            res = self._decode_stream(inputs["input_ids"], tokenizer, **gen_kwargs)
+            def stream_gen():
+                for text in res:
+                    # Remove terminators from the text
+                    for term in self.terminators:
+                        text = text.replace(term, '')
+                    yield text
+            return stream_gen()
+
+        else:
+            outputs = self.generate(**inputs, **gen_kwargs)
+            outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1]
+            response = tokenizer.decode(outputs)
+            return response
 
 @add_start_docstrings(
     """