Feature: chat to chihiro.
Browse files
- app.py +58 -24
- modeling_chatglm.py +68 -30
app.py
CHANGED
@@ -58,29 +58,63 @@ def evaluate(context, temperature, top_p, top_k):
     )
     out_text = tokenizer.decode(out[0]).split("Answer: ")[1]
     return out_text
+
+def evaluate_stream(msg, history, temperature, top_p):
+    generation_config = GenerationConfig(
+        temperature=temperature,
+        top_p=top_p,
+        #repetition_penalty=1.1,
+        num_beams=1,
+        do_sample=True,
+    )
+
+    history.append([msg, None])
+
+    context = ""
+    if len(history) > 5:
+        history.pop(0)
+
+    for j in range(len(history)):
+        history[j][0] = history[j][0].replace("<br>", "")
+
+    # concatenate context
+    for h in history[:-1]:
+        context += h[0] + "\n" + h[1] + "\n"
+
+    context += history[-1][0]
+    context = context.replace(r'<br>', '')
+
+    h = []
+    print("History:", history)
+    print("Context:", context)
+    for response, h in model.stream_chat(tokenizer, context, h, max_length=160, top_p=top_p, temperature=temperature):
+        history[-1][1] = response
+        yield history, ""
+
+    #return response
 
 import gradio as gr
- ... (24 removed lines: the previous gr.-based interface definition; their content is truncated in this diff view)
+with gr.Blocks() as demo:
+    state = gr.State()
+    with gr.Row():
+        with gr.Column(scale=2):
+            temp = gr.components.Slider(minimum=0, maximum=1.1, value=0.9, label="Temperature",
+                                        info="温度参数,越高的温度生成的内容越丰富,但是有可能出现语法问题。")
+            top_p = gr.components.Slider(minimum=0.5, maximum=1.0, value=0.97, label="Top-p",
+                                         info="top-p参数,只输出前p>top-p的文字,越大生成的内容越丰富,但也可能出现语法问题。数字越小似乎上下文的衔接性越好。")
+            #code = gr.Textbox(label="temp_output", info="解码器输出")
+            #top_k = gr.components.Slider(minimum=1, maximum=200, step=1, value=25, label="Top k",
+            #                             info="top-k参数,下一个输出的文字会从top-k个文字中进行选择,越大生成的内容越丰富,但也可能出现语法问题。数字越小似乎上下文的衔接性越好。")
+
+        with gr.Column(scale=3):
+            chatbot = gr.Chatbot(label="聊天框", info="")
+            msg = gr.Textbox(label="输入框", placeholder="最近过得怎么样?",
+                             info="输入你的内容,按[Enter]发送。也可以什么都不填写生成随机数据。聊天会追随上下文,如果要换个话题建议按下按钮清除聊天。")
+            clear = gr.Button("清除聊天")
+
+    msg.submit(evaluate_stream, [msg, chatbot, temp, top_p], [chatbot, msg])
+    clear.click(lambda: None, None, chatbot, queue=False)
+
+
+demo.queue()
+demo.launch(debug=False)
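For context, a minimal consumption sketch of the `model.stream_chat` generator that `evaluate_stream` wraps. Assumptions: `model` and `tokenizer` are the ChatGLM model and tokenizer loaded earlier in app.py (not shown in this diff), and the query string is arbitrary.

# Hedged sketch: each iteration yields the partial response decoded so far plus the updated history.
history = []
for response, history in model.stream_chat(tokenizer, "最近过得怎么样?", history,
                                            max_length=160, top_p=0.97, temperature=0.9):
    print(response)  # the partial response grows until generation stops

`evaluate_stream` does the same thing, but writes each partial response into `history[-1][1]` and yields `(history, "")` so that Gradio refreshes the Chatbot and clears the textbox on every step.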
modeling_chatglm.py
CHANGED
@@ -4,6 +4,8 @@ import math
 import copy
 import os
 import warnings
+import re
+import sys
 
 import torch
 import torch.utils.checkpoint
@@ -31,10 +33,12 @@ from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList
 from configuration_chatglm import ChatGLMConfig
 
 # flags required to enable jit fusion kernels
-torch._C._jit_set_profiling_mode(False)
-torch._C._jit_set_profiling_executor(False)
-torch._C._jit_override_can_fuse_on_cpu(True)
-torch._C._jit_override_can_fuse_on_gpu(True)
+
+if sys.platform != 'darwin':
+    torch._C._jit_set_profiling_mode(False)
+    torch._C._jit_set_profiling_executor(False)
+    torch._C._jit_override_can_fuse_on_cpu(True)
+    torch._C._jit_override_can_fuse_on_gpu(True)
 
 logger = logging.get_logger(__name__)
 
@@ -51,7 +55,7 @@ class InvalidScoreLogitsProcessor(LogitsProcessor):
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
         if torch.isnan(scores).any() or torch.isinf(scores).any():
             scores.zero_()
-            scores[..., 20005] = 5e4
+            scores[..., 20005] = 5e4
         return scores
 
 
@@ -265,7 +269,7 @@ def attention_fn(
         if not (attention_mask == 0).all():
            # if auto-regressive, skip
            attention_scores.masked_fill_(attention_mask, -10000.0)
-        dtype = attention_scores.dtype
+        dtype = attention_scores.dtype
        attention_scores = attention_scores.float()
        attention_scores = attention_scores * query_key_layer_scaling_coeff
 
@@ -610,8 +614,8 @@ class ChatGLMPreTrainedModel(PreTrainedModel):
     a simple interface for downloading and loading pretrained models.
     """
 
-    is_parallelizable = False
-    supports_gradient_checkpointing = False
+    is_parallelizable = False
+    supports_gradient_checkpointing = False
     config_class = ChatGLMConfig
     base_model_prefix = "transformer"
     _no_split_modules = ["GLM6BBlock"]
@@ -619,13 +623,10 @@ class ChatGLMPreTrainedModel(PreTrainedModel):
     def __init__(self, *inputs, **kwargs):
         super().__init__(*inputs, **kwargs)
 
-    def _init_weights(self, module):
+    def _init_weights(self, module: nn.Module):
+        """Initialize the weights."""
         return
 
-    def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, (GLMBlock)):
-            module.gradient_checkpointing = value
-
 
 CHATGLM_6B_START_DOCSTRING = r"""
     This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class.
@@ -722,7 +723,6 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
         self.inner_hidden_size = config.inner_hidden_size
         self.hidden_size_per_attention_head = self.hidden_size // self.num_attention_heads
         self.position_encoding_2d = config.position_encoding_2d
-        self.model_parallel = True
 
         self.word_embeddings = skip_init(
             torch.nn.Embedding,
@@ -757,9 +757,8 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
     def set_input_embeddings(self, new_embeddings: torch.Tensor):
         self.word_embeddings = new_embeddings
 
-    def get_masks(self, seq, device):
-        context_length = seq.index(150004) + 1
+    def get_masks(self, seq, device):
+        context_length = seq.index(self.config.bos_token_id) + 1
 
         attention_mask = torch.ones((1, len(seq), len(seq)), device=device)
         attention_mask.tril_()
@@ -770,9 +769,9 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
         return attention_mask
 
     def get_position_ids(self, seq, mask_position, device, gmask=False):
-        context_length = seq.index(150004) + 1
+        context_length = seq.index(self.config.bos_token_id) + 1
         if self.position_encoding_2d:
-            seq_length = seq.index(150004)
+            seq_length = seq.index(self.config.bos_token_id)
             position_ids = torch.arange(context_length, dtype=torch.long, device=device)
             if not gmask:
                 position_ids[seq_length:] = mask_position
@@ -827,14 +826,8 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
 
         if past_key_values is None:
             past_key_values = tuple([None] * len(self.layers))
-
-        MASK, gMASK = 150000, 150001
-        mask_token = MASK if MASK in input_ids else gMASK
-        use_gmask = False if MASK in input_ids else gMASK
         seq = input_ids[0].tolist()
 
-        mask_position = seq.index(mask_token)
-
         if attention_mask is None:
             attention_mask = self.get_masks(
                 seq=seq,
@@ -842,6 +835,11 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
             )
 
         if position_ids is None:
+            MASK, gMASK = 150000, 150001
+            mask_token = MASK if MASK in input_ids else gMASK
+            use_gmask = False if MASK in input_ids else gMASK
+
+            mask_position = seq.index(mask_token)
             position_ids = self.get_position_ids(
                 seq=seq,
                 mask_position=mask_position,
@@ -940,12 +938,12 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
     def get_masks_and_position_ids(self, seq, mask_position, context_length, device, gmask=False):
         attention_mask = torch.ones((1, context_length, context_length), device=device)
         attention_mask.tril_()
-        attention_mask[..., :context_length - 1] = 1
+        attention_mask[..., :context_length - 1] = 1
         attention_mask.unsqueeze_(1)
         attention_mask = (attention_mask < 0.5).bool()
 
         if self.position_encoding_2d:
-            seq_length = seq.index(150004)
+            seq_length = seq.index(self.config.bos_token_id)
             position_ids = torch.arange(context_length, dtype=torch.long, device=device)
             if not gmask:
                 position_ids[seq_length:] = mask_position
@@ -983,7 +981,7 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
 
         # only last token for input_ids if past is not None
         if past is not None or past_key_values is not None:
-            context_length = seq.index(150004)
+            context_length = seq.index(self.config.bos_token_id)
             last_token = input_ids[:, -1].unsqueeze(-1)
             if self.position_encoding_2d:
                 position_ids = torch.tensor([[[mask_position], [len(seq) - context_length]]], dtype=torch.long,
@@ -1091,6 +1089,21 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
             for layer_past in past
         )
 
+    def process_response(self, response):
+        response = response.strip()
+        response = response.replace("[[训练时间]]", "2023年")
+        punkts = [
+            [",", "，"],
+            ["!", "！"],
+            [":", "："],
+            [";", "；"],
+            ["\?", "？"],
+        ]
+        for item in punkts:
+            response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response)
+            response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response)
+        return response
+
     @torch.no_grad()
     def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 2048, num_beams=1,
              do_sample=True, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs):
@@ -1113,11 +1126,35 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
         outputs = self.generate(**input_ids, **gen_kwargs)
         outputs = outputs.tolist()[0][len(input_ids["input_ids"][0]):]
         response = tokenizer.decode(outputs)
-        response = response.strip()
-        response = response.replace("[[训练时间]]", "2023年")
+        response = self.process_response(response)
         history = history + [(query, response)]
         return response, history
 
+    @torch.no_grad()
+    def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 2048,
+                    do_sample=True, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs):
+        if history is None:
+            history = []
+        if logits_processor is None:
+            logits_processor = LogitsProcessorList()
+        logits_processor.append(InvalidScoreLogitsProcessor())
+        gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p,
+                      "temperature": temperature, "logits_processor": logits_processor, **kwargs}
+        if not history:
+            prompt = query
+        else:
+            prompt = ""
+            for i, (old_query, response) in enumerate(history):
+                prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
+            prompt += "[Round {}]\n问:{}\n答:".format(len(history), query)
+        input_ids = tokenizer([prompt], return_tensors="pt", padding=True)
+        input_ids = input_ids.to(self.device)
+        for outputs in self.stream_generate(**input_ids, **gen_kwargs):
+            outputs = outputs.tolist()[0][len(input_ids["input_ids"][0]):]
+            response = tokenizer.decode(outputs)
+            response = self.process_response(response)
+            new_history = history + [(query, response)]
+            yield response, new_history
 
     @torch.no_grad()
     def stream_generate(
@@ -1220,6 +1257,7 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
             if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
                 break
             yield input_ids
+
     def quantize(self, bits: int):
         from .quantization import quantize
         self.transformer = quantize(self.transformer, bits)
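For reference, a quick illustration of what the new `process_response` helper does. The call below is a hypothetical usage sketch, assuming `model` is a loaded `ChatGLMForConditionalGeneration` instance: the method strips whitespace, substitutes the `[[训练时间]]` placeholder, and rewrites half-width punctuation adjacent to CJK characters to its full-width form.

# Hypothetical usage sketch; `model` is assumed to be a loaded ChatGLMForConditionalGeneration.
text = "模型训练于[[训练时间]],你好吗?"
print(model.process_response(text))
# -> "模型训练于2023年，你好吗？"  (placeholder replaced; the "," and "?" next to Chinese
#    characters are converted to full-width by the regex substitutions shown in the diff)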