Update app.py
app.py CHANGED
@@ -217,34 +217,6 @@ MODEL_TITLE = """
 </div>
 </div>
 """
-# <a href='https://arxiv.org/pdf/2312.00738.pdf'><img src='https://img.shields.io/badge/Paper-PDF-red'></a>
-# MODEL_DESC = """
-# <div style='display:flex; gap: 0.25rem; '>
-# <a href='https://github.com/SeaLLMs/SeaLLMs'><img src='https://img.shields.io/badge/Github-Code-success'></a>
-# <a href='https://huggingface.co/spaces/SeaLLMs/SeaLLM-Chat-13b'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a>
-# <a href='https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue'></a>
-# </div>
-# <span style="font-size: larger">
-# This is <a href="https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b" target="_blank">SeaLLM-13B-Chat</a> - a chatbot assistant optimized for Southeast Asian Languages. It produces helpful responses in English 🇬🇧, Vietnamese 🇻🇳, Indonesian 🇮🇩 and Thai 🇹🇭.
-# Explore <a href="https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b" target="_blank">our article</a> for more details.
-# </span>
-# <br>
-# <span >
-# NOTE: The chatbot may produce inaccurate and harmful information about people, places, or facts.
-# <span style="color: red">By using our service, you are required to agree to our <a href="https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b/blob/main/LICENSE" target="_blank" style="color: red">SeaLLM Terms Of Use</a>, which include:</span><br>
-# <ul>
-# <li >
-# You must not use our service to generate any harmful, unethical or illegal content that violates locally applicable and international laws or regulations,
-# including but not limited to hate speech, violence, pornography and deception.</li>
-# <li >
-# The service collects user dialogue data for testing and performance improvement, and reserves the right to distribute it under
-# <a href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution (CC-BY)</a> or similar license. So do not enter any personal information!
-# </li>
-# </ul>
-# </span>
-# """.strip()
-
-# <a href="https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b" target="_blank">SeaLLM-13B-Chat</a> - a helpful chatbot assistant for Southeast Asian Languages. It supports English 🇬🇧, Vietnamese 🇻🇳, Indonesian 🇮🇩, Thai 🇹🇭, Malay 🇲🇾, Khmer🇰🇭, Lao🇱🇦, Tagalog🇵🇭 and Burmese🇲🇲.
 
 
 MODEL_DESC = f"""
@@ -1047,11 +1019,28 @@ class CustomTabbedInterface(gr.Blocks):
 
 
 
-def vllm_abort(self: Any):
+# def vllm_abort(self: Any):
+#     sh = self.llm_engine.scheduler
+#     for g in (sh.waiting + sh.running + sh.swapped):
+#         sh.abort_seq_group(g.request_id)
+
+#     from vllm.sequence import SequenceStatus
+#     scheduler = self.llm_engine.scheduler
+#     for state_queue in [scheduler.waiting, scheduler.running, scheduler.swapped]:
+#         for seq_group in state_queue:
+#             # if seq_group.request_id == request_id:
+#             # Remove the sequence group from the state queue.
+#             state_queue.remove(seq_group)
+#             for seq in seq_group.seqs:
+#                 if seq.is_finished():
+#                     continue
+#                 scheduler.free_seq(seq, SequenceStatus.FINISHED_ABORTED)
+
+
+def vllm_abort(self):
     sh = self.llm_engine.scheduler
     for g in (sh.waiting + sh.running + sh.swapped):
         sh.abort_seq_group(g.request_id)
-
     from vllm.sequence import SequenceStatus
     scheduler = self.llm_engine.scheduler
     for state_queue in [scheduler.waiting, scheduler.running, scheduler.swapped]:
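The two definitions of vllm_abort above differ only in dropping the `self: Any` annotation; the helper is written method-style but lives at module level, so the engine object is passed in explicitly. A hedged sketch of how it might be wired to a stop button (the `llm` instance and the Gradio handler are assumptions, not shown in this diff):

# Sketch: abort all in-flight vLLM requests, e.g. from a Gradio "Stop" click handler.
# `llm` is assumed to be the vllm.LLM instance created in launch(); passing it as `self`
# matches the method-style signature of vllm_abort defined above.
def on_stop_click():
    vllm_abort(llm)  # drains the waiting/running/swapped queues and frees unfinished sequences
    return "Generation aborted."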
@@ -1195,6 +1184,35 @@ def safety_check(text, history=None, ) -> Optional[str]:
     return None
 
 
+
+TURN_TEMPLATE = "<|im_start|>{role}\n{content}</s>"
+TURN_PREFIX = "<|im_start|>{role}\n"
+
+
+def chatml_chat_convo_format(conversations, add_assistant_prefix: bool, default_system=SYSTEM_PROMPT_1):
+    if conversations[0]['role'] != 'system':
+        conversations = [{"role": "system", "content": default_system}] + conversations
+    text = ''
+    for turn_id, turn in enumerate(conversations):
+        prompt = TURN_TEMPLATE.format(role=turn['role'], content=turn['content'])
+        text += prompt
+    if add_assistant_prefix:
+        prompt = TURN_PREFIX.format(role='assistant')
+        text += prompt
+    return text
+
+
+def chatml_format(message, history=None, system_prompt=None):
+    conversations = []
+    system_prompt = system_prompt or "You are a helpful assistant."
+    if history is not None and len(history) > 0:
+        for i, (prompt, res) in enumerate(history):
+            conversations.append({"role": "user", "content": prompt.strip()})
+            conversations.append({"role": "assistant", "content": res.strip()})
+    conversations.append({"role": "user", "content": message.strip()})
+    return chatml_chat_convo_format(conversations, True, default_system=system_prompt)
+
+
 def chat_response_stream_multiturn(
     message: str,
     history: List[Tuple[str, str]],
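The helpers added above build a ChatML-style prompt string: each turn is wrapped as `<|im_start|>{role}\n{content}</s>`, a system turn is prepended if missing, and an open `<|im_start|>assistant\n` prefix is appended for the model to complete. A small usage sketch (the conversation text is illustrative):

# Hypothetical call to the formatter added in this commit.
history = [("Hi", "Hello! How can I help?")]
prompt = chatml_format("What is SeaLLM?", history=history, system_prompt="You are a helpful assistant.")
# prompt ==
# <|im_start|>system
# You are a helpful assistant.</s><|im_start|>user
# Hi</s><|im_start|>assistant
# Hello! How can I help?</s><|im_start|>user
# What is SeaLLM?</s><|im_start|>assistant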
@@ -1242,9 +1260,12 @@ def chat_response_stream_multiturn(
         return
 
     # history will be appended with message later on
-    full_prompt = llama_chat_multiturn_sys_input_seq_constructor(
-        message, history, sys_prompt=system_prompt
-    )
+
+    # full_prompt = llama_chat_multiturn_sys_input_seq_constructor(
+    #     message, history, sys_prompt=system_prompt
+    # )
+    full_prompt = chatml_format(message.strip(), history=history, system_prompt=system_prompt)
+    print(full_prompt)
 
     if len(tokenizer.encode(full_prompt, add_special_tokens=False)) >= 4050:
         raise gr.Error(f"Conversation or prompt is too long, please clear the chatbox or try shorter input.")
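The unchanged guard just below the new `full_prompt` assignment counts prompt tokens without special tokens and rejects conversations at roughly 4050 tokens. A minimal sketch of that check, assuming the Space's `tokenizer` is a Hugging Face AutoTokenizer (the checkpoint name is illustrative):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("SeaLLMs/SeaLLM-Chat-13b")  # illustrative checkpoint
full_prompt = chatml_format("Hello")  # formatter added earlier in this commit
n_prompt_tokens = len(tokenizer.encode(full_prompt, add_special_tokens=False))
if n_prompt_tokens >= 4050:
    raise ValueError("Conversation or prompt is too long, please clear the chatbox or try shorter input.")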
@@ -1254,13 +1275,14 @@ def chat_response_stream_multiturn(
         max_tokens=max_tokens,
         frequency_penalty=frequency_penalty,
         presence_penalty=presence_penalty,
-        stop=['<s>', '</s>', '<<SYS>>', '<</SYS>>', '[INST]', '[/INST]']
+        # stop=['<s>', '</s>', '<<SYS>>', '<</SYS>>', '[INST]', '[/INST]'],
+        stop=['<s>', '</s>', '<|im_start|>', '<|im_end|>'],
     )
     cur_out = None
 
     for j, gen in enumerate(vllm_generate_stream(llm, full_prompt, sampling_params)):
         if cur_out is not None and (STREAM_YIELD_MULTIPLE < 1 or j % STREAM_YIELD_MULTIPLE == 0) and j > 0:
-            cur_out = cur_out.replace("\\n", "\n")
+            # cur_out = cur_out.replace("\\n", "\n")
 
         # optionally check safety, and respond
         if STREAM_CHECK_MULTIPLE > 0 and j % STREAM_CHECK_MULTIPLE == 0:
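With the prompt format switched to ChatML, the stop strings move from the Llama-2 `<<SYS>>`/`[INST]` markers to `<|im_start|>`/`<|im_end|>`. A self-contained sketch of the resulting sampling configuration (vLLM's SamplingParams accepts these fields; the numeric values are placeholders):

from vllm import SamplingParams

sampling_params = SamplingParams(
    temperature=0.7,
    max_tokens=1024,
    frequency_penalty=0.0,
    presence_penalty=0.0,
    # ChatML-style stops matching the template used by chatml_format
    stop=['<s>', '</s>', '<|im_start|>', '<|im_end|>'],
)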
@@ -1569,7 +1591,7 @@ def batch_inference(
     max_tokens: int,
     frequency_penalty: float,
     presence_penalty: float,
-    stop_strings: str = "[STOP],<s>,</s>",
+    stop_strings: str = "[STOP],<s>,</s>,<|im_start|>",
     current_time: Optional[float] = None,
     system_prompt: Optional[str] = SYSTEM_PROMPT_1
 ):
@@ -1603,11 +1625,11 @@ def batch_inference(
     remove_gradio_cache(exclude_names=['upload_chat.json', 'upload_few_shot.json'])
 
     if prompt_mode == 'chat':
-        prompt_format_fn =
+        prompt_format_fn = chatml_format
     elif prompt_mode == 'few-shot':
         from functools import partial
         prompt_format_fn = partial(
-
+            chatml_format, include_end_instruct=False
         )
     else:
         raise gr.Error(f'Wrong mode {prompt_mode}')
@@ -1702,7 +1724,7 @@ def launch():
         f'\n| frequence_penalty={frequence_penalty} '
         f'\n| presence_penalty={presence_penalty} '
         f'\n| temperature={temperature} '
-        f'\n| hf_model_name={hf_model_name} '
+        # f'\n| hf_model_name={hf_model_name} '
         f'\n| model_path={model_path} '
         f'\n| DOWNLOAD_SNAPSHOT={DOWNLOAD_SNAPSHOT} '
         f'\n| gpu_memory_utilization={gpu_memory_utilization} '
@@ -1748,9 +1770,9 @@ def launch():
         print(f'Cannot print model worker: {e}')
 
     try:
-        llm.llm_engine.scheduler_config.max_model_len =
-        llm.llm_engine.scheduler_config.max_num_batched_tokens =
-        llm.llm_engine.tokenizer.add_special_tokens = False
+        llm.llm_engine.scheduler_config.max_model_len = 8192
+        llm.llm_engine.scheduler_config.max_num_batched_tokens = 8192
+        # llm.llm_engine.tokenizer.add_special_tokens = False
     except Exception as e:
         print(f'Cannot set parameters: {e}')
 
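For context, a hedged sketch of where these overrides sit relative to engine construction (the LLM constructor arguments are illustrative; only the scheduler_config assignments appear in this diff):

from vllm import LLM

llm = LLM(model="SeaLLMs/SeaLLM-Chat-13b", gpu_memory_utilization=0.9)  # illustrative arguments
# Post-construction override of the scheduler limits, mirroring the change above; wrapped in
# try/except because these are engine internals that can move between vLLM versions.
try:
    llm.llm_engine.scheduler_config.max_model_len = 8192
    llm.llm_engine.scheduler_config.max_num_batched_tokens = 8192
except Exception as e:
    print(f'Cannot set parameters: {e}')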
@@ -1902,4 +1924,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()