Update app.py
app.py CHANGED
@@ -217,34 +217,6 @@ MODEL_TITLE = """
 </div>
 </div>
 """
-# <a href='https://arxiv.org/pdf/2312.00738.pdf'><img src='https://img.shields.io/badge/Paper-PDF-red'></a>
-# MODEL_DESC = """
-# <div style='display:flex; gap: 0.25rem; '>
-# <a href='https://github.com/SeaLLMs/SeaLLMs'><img src='https://img.shields.io/badge/Github-Code-success'></a>
-# <a href='https://huggingface.co/spaces/SeaLLMs/SeaLLM-Chat-13b'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a>
-# <a href='https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue'></a>
-# </div>
-# <span style="font-size: larger">
-# This is <a href="https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b" target="_blank">SeaLLM-13B-Chat</a> - a chatbot assistant optimized for Southeast Asian Languages. It produces helpful responses in English 🇬🇧, Vietnamese 🇻🇳, Indonesian 🇮🇩 and Thai 🇹🇭.
-# Explore <a href="https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b" target="_blank">our article</a> for more details.
-# </span>
-# <br>
-# <span >
-# NOTE: The chatbot may produce inaccurate and harmful information about people, places, or facts.
-# <span style="color: red">By using our service, you are required to agree to our <a href="https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b/blob/main/LICENSE" target="_blank" style="color: red">SeaLLM Terms Of Use</a>, which include:</span><br>
-# <ul>
-# <li >
-# You must not use our service to generate any harmful, unethical or illegal content that violates locally applicable and international laws or regulations,
-# including but not limited to hate speech, violence, pornography and deception.</li>
-# <li >
-# The service collects user dialogue data for testing and performance improvement, and reserves the right to distribute it under
-# <a href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution (CC-BY)</a> or similar license. So do not enter any personal information!
-# </li>
-# </ul>
-# </span>
-# """.strip()
-
-# <a href="https://huggingface.co/SeaLLMs/SeaLLM-Chat-13b" target="_blank">SeaLLM-13B-Chat</a> - a helpful chatbot assistant for Southeast Asian Languages. It supports English 🇬🇧, Vietnamese 🇻🇳, Indonesian 🇮🇩, Thai 🇹🇭, Malay 🇲🇾, Khmer🇰🇭, Lao🇱🇦, Tagalog🇵🇭 and Burmese🇲🇲.
 
 
 MODEL_DESC = f"""
@@ -1047,11 +1019,28 @@ class CustomTabbedInterface(gr.Blocks):
 
 
 
-def vllm_abort(self: Any):
+# def vllm_abort(self: Any):
+#     sh = self.llm_engine.scheduler
+#     for g in (sh.waiting + sh.running + sh.swapped):
+#         sh.abort_seq_group(g.request_id)
+
+#     from vllm.sequence import SequenceStatus
+#     scheduler = self.llm_engine.scheduler
+#     for state_queue in [scheduler.waiting, scheduler.running, scheduler.swapped]:
+#         for seq_group in state_queue:
+#             # if seq_group.request_id == request_id:
+#             # Remove the sequence group from the state queue.
+#             state_queue.remove(seq_group)
+#             for seq in seq_group.seqs:
+#                 if seq.is_finished():
+#                     continue
+#                 scheduler.free_seq(seq, SequenceStatus.FINISHED_ABORTED)
+
+
+def vllm_abort(self):
     sh = self.llm_engine.scheduler
     for g in (sh.waiting + sh.running + sh.swapped):
         sh.abort_seq_group(g.request_id)
-
     from vllm.sequence import SequenceStatus
     scheduler = self.llm_engine.scheduler
     for state_queue in [scheduler.waiting, scheduler.running, scheduler.swapped]:
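The two definitions of vllm_abort above differ only in dropping the `self: Any` annotation; the helper is written method-style but lives at module level, so the engine object is passed in explicitly. A hedged sketch of how it might be wired to a stop button (the `llm` instance and the Gradio handler are assumptions, not shown in this diff):

# Sketch: abort all in-flight vLLM requests, e.g. from a Gradio "Stop" click handler.
# `llm` is assumed to be the vllm.LLM instance created in launch(); passing it as `self`
# matches the method-style signature of vllm_abort defined above.
def on_stop_click():
    vllm_abort(llm)  # drains the waiting/running/swapped queues and frees unfinished sequences
    return "Generation aborted."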
@@ -1195,6 +1184,35 @@ def safety_check(text, history=None, ) -> Optional[str]:
     return None
 
 
+
+TURN_TEMPLATE = "<|im_start|>{role}\n{content}</s>"
+TURN_PREFIX = "<|im_start|>{role}\n"
+
+
+def chatml_chat_convo_format(conversations, add_assistant_prefix: bool, default_system=SYSTEM_PROMPT_1):
+    if conversations[0]['role'] != 'system':
+        conversations = [{"role": "system", "content": default_system}] + conversations
+    text = ''
+    for turn_id, turn in enumerate(conversations):
+        prompt = TURN_TEMPLATE.format(role=turn['role'], content=turn['content'])
+        text += prompt
+    if add_assistant_prefix:
+        prompt = TURN_PREFIX.format(role='assistant')
+        text += prompt
+    return text
+
+
+def chatml_format(message, history=None, system_prompt=None):
+    conversations = []
+    system_prompt = system_prompt or "You are a helpful assistant."
+    if history is not None and len(history) > 0:
+        for i, (prompt, res) in enumerate(history):
+            conversations.append({"role": "user", "content": prompt.strip()})
+            conversations.append({"role": "assistant", "content": res.strip()})
+    conversations.append({"role": "user", "content": message.strip()})
+    return chatml_chat_convo_format(conversations, True, default_system=system_prompt)
+
+
 def chat_response_stream_multiturn(
     message: str,
     history: List[Tuple[str, str]],
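The helpers added above build a ChatML-style prompt string: each turn is wrapped as `<|im_start|>{role}\n{content}</s>`, a system turn is prepended if missing, and an open `<|im_start|>assistant\n` prefix is appended for the model to complete. A small usage sketch (the conversation text is illustrative):

# Hypothetical call to the formatter added in this commit.
history = [("Hi", "Hello! How can I help?")]
prompt = chatml_format("What is SeaLLM?", history=history, system_prompt="You are a helpful assistant.")
# prompt ==
# <|im_start|>system
# You are a helpful assistant.</s><|im_start|>user
# Hi</s><|im_start|>assistant
# Hello! How can I help?</s><|im_start|>user
# What is SeaLLM?</s><|im_start|>assistant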
@@ -1242,9 +1260,12 @@ def chat_response_stream_multiturn(
         return
 
     # history will be appended with message later on
-    full_prompt = llama_chat_multiturn_sys_input_seq_constructor(
-        message, history, sys_prompt=system_prompt
-    )
+
+    # full_prompt = llama_chat_multiturn_sys_input_seq_constructor(
+    #     message, history, sys_prompt=system_prompt
+    # )
+    full_prompt = chatml_format(message.strip(), history=history, system_prompt=system_prompt)
+    print(full_prompt)
 
     if len(tokenizer.encode(full_prompt, add_special_tokens=False)) >= 4050:
         raise gr.Error(f"Conversation or prompt is too long, please clear the chatbox or try shorter input.")
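The unchanged guard just below the new `full_prompt` assignment counts prompt tokens without special tokens and rejects conversations at roughly 4050 tokens. A minimal sketch of that check, assuming the Space's `tokenizer` is a Hugging Face AutoTokenizer (the checkpoint name is illustrative):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("SeaLLMs/SeaLLM-Chat-13b")  # illustrative checkpoint
full_prompt = chatml_format("Hello")  # formatter added earlier in this commit
n_prompt_tokens = len(tokenizer.encode(full_prompt, add_special_tokens=False))
if n_prompt_tokens >= 4050:
    raise ValueError("Conversation or prompt is too long, please clear the chatbox or try shorter input.")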
@@ -1254,13 +1275,14 @@ def chat_response_stream_multiturn(
         max_tokens=max_tokens,
         frequency_penalty=frequency_penalty,
         presence_penalty=presence_penalty,
-        stop=['<s>', '</s>', '<<SYS>>', '<</SYS>>', '[INST]', '[/INST]']
+        # stop=['<s>', '</s>', '<<SYS>>', '<</SYS>>', '[INST]', '[/INST]'],
+        stop=['<s>', '</s>', '<|im_start|>', '<|im_end|>'],
     )
     cur_out = None
 
     for j, gen in enumerate(vllm_generate_stream(llm, full_prompt, sampling_params)):
         if cur_out is not None and (STREAM_YIELD_MULTIPLE < 1 or j % STREAM_YIELD_MULTIPLE == 0) and j > 0:
-            cur_out = cur_out.replace("\\n", "\n")
+            # cur_out = cur_out.replace("\\n", "\n")
 
         # optionally check safety, and respond
         if STREAM_CHECK_MULTIPLE > 0 and j % STREAM_CHECK_MULTIPLE == 0:
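With the prompt format switched to ChatML, the stop strings move from the Llama-2 `<<SYS>>`/`[INST]` markers to `<|im_start|>`/`<|im_end|>`. A self-contained sketch of the resulting sampling configuration (vLLM's SamplingParams accepts these fields; the numeric values are placeholders):

from vllm import SamplingParams

sampling_params = SamplingParams(
    temperature=0.7,
    max_tokens=1024,
    frequency_penalty=0.0,
    presence_penalty=0.0,
    # ChatML-style stops matching the template used by chatml_format
    stop=['<s>', '</s>', '<|im_start|>', '<|im_end|>'],
)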
@@ -1569,7 +1591,7 @@ def batch_inference(
     max_tokens: int,
     frequency_penalty: float,
     presence_penalty: float,
-    stop_strings: str = "[STOP],<s>,</s>",
+    stop_strings: str = "[STOP],<s>,</s>,<|im_start|>",
     current_time: Optional[float] = None,
     system_prompt: Optional[str] = SYSTEM_PROMPT_1
 ):
@@ -1603,11 +1625,11 @@ def batch_inference(
     remove_gradio_cache(exclude_names=['upload_chat.json', 'upload_few_shot.json'])
 
     if prompt_mode == 'chat':
-        prompt_format_fn =
+        prompt_format_fn = chatml_format
     elif prompt_mode == 'few-shot':
         from functools import partial
         prompt_format_fn = partial(
-
+            chatml_format, include_end_instruct=False
         )
     else:
         raise gr.Error(f'Wrong mode {prompt_mode}')
@@ -1702,7 +1724,7 @@ def launch():
         f'\n| frequence_penalty={frequence_penalty} '
         f'\n| presence_penalty={presence_penalty} '
         f'\n| temperature={temperature} '
-        f'\n| hf_model_name={hf_model_name} '
+        # f'\n| hf_model_name={hf_model_name} '
         f'\n| model_path={model_path} '
         f'\n| DOWNLOAD_SNAPSHOT={DOWNLOAD_SNAPSHOT} '
         f'\n| gpu_memory_utilization={gpu_memory_utilization} '
@@ -1748,9 +1770,9 @@ def launch():
         print(f'Cannot print model worker: {e}')
 
     try:
-        llm.llm_engine.scheduler_config.max_model_len =
-        llm.llm_engine.scheduler_config.max_num_batched_tokens =
-        llm.llm_engine.tokenizer.add_special_tokens = False
+        llm.llm_engine.scheduler_config.max_model_len = 8192
+        llm.llm_engine.scheduler_config.max_num_batched_tokens = 8192
+        # llm.llm_engine.tokenizer.add_special_tokens = False
     except Exception as e:
         print(f'Cannot set parameters: {e}')
 
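For context, a hedged sketch of where these overrides sit relative to engine construction (the LLM constructor arguments are illustrative; only the scheduler_config assignments appear in this diff):

from vllm import LLM

llm = LLM(model="SeaLLMs/SeaLLM-Chat-13b", gpu_memory_utilization=0.9)  # illustrative arguments
# Post-construction override of the scheduler limits, mirroring the change above; wrapped in
# try/except because these are engine internals that can move between vLLM versions.
try:
    llm.llm_engine.scheduler_config.max_model_len = 8192
    llm.llm_engine.scheduler_config.max_num_batched_tokens = 8192
except Exception as e:
    print(f'Cannot set parameters: {e}')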
@@ -1902,4 +1924,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()