import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import Literal, Sequence, TypedDict

Role = Literal["system", "user", "assistant"]

class Message(TypedDict):
    role: Role
    content: str

Dialog = Sequence[Message]
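# For illustration, a Dialog is simply an ordered sequence of Messages, e.g.:
# [{"role": "system", "content": "You are a helpful clinician."},
#  {"role": "user", "content": "What is the treatment for diabetes?"}]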
class ChatFormat:
    """String-level port of Meta's Llama 3 chat template."""

    def encode_header(self, message: Message) -> str:
        return f"<|start_header_id|>{message['role']}<|end_header_id|>\n\n"

    def encode_message(self, message: Message) -> str:
        header = self.encode_header(message)
        # Each turn ends with <|eot_id|> (token id 128009), which is also
        # used below as the eos/pad id for generation.
        return f"{header}{message['content'].strip()}<|eot_id|>"

    def encode_dialog_prompt(self, dialog: Dialog) -> str:
        # The HF tokenizer prepends <|begin_of_text|> when encoding,
        # so it is not repeated here.
        dialog_str = ""
        for message in dialog:
            dialog_str += self.encode_message(message)
        # Open an empty assistant turn so the model generates the reply.
        dialog_str += self.encode_header({"role": "assistant", "content": ""})
        return dialog_str
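# For reference, encode_dialog_prompt on a system+user dialog yields a string
# of the following shape (illustrative; {instruction} and {query} stand in for
# the actual message contents):
#
#   <|start_header_id|>system<|end_header_id|>\n\n{instruction}<|eot_id|>
#   <|start_header_id|>user<|end_header_id|>\n\n{query}<|eot_id|>
#   <|start_header_id|>assistant<|end_header_id|>\n\n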
class MedS_Llama3:
    def __init__(self, model_path: str):
        # Load the model on CPU
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map='cpu',           # keep the model on CPU
            torch_dtype=torch.float32   # standard float32 precision
        )
        # 128009 is Llama 3's <|eot_id|> token
        self.model.config.pad_token_id = self.model.config.eos_token_id = 128009
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            model_max_length=2048,
            padding_side="right"
        )
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model.eval()
        self.prompt_engine = ChatFormat()
        print('Model and tokenizer loaded on CPU!')
    def __build_inputs_for_llama3(self, query: str, instruction: str) -> str:
        # Wrap the instruction as the system turn and the query as the user turn.
        input_ss = [
            {"role": "system", "content": instruction},
            {"role": "user", "content": query}
        ]
        return self.prompt_engine.encode_dialog_prompt(input_ss)
    def chat(self, query: str, instruction: str, max_output_tokens: int) -> str:
        formatted_query = f"Input:\n{query}\nOutput:\n"
        input_sentence = self.__build_inputs_for_llama3(formatted_query, instruction)
        input_tokens = self.tokenizer(
            input_sentence,
            return_tensors="pt",
            padding=True,
            truncation=True
        )
        output = self.model.generate(
            **input_tokens,
            max_new_tokens=max_output_tokens,
            eos_token_id=128009  # stop at <|eot_id|>
        )
        # Decode only the newly generated tokens, skipping the prompt.
        generated_text = self.tokenizer.decode(
            output[0][input_tokens['input_ids'].shape[1]:],
            skip_special_tokens=True
        )
        return generated_text.strip()
# Instantiate the model
model_path = "Henrychur/MMedS-Llama-3-8B"  # make sure this is the correct model path
chat_model = MedS_Llama3(model_path)
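# A minimal sanity check (illustrative only; the example query below is an
# assumption, not part of the Space):
# print(chat_model.chat(
#     query="I have had a persistent cough for two weeks.",
#     instruction="If you are a doctor, please perform clinical consulting with the patient.",
#     max_output_tokens=128,
# ))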
# Response function used by the Gradio interface
def respond(message, system_message, max_output_tokens):
    # History is discarded between calls: each response uses only the current
    # input and the system instruction.
    response = chat_model.chat(query=message, instruction=system_message, max_output_tokens=max_output_tokens)
    # Yield so Gradio treats this as a (single-step) streaming generator.
    yield response
# Set up the Gradio interface
demo = gr.Interface(
    fn=respond,
    inputs=[
        gr.Textbox(value="What is the treatment for diabetes?", label="Your Input"),
        gr.Textbox(value="If you are a doctor, please perform clinical consulting with the patient.", label="System message"),
        gr.Slider(minimum=1, maximum=1024, value=512, step=1, label="Max Output Tokens")
    ],
    outputs="text"
)
if __name__ == "__main__":
    demo.launch()
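# To expose the app beyond localhost, launch() accepts standard Gradio options,
# e.g. (optional sketch): demo.launch(server_name="0.0.0.0", share=True)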