from fastapi import FastAPI
from fastapi.responses import HTMLResponse
from transformers import AutoTokenizer
from pydantic import BaseModel
from llama_cpp import Llama
import time
class Message(BaseModel):
    content: str
    token: int  # maximum number of tokens to generate for this turn
class System(BaseModel):
    sys_prompt: str
app = FastAPI()
@app.get("/", response_class=HTMLResponse)
def greet():
    # Minimal landing page confirming the service is up.
    return """
    FastAPI Chatbot
    """
# Download the quantized GGUF weights from the Hugging Face Hub and load them.
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2.5-1.5B-Instruct-GGUF",
    filename="qwen2.5-1.5b-instruct-q8_0.gguf",
    verbose=True,
    n_ctx=1024,
)
# The tokenizer supplies the chat template and should match the model family
# (Qwen2.5, not Qwen2, to stay consistent with the GGUF weights above).
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
# Shared, process-wide conversation history (one conversation for all clients).
messages = []
@app.post("/chat")
def chat(req: Message):
a = time.time()
messages.append({"role": "user", "content": req.content})
text = tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
output = llm(text,max_tokens=req.token,echo=False)
response = output['choices'][0]['text']
messages.append({"role": "assistant", "content": response})
b = time.time()
return {"response": response, "time": b-a}
@app.post("/setSystemPrompt")
def chat(req: System):
global conversation_history
conversation_history = []
messages.append({"role": "user", "content": req.sys_prompt})
return {"response": "System has been set"}