import time

from fastapi import FastAPI
from fastapi.responses import HTMLResponse
from llama_cpp import Llama
from pydantic import BaseModel
from transformers import AutoTokenizer


class Message(BaseModel):
    content: str
    token: int  # maximum number of tokens to generate


class System(BaseModel):
    sys_prompt: str


app = FastAPI()


@app.get("/", response_class=HTMLResponse)
def greet_json():
    # Simple landing page served at the root.
    return "<h1>FastAPI Chatbot</h1>"


# Load the quantized GGUF model once at startup.
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2.5-1.5B-Instruct-GGUF",
    filename="qwen2.5-1.5b-instruct-q8_0.gguf",
    verbose=True,
    n_ctx=1024,
)

# Tokenizer matching the GGUF model above; used only to apply the chat template.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")

# Conversation history shared by all clients (single-user demo, not thread-safe).
messages = []


@app.post("/chat")
def chat(req: Message):
    start = time.time()
    messages.append({"role": "user", "content": req.content})
    # Render the full history into the model's prompt format.
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    output = llm(text, max_tokens=req.token, echo=False)
    response = output["choices"][0]["text"]
    messages.append({"role": "assistant", "content": response})
    end = time.time()
    return {"response": response, "time": end - start}


@app.post("/setSystemPrompt")
def set_system_prompt(req: System):
    # Reset the conversation and seed it with the new system prompt.
    messages.clear()
    messages.append({"role": "system", "content": req.sys_prompt})
    return {"response": "System prompt has been set"}
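
# A minimal usage sketch (an assumption, not part of the original file: it
# presumes this module is saved as app.py and served locally with
# `uvicorn app:app --port 8000`; endpoint paths and request fields come
# from the models defined above):
#
#   curl -X POST http://localhost:8000/setSystemPrompt \
#        -H "Content-Type: application/json" \
#        -d '{"sys_prompt": "You are a helpful assistant."}'
#
#   curl -X POST http://localhost:8000/chat \
#        -H "Content-Type: application/json" \
#        -d '{"content": "Hello!", "token": 128}'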