"""Minimal FastAPI chatbot backed by a local Qwen GGUF model via llama.cpp."""

from fastapi import FastAPI
from fastapi.responses import HTMLResponse
from transformers import AutoTokenizer
from pydantic import BaseModel
from llama_cpp import Llama


class Message(BaseModel):
    """Request body for ``POST /chat``."""

    # Text of the user's turn.
    content: str
    # Passed to the model as ``max_tokens`` for this reply.
    token: int


class System(BaseModel):
    """Request body for ``POST /setSystemPrompt``."""

    # Text stored as the conversation's system message.
    sys_prompt: str


app = FastAPI()


@app.get("/", response_class=HTMLResponse)
def greet_json():
    """Serve the landing page at the root path as HTML."""
    return " FastAPI Chatbot\n"


# Loaded once at import time; every request reuses the same model instance.
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen2.5-1.5B-Instruct-GGUF",
    filename="qwen2.5-1.5b-instruct-q8_0.gguf",
    verbose=False,
)

# The tokenizer is only used to render the chat template into a prompt string.
# NOTE(review): this is the Qwen2 tokenizer while the model is Qwen2.5 --
# confirm the two chat templates are interchangeable for this use.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-1.5B-Instruct")

# Conversation history as chat-template dicts ({"role": ..., "content": ...}).
# Module-level state: every client of this process shares one conversation.
messages = []


@app.post("/chat")
def chat(req: Message):
    """Append the user's message, generate a reply, and return it.

    The whole history is rendered through the tokenizer's chat template so
    the model sees earlier turns. The reply is stored in the history before
    being returned.

    Args:
        req: The user's message and the token budget for the reply.

    Returns:
        ``{"response": <generated text>}``.
    """
    messages.append({"role": "user", "content": req.content})
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    output = llm(text, max_tokens=req.token, echo=False)
    response = output["choices"][0]["text"]
    messages.append({"role": "assistant", "content": response})
    return {"response": response}


@app.post("/setSystemPrompt")
def set_system_prompt(req: System):
    """Set (or replace) the system prompt for the shared conversation.

    The prompt is stored with the ``system`` role as the first message, and
    any previously stored system message is dropped so only one is active.

    Args:
        req: The new system prompt.

    Returns:
        A fixed confirmation payload.
    """
    # Mutate in place so the module-level list object stays the same.
    messages[:] = [m for m in messages if m["role"] != "system"]
    messages.insert(0, {"role": "system", "content": req.sys_prompt})
    return {"response": "System has been set"}


@app.post("/clear_chat")
def clear_chat():
    """Remove every stored message, including any system prompt.

    Returns:
        A fixed confirmation payload.
    """
    messages.clear()
    return {"message": "Chat history cleared"}