File size: 3,678 Bytes
c625a8c 654eaa0 30b9c64 654eaa0 f88f764 c625a8c 6a34b4c 52b60b9 609ebbf 6a34b4c 609ebbf 654eaa0 609ebbf c9e4960 654eaa0 c625a8c e3f2c3c 96cc7ba c625a8c f88f764 c625a8c 609ebbf 08499cc c625a8c c4894e1 c625a8c 609ebbf c4894e1 4b8eb16 ef6577b c625a8c efb4830 ce5ddf6 5b0eb6a 66da31c 5b0eb6a 66da31c 5b0eb6a f181ae2 5b0eb6a c625a8c da2cdb2 542af36 5b0eb6a c625a8c 1322444 2a139b2 08499cc 96cc7ba 08499cc 1322444 609ebbf 1322444 08499cc 1322444 96cc7ba ac1d2a7 1322444 6c8cc78 c625a8c 6c8cc78 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
import fastapi
from fastapi.responses import JSONResponse
from time import time
#MODEL_PATH = "./qwen1_5-0_5b-chat-q4_0.gguf" #"./qwen1_5-0_5b-chat-q4_0.gguf"
import logging
import llama_cpp
import llama_cpp.llama_tokenizer
from pydantic import BaseModel
class GenModel(BaseModel):
    # Request body accepted by the POST endpoints defined in this module.

    # The user's question; the only field without a default.
    question: str

    # System prompt sent as the first message of the conversation.
    system: str = "You are a helpful medical assistant."

    # Sampling temperature forwarded to create_chat_completion.
    temperature: float = 0.8

    # Seed forwarded to create_chat_completion.
    seed: int = 101
# Both model handles are built from the same repo, file pattern and tokenizer;
# only the context window differs, so the shared settings live in one helper.
def _load_qwen_chat(n_ctx):
    # Return a Llama instance with its own HF tokenizer and the given n_ctx.
    # n_gpu_layers=0 means no layers are offloaded to a GPU.
    hf_tokenizer = llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
        "Qwen/Qwen1.5-0.5B"
    )
    return llama_cpp.Llama.from_pretrained(
        repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
        filename="*q4_0.gguf",
        tokenizer=hf_tokenizer,
        verbose=False,
        n_ctx=n_ctx,
        n_gpu_layers=0,
    )


# Model used by the /chat/ endpoint (1024-token context).
llm_chat = _load_qwen_chat(1024)

# Model used by the /generate endpoint (4096-token context).
llm_generate = _load_qwen_chat(4096)
# Logger setup: configure root logging at INFO and create this module's logger,
# which the endpoint error handlers below write to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# FastAPI application object that the route decorators below register on.
# The description previously read "Excellect" (typo).
app = fastapi.FastAPI(
    title="OpenGenAI",
    description="Your Excellent AI Physician",
)
# Root path: send visitors to the interactive API documentation page.
@app.get("/")
def index():
    docs_redirect = fastapi.responses.RedirectResponse(url="/docs")
    return docs_redirect
# Health probe: always answers with a constant {"status": "ok"} payload.
@app.get("/health")
def health():
    payload = dict(status="ok")
    return payload
# Chat Completion API
@app.post("/chat/")
async def chat(gen: GenModel):
    """Answer a question with the chat model.

    Builds a two-message conversation (system prompt, then the user's
    question), runs a non-streaming chat completion with the request's
    temperature and seed, and returns the completion dict with an extra
    ``"time"`` key holding the elapsed wall-clock seconds.

    Args:
        gen: Request body carrying the question, system prompt,
            temperature and seed.

    Returns:
        The completion dict from ``llm_chat.create_chat_completion`` plus
        ``"time"``, or a 500 ``JSONResponse`` if anything raises.
    """
    try:
        # The question must be in the conversation *before* the model is
        # called; previously it was appended afterwards, so the model only
        # ever saw the system prompt.
        messages = [
            {"role": "system", "content": gen.system},
            {"role": "user", "content": gen.question},
        ]
        st = time()
        output = llm_chat.create_chat_completion(
            messages=messages,
            temperature=gen.temperature,
            seed=gen.seed,
        )
        et = time()
        output["time"] = et - st
        logger.info("chat completion: %s", output)

        # Record the assistant turn as returned by the model. The original
        # nested the whole message dict under "content".
        messages.append(output["choices"][0]["message"])
        logger.info("chat transcript: %s", messages)
        return output
    except Exception:
        # Top-level boundary: log with traceback and hide details from the
        # client.
        logger.exception("Error in /chat endpoint")
        return JSONResponse(
            status_code=500, content={"message": "Internal Server Error"}
        )
# Text generation API (fixed system prompt and sampling settings)
@app.post("/generate")
async def generate(gen: GenModel):
    """Answer a question using fixed generation settings.

    Only ``gen.question`` is used. The system prompt, temperature and seed
    are always the fixed values below, regardless of what the request
    supplies. The completion is requested in streaming mode and the streamed
    pieces are joined into a single response.

    Args:
        gen: Request body; only its ``question`` field is read.

    Returns:
        A dict with a ``"choices"`` list holding the assembled assistant
        message and a ``"time"`` key with the elapsed wall-clock seconds,
        or a 500 ``JSONResponse`` if anything raises.
    """
    # Fixed settings for this endpoint, kept local so the request model is
    # not mutated.
    system_prompt = "You are an AI assistant."
    temperature = 0.5
    seed = 42
    try:
        st = time()
        stream = llm_generate.create_chat_completion(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": gen.question},
            ],
            temperature=temperature,
            seed=seed,
            stream=True,
        )

        # With stream=True the call yields chunks, not a dict. The original
        # then did output["time"] = ... on that iterator, which raised and
        # turned every request into a 500. Collect the deltas instead.
        role = "assistant"
        pieces = []
        finish_reason = None
        for chunk in stream:
            choice = chunk["choices"][0]
            delta = choice["delta"]
            if "role" in delta:
                role = delta["role"]
            if delta.get("content"):
                pieces.append(delta["content"])
            if choice.get("finish_reason") is not None:
                finish_reason = choice["finish_reason"]
        et = time()

        # Mirror the shape returned by the /chat/ endpoint so clients can
        # read choices[0]["message"]["content"] from either one.
        return {
            "choices": [
                {
                    "index": 0,
                    "message": {"role": role, "content": "".join(pieces)},
                    "finish_reason": finish_reason,
                }
            ],
            "time": et - st,
        }
    except Exception:
        # Top-level boundary: log with traceback and hide details from the
        # client.
        logger.exception("Error in /generate endpoint")
        return JSONResponse(
            status_code=500, content={"message": "Internal Server Error"}
        )
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on all interfaces,
    # port 7860. uvicorn is imported here so that importing this module
    # does not require it.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)