File size: 3,842 Bytes
c625a8c 1dfd50d 654eaa0 30b9c64 654eaa0 f88f764 c625a8c 6a34b4c ef7bf1f 609ebbf aad9e06 b9c177c 6a34b4c 609ebbf 1dfd50d 609ebbf 654eaa0 609ebbf ef7bf1f c064f32 1dfd50d 654eaa0 c625a8c e3f2c3c 96cc7ba 1ede826 ef7bf1f 1ede826 c625a8c f88f764 c625a8c 609ebbf 08499cc c625a8c c4894e1 9427292 c4894e1 c625a8c 609ebbf c4894e1 4b8eb16 ef6577b c625a8c aad9e06 ce5ddf6 c625a8c aad9e06 1dfd50d 5b0eb6a c625a8c 1322444 2a139b2 08499cc aad9e06 96cc7ba aad9e06 1322444 d861c90 1322444 08499cc 1322444 96cc7ba ef7bf1f 1322444 ef7bf1f 1322444 a161c80 ef7bf1f 1322444 aad9e06 3cc3cf4 1322444 7e33769 1322444 6c8cc78 c625a8c 6c8cc78 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
import fastapi
from fastapi.responses import JSONResponse
from time import time
#from fastapi.middleware.cors import CORSMiddleware
#MODEL_PATH = "./qwen1_5-0_5b-chat-q4_0.gguf" #"./qwen1_5-0_5b-chat-q4_0.gguf"
import logging
import llama_cpp
import llama_cpp.llama_tokenizer
from pydantic import BaseModel
class GenModel(BaseModel):
question: str
system: str = "You are a helpful medical AI assistant. Help as much as you can. Remember, response in English."
temperature: float = 0.8
seed: int = 101
mirostat_mode: int=2
mirostat_tau: float=4.0
mirostat_eta: float=1.1
llm_chat = llama_cpp.Llama.from_pretrained(
repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
filename="*q4_0.gguf",
tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B"),
verbose=False,
n_ctx=1024,
n_gpu_layers=0,
chat_format="llama-2"
)
llm_generate = llama_cpp.Llama.from_pretrained(
repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
filename="*q4_0.gguf",
tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B"),
verbose=False,
n_ctx=4096,
n_gpu_layers=0,
mirostat_mode=2,
mirostat_tau=4.0,
mirostat_eta=1.1,
chat_format="llama-2"
)
# Logger setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
app = fastapi.FastAPI(
title="OpenGenAI",
description="Your Excellect AI Physician")
"""
app.add_middleware(
CORSMiddleware,
allow_origins = ["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"]
)
"""
@app.get("/")
def index():
return fastapi.responses.RedirectResponse(url="/docs")
@app.get("/health")
def health():
return {"status": "ok"}
# Chat Completion API
@app.post("/chat/")
async def chat(gen:GenModel):
try:
messages=[
{"role": "assistant", "content": gen.system},
]
st = time()
output = llm_chat.create_chat_completion(
messages = messages,
temperature=gen.temperature,
seed=gen.seed,
#stream=True
)
messages.append({"role": "user", "content": gen.question})
print(output)
et = time()
output["time"] = et - st
messages.append({'role': "assistant", "content": output['choices'][0]['message']['content']})
#print(messages)
return output
except Exception as e:
logger.error(f"Error in /complete endpoint: {e}")
return JSONResponse(
status_code=500, content={"message": "Internal Server Error"}
)
# Chat Completion API
@app.post("/generate")
async def generate(gen:GenModel):
gen.system = "You are an helpful medical AI assistant."
gen.temperature = 0.5
gen.seed = 42
try:
st = time()
output = llm_generate.create_chat_completion(
messages=[
{"role": "system", "content": gen.system},
{"role": "user", "content": gen.question},
],
temperature = gen.temperature,
seed= gen.seed,
#stream=True,
#echo=True
)
"""
for chunk in output:
delta = chunk['choices'][0]['delta']
if 'role' in delta:
print(delta['role'], end=': ')
elif 'content' in delta:
print(delta['content'], end='')
#print(chunk)
"""
et = time()
output["time"] = et - st
return output
except Exception as e:
logger.error(f"Error in /generate endpoint: {e}")
return JSONResponse(
status_code=500, content={"message": "Internal Server Error"}
)
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=7860) |