File size: 3,842 Bytes
c625a8c
 
 
1dfd50d
654eaa0
30b9c64
654eaa0
 
f88f764
c625a8c
6a34b4c
 
 
ef7bf1f
609ebbf
aad9e06
 
 
b9c177c
6a34b4c
609ebbf
 
 
 
 
 
 
1dfd50d
609ebbf
 
654eaa0
 
 
 
 
609ebbf
ef7bf1f
 
c064f32
1dfd50d
654eaa0
c625a8c
 
 
 
e3f2c3c
 
96cc7ba
1ede826
ef7bf1f
 
 
 
 
 
 
1ede826
c625a8c
 
 
 
 
 
 
 
f88f764
c625a8c
609ebbf
08499cc
c625a8c
c4894e1
9427292
c4894e1
c625a8c
609ebbf
c4894e1
4b8eb16
 
ef6577b
c625a8c
aad9e06
ce5ddf6
c625a8c
 
aad9e06
1dfd50d
5b0eb6a
c625a8c
 
 
 
 
 
1322444
2a139b2
08499cc
aad9e06
96cc7ba
aad9e06
1322444
 
d861c90
1322444
08499cc
 
1322444
96cc7ba
 
ef7bf1f
 
1322444
ef7bf1f
1322444
 
 
 
 
 
a161c80
ef7bf1f
1322444
aad9e06
3cc3cf4
1322444
7e33769
1322444
 
 
 
 
6c8cc78
c625a8c
 
6c8cc78
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import fastapi
from fastapi.responses import JSONResponse
from time import time
#from fastapi.middleware.cors import CORSMiddleware
#MODEL_PATH = "./qwen1_5-0_5b-chat-q4_0.gguf" #"./qwen1_5-0_5b-chat-q4_0.gguf"
import logging
import llama_cpp
import llama_cpp.llama_tokenizer
from pydantic import BaseModel


class GenModel(BaseModel):
    # Request body accepted by the /chat/ and /generate endpoints.
    # Documented with comments (not a docstring) so the generated schema
    # description stays exactly as it was.

    # The user's prompt; the only field without a default.
    question: str
    # Default system prompt used when the client does not supply one.
    system: str = "You are a helpful medical AI assistant. Help as much as you can. Remember, response in English."
    # Sampling temperature and RNG seed forwarded to the model.
    temperature: float = 0.8
    seed: int = 101
    # Mirostat sampling settings (mode, tau, eta).
    mirostat_mode: int = 2
    mirostat_tau: float = 4.0
    mirostat_eta: float = 1.1
    
# Model instance used by the /chat/ endpoint: q4_0-quantised GGUF weights for
# Qwen1.5-0.5B-Chat pulled from the Hugging Face Hub, with the Hugging Face
# tokenizer, a 1024-token context and no layers offloaded to a GPU.
# NOTE(review): chat_format="llama-2" is applied to a Qwen chat model, which
# is normally prompted with a ChatML-style template -- confirm this is
# intentional before changing it.
llm_chat = llama_cpp.Llama.from_pretrained(
    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
    filename="*q4_0.gguf",
    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
        "Qwen/Qwen1.5-0.5B"
    ),
    n_ctx=1024,
    n_gpu_layers=0,
    chat_format="llama-2",
    verbose=False,
)
# Model instance used by the /generate endpoint: same weights and tokenizer as
# the chat model, but with a 4096-token context.
# NOTE(review): the mirostat_* keywords are sampling options; the Llama
# constructor appears to accept and discard unknown keyword arguments, so
# these likely have no effect here -- confirm against the installed
# llama-cpp-python version.
# NOTE(review): chat_format="llama-2" on a Qwen chat model -- confirm.
llm_generate = llama_cpp.Llama.from_pretrained(
    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
    filename="*q4_0.gguf",
    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
        "Qwen/Qwen1.5-0.5B"
    ),
    n_ctx=4096,
    n_gpu_layers=0,
    chat_format="llama-2",
    verbose=False,
    mirostat_mode=2,
    mirostat_tau=4.0,
    mirostat_eta=1.1,
)
# Logging: configure the root logger at INFO level and create the
# module-level logger used by the endpoint error handlers.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# FastAPI application object that the route decorators below register on.
# The title and description are metadata passed to FastAPI; the description
# previously contained the misspelling "Excellect".
app = fastapi.FastAPI(
    title="OpenGenAI",
    description="Your Excellent AI Physician",
)
# Disabled CORS configuration, kept as an inert string literal: it is an
# expression statement that is evaluated and discarded, so the middleware is
# never installed. Re-enabling it would also require restoring the
# commented-out CORSMiddleware import at the top of the file.
"""
app.add_middleware(
    CORSMiddleware,
    allow_origins = ["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"]
)
"""
@app.get("/")
def index():
    # Send visitors of the root URL to the docs page at /docs.
    # (Comment rather than docstring, so the route's OpenAPI description
    # is unchanged.)
    docs_redirect = fastapi.responses.RedirectResponse(url="/docs")
    return docs_redirect


@app.get("/health")
def health():
    # Health check: always reports "ok" without touching the models.
    payload = {"status": "ok"}
    return payload
    
# Chat completion API
@app.post("/chat/")
async def chat(gen: GenModel):
    """Answer the user's question with the chat model.

    Uses the request's system prompt, temperature and seed. Returns the
    completion returned by the model with an extra "time" field (seconds spent
    generating), or a 500 JSON response if generation fails.
    """
    # The system prompt must carry the "system" role, and the user's question
    # has to be part of the conversation *before* the completion is requested;
    # otherwise the model never sees the question.
    messages = [
        {"role": "system", "content": gen.system},
        {"role": "user", "content": gen.question},
    ]
    try:
        started = time()
        output = llm_chat.create_chat_completion(
            messages=messages,
            temperature=gen.temperature,
            seed=gen.seed,
        )
        output["time"] = time() - started
    except Exception:
        # Endpoint boundary: log with traceback and hide details from clients.
        logger.exception("Error in /chat/ endpoint")
        return JSONResponse(
            status_code=500, content={"message": "Internal Server Error"}
        )
    logger.debug("chat completion: %s", output)
    return output

# Single-turn generation API
@app.post("/generate")
async def generate(gen: GenModel):
    """Answer the user's question in a single turn with fixed settings.

    The request's system prompt, temperature and seed are replaced with fixed
    values; only the question and the Mirostat settings are taken from the
    request. Returns the completion returned by the model with an extra "time"
    field (seconds spent generating), or a 500 JSON response if generation
    fails.
    """
    # Fixed generation settings: whatever the client sent is overridden.
    gen.system = "You are an helpful medical AI assistant."
    gen.temperature = 0.5
    gen.seed = 42
    try:
        started = time()
        output = llm_generate.create_chat_completion(
            messages=[
                {"role": "system", "content": gen.system},
                {"role": "user", "content": gen.question},
            ],
            temperature=gen.temperature,
            seed=gen.seed,
            # Mirostat is a sampling-time option, so it has to be supplied on
            # the completion call to take effect. The request defaults
            # (mode 2, tau 4.0, eta 1.1) match the values given where the
            # model is constructed.
            mirostat_mode=gen.mirostat_mode,
            mirostat_tau=gen.mirostat_tau,
            mirostat_eta=gen.mirostat_eta,
        )
        # Streaming is not enabled: the whole completion is returned at once.
        output["time"] = time() - started
        return output
    except Exception:
        # Endpoint boundary: log with traceback and hide details from clients.
        logger.exception("Error in /generate endpoint")
        return JSONResponse(
            status_code=500, content={"message": "Internal Server Error"}
        )



if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on all interfaces,
    # port 7860. uvicorn is imported here so that merely importing this
    # module does not require it.
    import uvicorn

    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
    )