from typing import List
import fastapi
import markdown
import uvicorn
from ctransformers import AutoModelForCausalLM
from fastapi.responses import HTMLResponse
from fastapi.middleware.cors import CORSMiddleware
from sse_starlette.sse import EventSourceResponse
from pydantic import BaseModel, Field
from typing_extensions import Literal
from dialogue import DialogueTemplate
llm = AutoModelForCausalLM.from_pretrained("TheBloke/starchat-beta-GGML",
model_file="starchat-beta.ggmlv3.q4_0.bin",
model_type="starcoder")
app = fastapi.FastAPI(title="Starchat Beta")
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
@app.get("/")
async def index():
with open("README.md", "r", encoding="utf-8") as readme_file:
md_template_string = readme_file.read()
html_content = markdown.markdown(md_template_string)
return HTMLResponse(content=html_content, status_code=200)
@app.get("/demo")
async def demo():
html_content = """
starchat-alpha-q4.0
"""
return HTMLResponse(content=html_content, status_code=200)
@app.get("/stream")
async def chat(prompt = "<|user|> Write an express server with server sent events. <|assistant|>"):
tokens = llm.tokenize(prompt)
async def server_sent_events(chat_chunks, llm):
yield prompt
for chat_chunk in llm.generate(chat_chunks):
yield llm.detokenize(chat_chunk)
yield ""
return EventSourceResponse(server_sent_events(tokens, llm))
class ChatCompletionRequestMessage(BaseModel):
role: Literal["system", "user", "assistant"] = Field(
default="user", description="The role of the message."
)
content: str = Field(default="", description="The content of the message.")
class ChatCompletionRequest(BaseModel):
messages: List[ChatCompletionRequestMessage] = Field(
default=[], description="A list of messages to generate completions for."
)
system_message = "Below is a conversation between a human user and a helpful AI coding assistant."
@app.post("/v1/chat/completions")
async def chat(request: ChatCompletionRequest, response_mode=None):
kwargs = request.dict()
dialogue_template = DialogueTemplate(
system=system_message, messages=kwargs['messages']
)
prompt = dialogue_template.get_inference_prompt()
tokens = llm.tokenize(prompt)
async def server_sent_events(chat_chunks, llm):
for token in llm.generate(chat_chunks):
yield dict(data=llm.detokenize(token))
yield dict(data="[DONE]")
return EventSourceResponse(server_sent_events(tokens, llm))
if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=8000)