The demo is deployed as a Hugging Face Space; the full Gradio app is reproduced below.
```python
import gradio as gr
from llama_cpp import Llama

# Load the Ukrainian Gemma model in GGUF format, with the Gemma chat template.
llm = Llama(
    model_path="gemma-2b-uk.gguf",
    chat_format="gemma"
)
```
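The constructor above relies on defaults for everything else. The same `Llama` constructor in llama-cpp-python accepts further tuning knobs; the values below are illustrative assumptions, not settings taken from the original Space:

```python
# Optional: illustrative extra constructor arguments (not from the original app).
llm = Llama(
    model_path="gemma-2b-uk.gguf",
    chat_format="gemma",
    n_ctx=2048,     # context window; llama-cpp-python's default is small
    n_threads=4,    # CPU threads, relevant on a CPU-only Space
    verbose=False,  # silence llama.cpp's load-time logging
)
```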
```python
def convert_history(message, history):
    """Convert Gradio's (user, bot) history tuples into Gemma-style chat messages."""
    chat_history = []
    # Only the most recent exchange is kept, which bounds the prompt length.
    for block in history[-1:]:
        chat_history.append({
            "role": "user",
            "content": block[0]
        })
        chat_history.append({
            "role": "model",
            "content": block[1]
        })
    # Append the new user message as the final turn.
    chat_history.append({
        "role": "user",
        "content": message
    })
    return chat_history
```
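For a history containing one earlier exchange, the function therefore produces a three-message list. A quick illustration of the output shape (the strings are made up):

```python
convert_history("Thanks!", [("Hello!", "Hi! How can I help?")])
# [
#     {"role": "user",  "content": "Hello!"},
#     {"role": "model", "content": "Hi! How can I help?"},
#     {"role": "user",  "content": "Thanks!"},
# ]
```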
```python
def ask(message, history):
    chat_history = convert_history(message, history)
    # Stream the completion chunk by chunk so the UI updates as tokens arrive.
    chunks = llm.create_chat_completion(
        messages=chat_history,
        temperature=0,
        stream=True,
        repeat_penalty=1.05,
        max_tokens=128,
    )
    response = ""
    for chunk in chunks:
        delta = chunk["choices"][0]["delta"]
        # Some chunks carry only role/finish metadata rather than text.
        if "content" not in delta:
            continue
        response += delta["content"]
        # Yield the accumulated text; gr.ChatInterface re-renders it in place.
        yield response
```
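Since `ask` is a plain generator, it can also be smoke-tested outside Gradio. A hypothetical check (the question is made up; the final yield is the full answer):

```python
*_, final = ask("What is the capital of Ukraine?", [])
print(final)
```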
```python
# Wire the generator into Gradio's standard chat UI.
demo = gr.ChatInterface(ask)

if __name__ == "__main__":
    # Queuing lets Gradio stream the generator's partial responses.
    demo.queue().launch()
```
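The listing assumes `gemma-2b-uk.gguf` sits next to the script. If the weights live in a Hub repository instead, they could be fetched at startup; a minimal sketch, with a hypothetical repo id:

```python
# Hypothetical: download the GGUF from the Hub instead of bundling it.
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

model_path = hf_hub_download(
    repo_id="someuser/gemma-2b-uk-gguf",  # hypothetical repo id
    filename="gemma-2b-uk.gguf",
)
llm = Llama(model_path=model_path, chat_format="gemma")
```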