gemma-2b-uk / app.py
import gradio as gr
from llama_cpp import Llama
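
# Load the Ukrainian Gemma 2B model (GGUF) with llama.cpp's built-in Gemma chat template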
llm = Llama(
    model_path="gemma-2b-uk.gguf",
    chat_format="gemma",
)
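
# Convert Gradio's (user, bot) history pairs into llama.cpp chat messages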
def convert_history(message, history):
    chat_history = []
    # Only the most recent (user, bot) pair is forwarded to the model
    for block in history[-1:]:
        chat_history.append({
            "role": "user",
            "content": block[0]
        })
        chat_history.append({
            "role": "model",
            "content": block[1]
        })
    chat_history.append({
        "role": "user",
        "content": message
    })
    return chat_history
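
# Stream the model's reply, yielding the accumulated text as it grows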
def ask(message, history):
    chat_history = convert_history(message, history)
    # temperature=0 for deterministic output; max_tokens=128 keeps responses short
    chunks = llm.create_chat_completion(
        messages=chat_history,
        temperature=0,
        stream=True,
        repeat_penalty=1.05,
        max_tokens=128,
    )

    response = ""
    for chunk in chunks:
        delta = chunk["choices"][0]["delta"]
        # Skip deltas without text content (e.g. the initial role-only chunk)
        if "content" not in delta:
            continue
        response += delta["content"]
        yield response
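
# Minimal Gradio chat UI wired to the streaming generator above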
demo = gr.ChatInterface(ask)
if __name__ == "__main__":
demo.queue().launch()