import gradio as gr
from llama_cpp import Llama

# Load the GGUF model with the Gemma prompt template.
llm = Llama(
    model_path="gemma-2b-uk.gguf",
    chat_format="gemma"
)


def convert_history(message, history):
    """Convert Gradio's [user, bot] pair history into OpenAI-style messages."""
    chat_history = []

    # Keep only the most recent exchange to bound the prompt length.
    for block in history[-1:]:
        chat_history.append({
            "role": "user",
            "content": block[0]
        })
        # Use the OpenAI-style "assistant" role: llama-cpp-python's chat
        # formatters do not recognise a "model" role and would drop the turn.
        chat_history.append({
            "role": "assistant",
            "content": block[1]
        })

    chat_history.append({
        "role": "user",
        "content": message
    })

    return chat_history


def ask(message, history):
    chat_history = convert_history(message, history)

    # Stream the completion; temperature=0 gives greedy (deterministic) decoding.
    chunks = llm.create_chat_completion(
        messages=chat_history,
        temperature=0,
        stream=True,
        repeat_penalty=1.05,
        max_tokens=128,
    )

    # Accumulate streamed deltas and yield the growing response so Gradio
    # can render it incrementally.
    response = ""
    for chunk in chunks:
        delta = chunk["choices"][0]["delta"]
        if "content" not in delta:
            continue
        response += delta["content"]
        yield response


demo = gr.ChatInterface(ask)

if __name__ == "__main__":
    demo.queue().launch()
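
# A minimal way to run this sketch (filenames assumed, not from the source):
# save it as app.py next to gemma-2b-uk.gguf, install the dependencies, and
# start the Gradio app:
#
#   pip install gradio llama-cpp-python
#   python app.py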