gemma-2b-uk / app.py
import gradio as gr
from llama_cpp import Llama
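
# Load the Ukrainian Gemma 2B model (GGUF) with llama.cpp's built-in Gemma chat template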
llm = Llama(
    model_path="gemma-2b-uk.gguf",
    chat_format="gemma",
)
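
# Convert Gradio's (user, bot) history pairs into llama.cpp chat messages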
def convert_history(message, history):
    chat_history = []
    # Only the most recent (user, bot) pair is forwarded to the model
    for block in history[-1:]:
        chat_history.append({
            "role": "user",
            "content": block[0]
        })
        chat_history.append({
            "role": "model",
            "content": block[1]
        })
    chat_history.append({
        "role": "user",
        "content": message
    })
    return chat_history
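
# Stream the model's reply, yielding the accumulated text as it grows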
def ask(message, history):
    chat_history = convert_history(message, history)
    # temperature=0 for deterministic output; max_tokens=128 keeps responses short
    chunks = llm.create_chat_completion(
        messages=chat_history,
        temperature=0,
        stream=True,
        repeat_penalty=1.05,
        max_tokens=128,
    )

    response = ""
    for chunk in chunks:
        delta = chunk["choices"][0]["delta"]
        # Skip deltas without text content (e.g. the initial role-only chunk)
        if "content" not in delta:
            continue
        response += delta["content"]
        yield response
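
# Minimal Gradio chat UI wired to the streaming generator above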
demo = gr.ChatInterface(ask)
if __name__ == "__main__":
demo.queue().launch()