import gradio as gr
# from llama_cpp import Llama
import random
import subprocess
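
# This script wraps a local llama.cpp build in a simple Gradio web UI:
# the user's message is forwarded to the llama.cpp `main` binary via
# subprocess and the raw stdout is returned as the response. The
# llama_cpp-python path below is kept commented out as an alternative
# in-process approach.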
# Initialize model
# llm = Llama(model_path="/stablelm-2-zephyr-1_6b-Q4_0.gguf", n_gpu_layers=0, seed=random.randint(1, 2**31))
""" | |
def generate_response(user_message): | |
encodeduserm = b"### Human: " + user_message.encode('utf-8') + b"\n### Assistant:" | |
tokens = llm.tokenize(encodeduserm) | |
output = b"" | |
count = 0 | |
for token in llm.generate(tokens, top_k=40, top_p=0.95, temp=0.72, repeat_penalty=1.1): | |
text = llm.detokenize([token]) | |
output += text | |
count += 1 | |
if count >= 500 or (token == llm.token_eos()): | |
break | |
return output.decode() | |
""" | |
def generate_response(user_message):
    # Build the llama.cpp command: -m selects the model file, -p passes the
    # prompt, -n caps generation at 400 tokens, and -e processes escape
    # sequences (\n, \t, ...) in the prompt.
    cmd = [
        "./llama.cpp/main",  # Path to the llama.cpp executable
        "-m", "llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
        "-p", user_message,
        "-n", "400",
        "-e",
    ]
    # Run the binary and return its captured stdout as the model's response.
    result = subprocess.run(cmd, capture_output=True, text=True)
    return result.stdout
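
# Note: llama.cpp's main binary typically echoes the prompt at the start of
# stdout (its logs go to stderr), so the text shown in Gradio will include the
# user's message. A minimal post-processing sketch, assuming the prompt is
# echoed verbatim at the beginning of the output; strip_prompt is a
# hypothetical helper and is not wired into generate_response above.
def strip_prompt(prompt: str, output: str) -> str:
    return output[len(prompt):].lstrip() if output.startswith(prompt) else output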
iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=2, placeholder="Type your message here..."),
    outputs="text",
    title="LLaMA Chat Interface",
    description="Enter your message and get a response from the StableLM 2 Zephyr model served through llama.cpp.",
    flagging_dir="/usr/src/app/flagged",
)

# share=True also requests a temporary public gradio.live link alongside the local server.
iface.launch(share=True)