import gradio as gr
import psutil
import subprocess
import time
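

# Gradio front end for a local llama.cpp binary: streams the model's stdout
# into the UI word by word and logs subprocess CPU/RAM usage via psutil.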
def generate_response(user_message):
    # llama.cpp CLI flags: -m model path, -p prompt, -n max tokens to
    # generate, -e process escape sequences (\n etc.) in the prompt.
    cmd = [
        "/app/llama.cpp/main",
        "-m", "/app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
        "-p", user_message,
        "-n", "400",
        "-e",
    ]

    # Line-buffered text pipes so output can be consumed as it is produced.
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, bufsize=1)
    process_monitor = psutil.Process(process.pid)

    start_time = time.time()
    monitor_start_time = time.time()
    alltokens = ""
    token_buffer = ""
    tokencount = 0

    try:
        while True:
            char = process.stdout.read(1)
            # An empty read with a set return code means stdout is exhausted.
            if char == '' and process.poll() is not None:
                break
            if char != '':
                token_buffer += char
                # Flush on whitespace so the UI streams roughly word by word.
                if char == ' ' or char == '\n':
                    elapsed_time = time.time() - start_time
                    alltokens += token_buffer
                    tokencount += 1
                    yield f"{alltokens} \n\n [Inference time: {elapsed_time:.2f} seconds | Tokens: {tokencount}]"
                    token_buffer = ''

            # Log subprocess resource usage roughly once a minute.
            if time.time() - monitor_start_time > 60:
                cpu_usage = process_monitor.cpu_percent()
                memory_usage = process_monitor.memory_info().rss
                print(f"Subprocess CPU Usage: {cpu_usage}%, Memory Usage: {memory_usage / 1024 ** 2:.1f} MB")
                monitor_start_time = time.time()

        # Flush any trailing partial token once the process has exited.
        if token_buffer:
            elapsed_time = time.time() - start_time
            alltokens += token_buffer
            tokencount += 1
            yield f"{alltokens} \n\n [Inference time: {elapsed_time:.2f} seconds | Average tokens per second: {tokencount / elapsed_time:.2f}]"

    finally:
        try:
            process.wait(timeout=60)
        except subprocess.TimeoutExpired:
            print("Process didn't complete within the timeout. Killing it.")
            process.kill()
            process.wait()

        # Read stderr before closing the pipes; reading a closed pipe
        # would raise ValueError.
        if process.returncode != 0:
            error_message = process.stderr.read()
            print(f"Error: {error_message}")

        process.stdout.close()
        process.stderr.close()
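

# Thin wrappers that prepend a task-specific prefix from CustomPrompts
# (defined below; it only needs to exist by the time a button is clicked)
# before delegating to the streaming generator above.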
def custom_generate_response(cust_user_message):
    cust_user_message = CustomPrompts[0] + '\n\n' + cust_user_message + '\n\n'
    yield from generate_response(cust_user_message)


def custom_generate_response1(cust_user_message):
    cust_user_message = CustomPrompts[1] + '\n\n' + cust_user_message + '\n\n'
    yield from generate_response(cust_user_message)


def custom_generate_response2(cust_user_message):
    cust_user_message = CustomPrompts[2] + '\n' + cust_user_message + '\n\n'
    yield from generate_response(cust_user_message)


# Prompt prefixes; the same strings double as the button labels below.
CustomPrompts = [
    "Write a Class Diagram based on the following text:",
    "Write Pydot code based on the following text:",
    "Describe in great detail how a standard happy scene in any movie would be planned, based on the following text:",
]
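

# UI: a plain Interface for free-form prompts, plus a group of buttons that
# route the same input box through one of the CustomPrompts prefixes.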
with gr.Blocks() as iface:
    gr.Interface(
        fn=generate_response,
        inputs=gr.Textbox(lines=2, placeholder="Type your message here..."),
        outputs="text",
        title="Stable LM 2 Zephyr (1.6b) llama.cpp Interface Test",
        description="No prompt template used yet (essentially autocomplete). No message history for now - enter your message and get a response.",
        flagging_dir="/usr/src/app/flagged",
    )
    with gr.Group():
        gr.HTML("Test for wrapping generator (instead of buttons, tabs, and dropdowns?)")
        MainOutput = gr.TextArea(placeholder='Output will show here')
        CustomButtonInput = gr.TextArea(lines=1, placeholder='Prompt goes here')
        CustomButtonClassDiagram = gr.Button(CustomPrompts[0])
        CustomButtonPydotcode = gr.Button(CustomPrompts[1])
        CustomButtonHappyMovieScene = gr.Button(CustomPrompts[2])
        CustomButtonClassDiagram.click(custom_generate_response, inputs=[CustomButtonInput], outputs=MainOutput)
        CustomButtonPydotcode.click(custom_generate_response1, inputs=[CustomButtonInput], outputs=MainOutput)
        CustomButtonHappyMovieScene.click(custom_generate_response2, inputs=[CustomButtonInput], outputs=MainOutput)

iface.queue().launch(server_name="0.0.0.0", share=True)
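
# With share=True, Gradio prints both a local URL (port 7860 by default) and
# a temporary public *.gradio.live link when the app starts.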