kwabs22
committed on
Commit • d9e0520
1 Parent(s): 03936f4
after bufsize=1 change, exploring word or token level stream
app.py CHANGED
@@ -34,6 +34,45 @@ def generate_response(user_message): #Figure Out the parameters later and find a
         print(f"Error: {error_message}")
 
 
+def generate_response_token_by_token(user_message):
+    cmd = [
+        "/app/llama.cpp/main",  # Path to the executable
+        "-m", "/app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
+        "-p", user_message,
+        "-n", "400",
+        "-e"
+    ]
+
+    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, bufsize=1)
+
+    start_time = time.time()
+    token_buffer = ''
+    while True:
+        # Read one character at a time
+        char = process.stdout.read(1)
+        if char == '' and process.poll() is not None:
+            break
+        if char != '':
+            token_buffer += char
+            if char == ' ' or char == '\n':  # Token delimiters
+                elapsed_time = time.time() - start_time  # Calculate elapsed time
+                yield f"{token_buffer} [Inference time: {elapsed_time:.2f} seconds]"
+                token_buffer = ''  # Reset token buffer
+
+    # Yield the last token if there is any
+    if token_buffer:
+        elapsed_time = time.time() - start_time  # Calculate elapsed time
+        yield f"{token_buffer} [Inference time: {elapsed_time:.2f} seconds]"
+
+    # Wait for the subprocess to finish if it hasn't already
+    process.wait()
+
+    # Check for any errors
+    if process.returncode != 0:
+        error_message = process.stderr.read()
+        print(f"Error: {error_message}")
+
+
 def custom_generate_response(cust_user_message):
     cust_user_message = CustomPrompts[0] + '\n\n' + cust_user_message
     yield from generate_response(cust_user_message)
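
The function added above streams the llama.cpp output one character at a time and flushes its buffer on every space or newline. Below is a minimal sketch of the same read-loop in isolation, with a slow word-printing subprocess standing in for the llama.cpp binary so it can be tried outside the Space; the stream_words/slow_printer names and the stand-in command are illustrative only, not part of this commit.

import subprocess
import time

def stream_words(cmd):
    """Yield whitespace-delimited chunks from a subprocess as they arrive."""
    # bufsize=1 asks for line buffering on our side; the child still has to flush its own stdout.
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                               text=True, bufsize=1)
    start_time = time.time()
    buffer = ''
    while True:
        char = process.stdout.read(1)              # read one character at a time
        if char == '' and process.poll() is not None:
            break                                  # EOF reached and the process has exited
        buffer += char
        if char in (' ', '\n') and buffer.strip():
            yield f"{buffer} [{time.time() - start_time:.2f}s]"
            buffer = ''
    if buffer.strip():                             # flush whatever is left after EOF
        yield f"{buffer} [{time.time() - start_time:.2f}s]"
    process.wait()
    if process.returncode != 0:
        print(f"Error: {process.stderr.read()}")

# Stand-in for the llama.cpp command: prints one word every half second (illustrative only).
slow_printer = ["python3", "-u", "-c",
                "import time\n"
                "for w in 'streaming one word at a time'.split():\n"
                "    print(w, end=' ', flush=True); time.sleep(0.5)"]

for chunk in stream_words(slow_printer):
    print(chunk)
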
@@ -52,6 +91,7 @@ with gr.Blocks() as iface:
         description="No Message History for now - Enter your message and get a response. (One sentence every 20s)",
         flagging_dir="/usr/src/app/flagged",
     )
+    gr.Interface(fn=generate_response_token_by_token, inputs=gr.Textbox(lines=2, placeholder='Type prompt here...'), outputs="text", description="More Responsive streaming test")
     with gr.Group():
         gr.HTML("Test for wrapping generator (20 seconds a piece of the response)")
         MainOutput = gr.TextArea(placeholder='Output will show here')