kwabs22 commited on
Commit
d9e0520
1 Parent(s): 03936f4

After the bufsize=1 change, exploring word- or token-level streaming

Browse files
Files changed (1) hide show
  1. app.py +40 -0
app.py CHANGED
@@ -34,6 +34,45 @@ def generate_response(user_message): #Figure Out the parameters later and find a
34
  print(f"Error: {error_message}")
35
 
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
def custom_generate_response(cust_user_message):
    """Prefix the message with the first custom prompt, then stream the reply."""
    prompted_message = f"{CustomPrompts[0]}\n\n{cust_user_message}"
    yield from generate_response(prompted_message)
@@ -52,6 +91,7 @@ with gr.Blocks() as iface:
52
  description="No Message History for now - Enter your message and get a response. (One sentence every 20s)",
53
  flagging_dir="/usr/src/app/flagged",
54
  )
 
55
  with gr.Group():
56
  gr.HTML("Test for wrapping generator (20 seconds a piece of the response)")
57
  MainOutput = gr.TextArea(placeholder='Output will show here')
 
34
  print(f"Error: {error_message}")
35
 
36
 
37
def generate_response_token_by_token(user_message):
    """Stream llama.cpp output token-by-token as it is generated.

    Launches the llama.cpp binary as a subprocess and yields each
    whitespace-delimited token as soon as it is read from stdout,
    annotated with the elapsed inference time so far.

    Args:
        user_message: Prompt text passed to llama.cpp via ``-p``.

    Yields:
        str: ``"<token> [Inference time: X.XX seconds]"`` per token.
    """
    import threading  # local import: only needed for the stderr drain below

    cmd = [
        "/app/llama.cpp/main",  # Path to the executable
        "-m", "/app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
        "-p", user_message,
        "-n", "400",  # cap on generated tokens
        "-e",         # process escape sequences in the prompt
    ]

    # bufsize=1 => line-buffered writes on our side; reading stays
    # char-by-char below so tokens surface as soon as they are emitted.
    process = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        bufsize=1,
    )

    # Drain stderr on a background thread. llama.cpp logs to stderr;
    # if the stderr pipe's OS buffer fills while we are busy streaming
    # stdout, the child blocks on its next stderr write and the whole
    # stream deadlocks. Capturing it here also keeps the error text
    # available for the returncode check at the end.
    stderr_chunks = []
    stderr_thread = threading.Thread(
        target=lambda: stderr_chunks.append(process.stderr.read()),
        daemon=True,
    )
    stderr_thread.start()

    start_time = time.time()
    token_buffer = ''
    while True:
        # Read one character at a time so we never wait on a full line.
        char = process.stdout.read(1)
        if char == '' and process.poll() is not None:
            break  # EOF and the child has exited
        if char != '':
            token_buffer += char
            if char == ' ' or char == '\n':  # Token delimiters
                elapsed_time = time.time() - start_time  # Calculate elapsed time
                yield f"{token_buffer} [Inference time: {elapsed_time:.2f} seconds]"
                token_buffer = ''  # Reset token buffer

    # Yield the last token if there is any
    if token_buffer:
        elapsed_time = time.time() - start_time  # Calculate elapsed time
        yield f"{token_buffer} [Inference time: {elapsed_time:.2f} seconds]"

    # Wait for the subprocess to finish if it hasn't already
    process.wait()
    stderr_thread.join()

    # Check for any errors (stderr text was captured by the drain thread)
    if process.returncode != 0:
        error_message = ''.join(stderr_chunks)
        print(f"Error: {error_message}")
76
def custom_generate_response(cust_user_message):
    """Stream a response for the user message prefixed with CustomPrompts[0]."""
    combined = CustomPrompts[0] + '\n\n' + cust_user_message
    for piece in generate_response(combined):
        yield piece
 
91
  description="No Message History for now - Enter your message and get a response. (One sentence every 20s)",
92
  flagging_dir="/usr/src/app/flagged",
93
  )
94
+ gr.Interface(fn=generate_response_token_by_token, inputs=gr.Textbox(lines=2, placeholder='Type prompt here...'), outputs="text", description="More Responsive streaming test")
95
  with gr.Group():
96
  gr.HTML("Test for wrapping generator (20 seconds a piece of the response)")
97
  MainOutput = gr.TextArea(placeholder='Output will show here')