gstaff committed
Commit ad27c4b
1 Parent(s): 882c1d9

Initial commit.

Files changed (2)
  1. README.md +1 -1
  2. app.py +66 -0
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 title: Token Per Second Simulator
-emoji: 🔥
+emoji: ⏱️
 colorFrom: purple
 colorTo: purple
 sdk: gradio
app.py ADDED
@@ -0,0 +1,66 @@
+import gradio as gr
+import time
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
+
+starter_text = """# Abstract
+Within thirty years, we will have the technological means to create superhuman intelligence. Shortly after,
+the human era will be ended.
+Is such progress avoidable? If not to be avoided, can events be guided so that we may survive? These questions
+are investigated. Some possible answers (and some further dangers) are presented.
+"""
+
+
+def calculate_wait_seconds(tokens_per_second):
+    return 1 / tokens_per_second
+
+
+def get_tokens(prompt):
+    tokens = tokenizer.tokenize(prompt)
+    return [x.replace('▁', ' ').replace('<0x0A>', '\n') for x in tokens]
+
+
+def echo(message, history, prompt, tokens_per_second, time_to_first_token, stream):
+    wait_seconds = calculate_wait_seconds(tokens_per_second)
+
+    response = f"{prompt}"
+    tokens = get_tokens(response)
+
+    if time_to_first_token:
+        time.sleep(time_to_first_token / 1000)
+    partial_message = ""
+    for new_token in tokens:
+        time.sleep(wait_seconds)
+        if '<' in new_token:
+            # Gradio chat chokes on HTML-like elements
+            continue
+        partial_message += str(new_token)
+        if stream:
+            yield partial_message
+
+    if not stream:
+        yield partial_message
+
+
+with gr.Blocks(title='Tokens per Second Simulator') as demo:
+    gr.Markdown('# ⏱️ Tokens per Second Simulator')
+    gr.Markdown('Compare the feel of different response speeds for a chat bot')
+    gr.Markdown('Reading speeds vary but in English 5-10 tokens per second is considered normal reading speed')
+    gr.Markdown(
+        'References for further research:\n'
+        '- https://www.perplexity.ai/search/How-many-tokens-1d7VyXCDQuWf3pJnK4.0iw?s=c\n'
+        '- https://www.databricks.com/blog/llm-inference-performance-engineering-best-practices\n'
+        '- https://news.ycombinator.com/item?id=35978864\n'
+        '- https://www.reddit.com/r/LocalLLaMA/comments/162pgx9/what_do_yall_consider_acceptable_tokens_per/')
+
+    prompt = gr.Textbox(starter_text, label="Prompt to Echo")
+    tps_slider = gr.Slider(1, 50, render=True, value=8, label='Tokens per second (TPS)')
+    ttft_slider = gr.Slider(0, 5000, render=True, value=0,
+                            label='Time to first token (TTFT) in milliseconds')
+    stream_checkbox = gr.Checkbox(label='Stream Response', value=True)
+
+    gr.ChatInterface(echo, additional_inputs=[prompt, tps_slider, ttft_slider, stream_checkbox],
+                     description='Submit any text to echo the prompt above at the selected speed.')
+
+demo.queue().launch()
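As a quick sanity check of the timing model this commit implements (an optional time-to-first-token delay, then one `1 / tokens_per_second` sleep per token), here is a minimal standalone sketch; the helper `estimate_response_seconds` is hypothetical and not part of the commit:

```python
def estimate_response_seconds(num_tokens: int, tokens_per_second: float, ttft_ms: float = 0) -> float:
    # Mirrors the app's timing: TTFT delay (milliseconds) plus one 1/TPS wait per token.
    return ttft_ms / 1000 + num_tokens / tokens_per_second

# At the default 8 TPS with a 500 ms TTFT, a 60-token echo takes about 8 seconds.
print(estimate_response_seconds(60, 8, ttft_ms=500))  # 8.0
```

At the slider extremes, the same 60-token echo ranges from 60 s at 1 TPS down to 1.2 s at 50 TPS, which is the contrast in feel the Space is built to demonstrate.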