# Source: Hugging Face Space by gstaff — "Initial commit." (commit ad27c4b)
import gradio as gr
import time
from transformers import AutoTokenizer
# Tokenizer used only to split text into realistic LLM tokens for the
# playback simulation (downloads the Mistral-7B vocab on first run).
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
# Default text echoed back by the chat bot (opening of Vinge's
# "The Coming Technological Singularity" abstract).
starter_text = """# Abstract
Within thirty years, we will have the technological means to create superhuman intelligence. Shortly after,
the human era will be ended.
Is such progress avoidable? If not to be avoided, can events be guided so that we may survive? These questions
are investigated. Some possible answers (and some further dangers) are presented.
"""
def calculate_wait_seconds(tokens_per_second):
    """Return the pause, in seconds, between emitted tokens for a given rate."""
    return 1 / tokens_per_second
def get_tokens(prompt):
    """Tokenize *prompt* and map sentencepiece markers back to readable text.

    The Mistral tokenizer renders spaces as '▁' and newlines as the byte
    token '<0x0A>'; both are converted back so the echoed stream reads
    like the original prompt.
    """
    cleaned = []
    for piece in tokenizer.tokenize(prompt):
        cleaned.append(piece.replace('▁', ' ').replace('<0x0A>', '\n'))
    return cleaned
def echo(message, history, prompt, tokens_per_second, time_to_first_token, stream):
    """Chat handler: echo *prompt* back token by token at a simulated speed.

    Generator used by gr.ChatInterface. Sleeps `time_to_first_token`
    milliseconds before the first token, then one inter-token delay per
    token. When *stream* is true the growing message is yielded after each
    token; otherwise the full message is yielded once at the end.
    """
    delay = calculate_wait_seconds(tokens_per_second)
    reply = f"{prompt}"
    if time_to_first_token:
        # Slider value is in milliseconds; time.sleep wants seconds.
        time.sleep(time_to_first_token / 1000)
    accumulated = ""
    for token in get_tokens(reply):
        time.sleep(delay)
        # Gradio chat chokes on HTML-like elements, so drop such tokens.
        if '<' not in token:
            accumulated += str(token)
            if stream:
                yield accumulated
    if not stream:
        yield accumulated
# Build the UI: explanatory text, speed controls, and the chat widget that
# replays the prompt through `echo` at the configured rate.
with gr.Blocks(title='Tokens per Second Simulator') as demo:
    # Header and background reading on typical token rates.
    gr.Markdown('# ⏱️ Tokens per Second Simulator')
    gr.Markdown('Compare the feel of different response speeds for a chat bot')
    gr.Markdown('Reading speeds vary but in English 5-10 tokens per second is considered normal reading speed')
    gr.Markdown(
        'References for further research:\n'
        '- https://www.perplexity.ai/search/How-many-tokens-1d7VyXCDQuWf3pJnK4.0iw?s=c\n'
        '- https://www.databricks.com/blog/llm-inference-performance-engineering-best-practices\n'
        '- https://news.ycombinator.com/item?id=35978864\n'
        '- https://www.reddit.com/r/LocalLLaMA/comments/162pgx9/what_do_yall_consider_acceptable_tokens_per/')

    # Controls wired into `echo` as additional inputs (order must match
    # echo's prompt/tokens_per_second/time_to_first_token/stream params).
    echo_prompt = gr.Textbox(starter_text, label="Prompt to Echo")
    speed_slider = gr.Slider(1, 50, render=True, value=8, label='Tokens per second (TPS)')
    first_token_slider = gr.Slider(0, 5000, render=True, value=0,
                                   label='Time to first token (TTFT) in milliseconds')
    stream_toggle = gr.Checkbox(label='Stream Response', value=True)

    gr.ChatInterface(echo, additional_inputs=[echo_prompt, speed_slider, first_token_slider, stream_toggle],
                     description='Submit any text to echo the prompt above at the selected speed.')

# Queueing is required for generator (streaming) handlers.
demo.queue().launch()