# Hugging Face Spaces app: Tokens per Second Simulator
import gradio as gr
import time
from transformers import AutoTokenizer

# The Mistral tokenizer is used only to split the echoed text into
# realistic LLM tokens; no model weights are loaded.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

# Default text the bot echoes back (abstract of Vernor Vinge's 1993
# essay "The Coming Technological Singularity").
starter_text = """# Abstract
Within thirty years, we will have the technological means to create superhuman intelligence. Shortly after,
the human era will be ended.
Is such progress avoidable? If not to be avoided, can events be guided so that we may survive? These questions
are investigated. Some possible answers (and some further dangers) are presented.
"""
def calculate_wait_seconds(tokens_per_second):
    """Return the delay in seconds between successive tokens.

    E.g. 8 tokens/second -> 0.125 s per token.
    """
    return 1.0 / tokens_per_second
def get_tokens(prompt):
    """Tokenize *prompt* and map SentencePiece markers back to plain text.

    The Mistral tokenizer renders a leading space as '▁' and a newline
    as the byte token '<0x0A>'; both are restored so the pieces can be
    concatenated into readable streamed output.
    """
    pieces = []
    for piece in tokenizer.tokenize(prompt):
        pieces.append(piece.replace('▁', ' ').replace('<0x0A>', '\n'))
    return pieces
def echo(message, history, prompt, tokens_per_second, time_to_first_token, stream):
    """Echo *prompt* back token by token at a simulated generation speed.

    Chat handler for gr.ChatInterface: *message* and *history* are
    supplied by Gradio but ignored — the response is always *prompt*.
    Yields the growing partial response after each token when *stream*
    is truthy, otherwise yields the full response once at the end.
    """
    delay = calculate_wait_seconds(tokens_per_second)
    pieces = get_tokens(f"{prompt}")
    if time_to_first_token:
        # Slider value is in milliseconds.
        time.sleep(time_to_first_token / 1000)
    accumulated = ""
    for piece in pieces:
        # Sleep before the filter so skipped tokens still count toward
        # the simulated pacing.
        time.sleep(delay)
        if '<' in piece:
            # Gradio chat chokes on HTML-like elements
            continue
        accumulated += piece
        if stream:
            yield accumulated
    if not stream:
        yield accumulated
with gr.Blocks(title='Tokens per Second Simulator') as demo:
    # Static header and background reading.
    gr.Markdown('# ⏱️ Tokens per Second Simulator')
    gr.Markdown('Compare the feel of different response speeds for a chat bot')
    gr.Markdown('Reading speeds vary but in English 5-10 tokens per second is considered normal reading speed')
    gr.Markdown(
        'References for further research:\n'
        '- https://www.perplexity.ai/search/How-many-tokens-1d7VyXCDQuWf3pJnK4.0iw?s=c\n'
        '- https://www.databricks.com/blog/llm-inference-performance-engineering-best-practices\n'
        '- https://news.ycombinator.com/item?id=35978864\n'
        '- https://www.reddit.com/r/LocalLLaMA/comments/162pgx9/what_do_yall_consider_acceptable_tokens_per/')

    # Controls forwarded to `echo`; the list order must match the extra
    # parameters in echo's signature (prompt, tps, ttft, stream).
    prompt = gr.Textbox(starter_text, label="Prompt to Echo")
    tps_slider = gr.Slider(1, 50, render=True, value=8, label='Tokens per second (TPS)')
    ttft_slider = gr.Slider(
        0, 5000, render=True, value=0,
        label='Time to first token (TTFT) in milliseconds')
    stream_checkbox = gr.Checkbox(label='Stream Response', value=True)

    speed_controls = [prompt, tps_slider, ttft_slider, stream_checkbox]
    gr.ChatInterface(
        echo,
        additional_inputs=speed_controls,
        description='Submit any text to echo the prompt above at the selected speed.')

# Queueing is required because `echo` is a (potentially slow) generator.
demo.queue().launch()