Create app.py
app.py
ADDED
@@ -0,0 +1,54 @@
import gradio as gr
import tiktoken
import random

# Load the tokenizers
enc_gpt4o = tiktoken.encoding_for_model("gpt-4o")
enc_gpt3_5turbo = tiktoken.encoding_for_model("gpt-3.5-turbo")

# Assign a random hex color to each distinct token so matching tokens and IDs share a color
def get_color_mapping(tokens):
    unique_tokens = list(set(tokens))
    colors = ["#" + ''.join(random.choice('0123456789ABCDEF') for _ in range(6)) for _ in unique_tokens]
    color_mapping = dict(zip(unique_tokens, colors))
    return color_mapping

# Tokenize the text with the given encoder and build an HTML summary for that model
def process_model(text, encoder, model_name):
    token_ids = encoder.encode(text)
    tokens = [encoder.decode([token_id]) for token_id in token_ids]
    num_tokens = len(tokens)

    color_mapping = get_color_mapping(tokens)

    modelname_html = f'<h2>{model_name}</h2>'

    tokens_colored = [f'<span style="color:{color_mapping[token]}; font-weight: bold;">{token}</span>' for token in tokens]
    token_ids_colored = [f'<span style="color:{color_mapping[token]}; font-weight: bold;">{token_id}</span>' for token, token_id in zip(tokens, token_ids)]

    tokens_html = f'<h3>{model_name} Tokens</h3>' + ' '.join(tokens_colored)
    num_tokens_html = f'<h3>Number of Tokens: <span style="font-size: 20px; font-weight: bold;">{num_tokens}</span></h3>'
    token_ids_html = f'<h3>{model_name} Token IDs</h3>' + ' '.join(token_ids_colored)

    return modelname_html + num_tokens_html + tokens_html + token_ids_html

# Run both tokenizers on the input and report the character count alongside each result
def tokenize_input(text):
    gpt4o_result = process_model(text, enc_gpt4o, "GPT-4o")
    gpt35turbo_result = process_model(text, enc_gpt3_5turbo, "GPT-3.5-turbo")
    num_chars = len(text)
    num_chars_html = f'<h2>Number of Characters: <span style="font-size: 20px; font-weight: bold;">{num_chars}</span></h2>'
    return num_chars_html, gpt4o_result, gpt35turbo_result

# Create the Gradio interface using Blocks
with gr.Blocks() as demo:
    gr.Markdown("## GPT Tokenizer Comparison App")
    with gr.Row():
        input_text = gr.Textbox(lines=2, placeholder="Enter text here...", label="Enter text to tokenize and compare results between the GPT-4o and GPT-3.5-turbo tokenizers.")
        num_chars_output = gr.HTML()
    with gr.Row():
        gpt4o_output = gr.HTML(label="GPT-4o")
        gpt35turbo_output = gr.HTML(label="GPT-3.5-turbo")

    input_text.change(tokenize_input, inputs=[input_text], outputs=[num_chars_output, gpt4o_output, gpt35turbo_output])
    input_text.submit(tokenize_input, inputs=[input_text], outputs=[num_chars_output, gpt4o_output, gpt35turbo_output])

# Launch the app
demo.launch()
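
For a quick sanity check of what process_model computes, the tokenizer step can be exercised directly in a Python shell. This is a rough sketch rather than part of the app: it assumes tiktoken is installed and that the installed release maps the "gpt-4o" model name, and the sample sentence is arbitrary.

import tiktoken

enc = tiktoken.encoding_for_model("gpt-4o")                      # GPT-4o's encoding
ids = enc.encode("Tokenizers split text into subword pieces.")   # token IDs for the sample string
pieces = [enc.decode([i]) for i in ids]                          # decode each ID back to its text piece
print(len(ids))                                                  # token count, as the app reports
print(list(zip(pieces, ids)))                                    # (token, ID) pairs that the app colors

Running python app.py launches the interface on Gradio's default local server; the two HTML panels refresh on every change to the textbox as well as on submit.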