gojiteji committed on
Commit
d798650
1 Parent(s): 60b6630

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -0
app.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import tiktoken
3
+ import random
4
+
5
# Build one tiktoken encoder per model being compared; both are created once
# at import time and reused by every request.
enc_gpt4o, enc_gpt3_5turbo = map(
    tiktoken.encoding_for_model,
    ("gpt-4o", "gpt-3.5-turbo"),
)
8
+
9
def get_color_mapping(tokens):
    """Assign a random ``#RRGGBB`` color to each distinct token.

    Args:
        tokens: Iterable of hashable tokens; duplicates are collapsed so that
            equal tokens share a single color.

    Returns:
        A dict mapping every distinct token to a string made of ``#`` followed
        by six random uppercase hexadecimal digits.
    """
    hex_digits = '0123456789ABCDEF'
    palette = {}
    # Iterate the de-duplicated tokens and draw six digits per token.
    for token in set(tokens):
        palette[token] = "#" + ''.join(random.choice(hex_digits) for _ in range(6))
    return palette
14
+
15
def process_model(text, encoder, model_name):
    """Render the tokenization of *text* by *encoder* as an HTML fragment.

    Args:
        text: The string to tokenize.
        encoder: Object exposing ``encode(text)`` returning a sequence of
            token ids and ``decode(list_of_ids)`` returning a string.
        model_name: Display name used in the section headings.

    Returns:
        One HTML string made of, in order: a heading with the model name, the
        token count, the colored token texts, and the colored token ids.
        Each id is drawn in the same color as its decoded token text.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    from html import escape

    token_ids = encoder.encode(text)
    # Decode ids one at a time so each token can be displayed separately.
    tokens = [encoder.decode([token_id]) for token_id in token_ids]
    num_tokens = len(tokens)

    color_mapping = get_color_mapping(tokens)

    safe_model_name = escape(model_name)
    modelname_html = f'<h2>{safe_model_name}</h2>'

    # Token text originates from user input: escape it so characters such as
    # '<', '>' and '&' are shown literally instead of being parsed as markup.
    tokens_colored = [
        f'<span style="color:{color_mapping[token]}; font-weight: bold;">{escape(token)}</span>'
        for token in tokens
    ]
    token_ids_colored = [
        f'<span style="color:{color_mapping[token]}; font-weight: bold;">{token_id}</span>'
        for token, token_id in zip(tokens, token_ids)
    ]

    tokens_html = f'<h3>{safe_model_name} Tokens</h3>' + ' '.join(tokens_colored)
    num_tokens_html = f'<h3>Number of Tokens: <span style="font-size: 20px; font-weight: bold;">{num_tokens}</span></h3>'
    token_ids_html = f'<h3>{safe_model_name} Token IDs</h3>' + ' '.join(token_ids_colored)

    return modelname_html + num_tokens_html + tokens_html + token_ids_html
32
+
33
def tokenize_input(text):
    """Build the three HTML panels shown for the current input text.

    Args:
        text: The text entered by the user.

    Returns:
        A 3-tuple of HTML strings: the character count, the GPT-4o
        tokenization, and the GPT-3.5-turbo tokenization.
    """
    # Tokenize with each encoder first, GPT-4o before GPT-3.5-turbo.
    per_model_html = [
        process_model(text, encoder, label)
        for encoder, label in (
            (enc_gpt4o, "GPT-4o"),
            (enc_gpt3_5turbo, "GPT-3.5-turbo"),
        )
    ]
    char_count_html = (
        '<h2>Number of Characters: '
        f'<span style="font-size: 20px; font-weight: bold;">{len(text)}</span></h2>'
    )
    return (char_count_html, *per_model_html)
39
+
40
# Assemble the Gradio Blocks UI.
# NOTE(review): nesting below is reconstructed from a flattened diff view;
# confirm that num_chars_output belongs inside the first row.
with gr.Blocks() as demo:
    gr.Markdown("## GPT Tokenizer Comparison App")

    with gr.Row():
        input_text = gr.Textbox(
            lines=2,
            placeholder="Enter text here...",
            label="Enter text to tokenize and compare results between GPT-4o and GPT-3.5-turbo tokenizers.",
        )
        num_chars_output = gr.HTML()

    with gr.Row():
        gpt4o_output = gr.HTML(label="GPT-4o")
        gpt35turbo_output = gr.HTML(label="GPT-3.5-turbo")

    # Refresh all three panels both on every edit and on explicit submit.
    result_panels = [num_chars_output, gpt4o_output, gpt35turbo_output]
    for register in (input_text.change, input_text.submit):
        register(tokenize_input, inputs=[input_text], outputs=result_panels)

# Start serving the app.
demo.launch()