Jordan Legg committed on
Commit a71870f
1 Parent(s): b39e76c

working great!

Files changed (2)
  1. app.py +67 -19
  2. requirements.txt +2 -1
app.py CHANGED
@@ -4,27 +4,75 @@ from transformers import T5TokenizerFast, CLIPTokenizer
 def count_tokens(text):
     # Load the common tokenizers
     t5_tokenizer = T5TokenizerFast.from_pretrained("google/t5-v1_1-xxl", legacy=False)
-    clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+    clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
 
-    # Get token counts directly using the encode method
-    t5_count = len(t5_tokenizer.encode(text))
-    clip_count = len(clip_tokenizer.encode(text))
+    # Get tokens and their IDs
+    t5_tokens = t5_tokenizer.encode(text, return_tensors="pt")[0].tolist()
+    clip_tokens = clip_tokenizer.encode(text)
 
-    return f"T5: {t5_count} tokens", f"CLIP: {clip_count} tokens"
+    # Decode individual tokens for display, replacing whitespace with visible characters
+    t5_decoded = []
+    for token in t5_tokens:
+        decoded = t5_tokenizer.decode([token])
+        # Replace whitespace with visible characters and empty strings with special markers
+        if decoded.isspace():
+            decoded = "␣"  # visible space marker
+        elif decoded == "":
+            decoded = "∅"  # empty token marker
+        t5_decoded.append(decoded)
+
+    clip_decoded = []
+    for token in clip_tokens:
+        decoded = clip_tokenizer.decode([token])
+        if decoded.isspace():
+            decoded = "␣"
+        elif decoded == "":
+            decoded = "∅"
+        clip_decoded.append(decoded)
+
+    # Create highlighted text tuples (text, label)
+    t5_highlights = [(token, f"Token {i}") for i, token in enumerate(t5_decoded)]
+    clip_highlights = [(token, f"Token {i}") for i, token in enumerate(clip_decoded)]
+
+    return (
+        # T5 outputs
+        len(t5_tokens),
+        t5_highlights,
+        str(t5_tokens),
+        # CLIP outputs
+        len(clip_tokens),
+        clip_highlights,
+        str(clip_tokens)
+    )
 
-# Create a Gradio interface
-iface = gr.Interface(
-    fn=count_tokens,
-    inputs=[
-        gr.Textbox(label="Text", placeholder="Enter text here...")
-    ],
-    outputs=[
-        gr.Textbox(label="T5 Tokenizer"),
-        gr.Textbox(label="CLIP Tokenizer")
-    ],
-    title="Common Diffusion Model Token Counter",
-    description="Enter text to count tokens using T5 and CLIP tokenizers, commonly used in diffusion models."
-)
+# Create a Gradio interface with custom layout
+with gr.Blocks(title="Common Diffusion Model Token Counter") as iface:
+    gr.Markdown("# Common Diffusion Model Token Counter")
+    gr.Markdown("Enter text to count tokens using T5 and CLIP tokenizers, commonly used in diffusion models.")
+
+    with gr.Row():
+        text_input = gr.Textbox(label="Diffusion Prompt", placeholder="Enter your prompt here...")
+
+    with gr.Row():
+        # T5 Column
+        with gr.Column():
+            gr.Markdown("### T5 Tokenizer Results")
+            t5_count = gr.Number(label="T5 Token Count")
+            t5_highlights = gr.HighlightedText(label="T5 Tokens", show_legend=True)
+            t5_ids = gr.Textbox(label="T5 Token IDs", lines=2)
+
+        # CLIP Column
+        with gr.Column():
+            gr.Markdown("### CLIP Tokenizer Results")
+            clip_count = gr.Number(label="CLIP Token Count")
+            clip_highlights = gr.HighlightedText(label="CLIP Tokens", show_legend=True)
+            clip_ids = gr.Textbox(label="CLIP Token IDs", lines=2)
+
+    text_input.change(
+        fn=count_tokens,
+        inputs=[text_input],
+        outputs=[t5_count, t5_highlights, t5_ids, clip_count, clip_highlights, clip_ids]
+    )
 
 # Launch the app
-iface.launch()
+iface.launch(show_error=True)
requirements.txt CHANGED
@@ -1,4 +1,5 @@
 gradio
 transformers
 protobuf
-sentencepiece
+sentencepiece
+torch
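
The new torch entry is presumably what backs the return_tensors="pt" call added to count_tokens. For reference, a minimal sketch of the same token counting outside the Gradio UI; the prompt below is only an illustration, and it assumes the two checkpoints referenced in app.py can be downloaded and that the packages in requirements.txt are installed:

    from transformers import T5TokenizerFast, CLIPTokenizer

    # Same tokenizers the Space loads in app.py
    t5_tokenizer = T5TokenizerFast.from_pretrained("google/t5-v1_1-xxl", legacy=False)
    clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")

    prompt = "a photo of an astronaut riding a horse"  # illustrative prompt, not from the commit
    t5_tokens = t5_tokenizer.encode(prompt, return_tensors="pt")[0].tolist()  # needs torch installed
    clip_tokens = clip_tokenizer.encode(prompt)

    # Counts include the special tokens each tokenizer appends (</s> for T5,
    # start/end-of-text for CLIP), so they should line up with what the Space reports.
    print(f"T5: {len(t5_tokens)} tokens, CLIP: {len(clip_tokens)} tokens")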