mponty committed on
Commit
7f50701
·
verified ·
1 Parent(s): 2d37735

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -3
app.py CHANGED
@@ -4,12 +4,14 @@ import itertools
4
  from word_piece_tokenizer import WordPieceTokenizer
5
  import tiktoken
6
  from transformers import AutoTokenizer
 
7
 
8
  from nltk.tokenize import TreebankWordTokenizer, wordpunct_tokenize, TweetTokenizer
9
 
10
  qwen_tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')
11
  ruadapt_tokenizer = AutoTokenizer.from_pretrained('msu-rcc-lair/RuadaptQwen2.5-32B-instruct')
12
  aya_tokenizer = AutoTokenizer.from_pretrained('mlx-community/aya-expanse-32b-8bit')
 
13
 
14
  ### User Interface ###
15
  st.title("Tokenization")
@@ -30,7 +32,8 @@ tokenizer = st.selectbox(
30
  "Qwen2.5 Tokenizer",
31
  "RuAdapt Tokenizer",
32
  "Aya-Expanse Tokenizer",
33
- "Byte Pair Encoding (Open AI GPT-4o)",
 
34
  ),
35
  index=None,
36
  placeholder="Select a tokenizer",
@@ -140,8 +143,8 @@ elif tokenizer == "Aya-Expanse Tokenizer":
140
  color = itertools.cycle(colors)
141
  st.write(stream_wp_token_ids)
142
 
143
- elif tokenizer == "Byte Pair Encoding (Open AI GPT-4o)":
144
- with st.expander("About Byte Pair Encoding (BPE)"):
145
  st.write(bpe_desc)
146
  encoding = tiktoken.encoding_for_model("gpt-4o")
147
  ids = encoding.encode(txt)
@@ -153,4 +156,14 @@ elif tokenizer == "Byte Pair Encoding (Open AI GPT-4o)":
153
  color = itertools.cycle(colors)
154
  st.write(stream_wp_token_ids)
155
 
 
 
 
 
 
 
 
 
 
 
156
  st.write(num_tokens(txt))
 
4
  from word_piece_tokenizer import WordPieceTokenizer
5
  import tiktoken
6
  from transformers import AutoTokenizer
7
+ from transformers import GPT2TokenizerFast
8
 
9
  from nltk.tokenize import TreebankWordTokenizer, wordpunct_tokenize, TweetTokenizer
10
 
11
  qwen_tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')
12
  ruadapt_tokenizer = AutoTokenizer.from_pretrained('msu-rcc-lair/RuadaptQwen2.5-32B-instruct')
13
  aya_tokenizer = AutoTokenizer.from_pretrained('mlx-community/aya-expanse-32b-8bit')
14
+ claude_tokenizer = GPT2TokenizerFast.from_pretrained('Xenova/claude-tokenizer')
15
 
16
  ### User Interface ###
17
  st.title("Tokenization")
 
32
  "Qwen2.5 Tokenizer",
33
  "RuAdapt Tokenizer",
34
  "Aya-Expanse Tokenizer",
35
+ "Open AI GPT-4o Tokenizer",
36
+ "Anthropic Claude Tokenizer",
37
  ),
38
  index=None,
39
  placeholder="Select a tokenizer",
 
143
  color = itertools.cycle(colors)
144
  st.write(stream_wp_token_ids)
145
 
146
+ elif tokenizer == "Open AI GPT-4o Tokenizer":
147
+ with st.expander("About Open AI GPT-4o Tokenizer"):
148
  st.write(bpe_desc)
149
  encoding = tiktoken.encoding_for_model("gpt-4o")
150
  ids = encoding.encode(txt)
 
156
  color = itertools.cycle(colors)
157
  st.write(stream_wp_token_ids)
158
 
159
+ elif tokenizer == "Anthropic Claude Tokenizer":
160
+ with st.expander("About Anthropic Claude Tokenizer"):
161
+ st.write('')
162
+ ids = claude_tokenizer.encode(txt)
163
+ split_tokens = [claude_tokenizer.decode([t]) for t in ids]
164
+ st.write(stream_data)
165
+ if token_id == True:
166
+ color = itertools.cycle(colors)
167
+ st.write(stream_wp_token_ids)
168
+
169
  st.write(num_tokens(txt))