mponty committed on
Commit
18b414e
·
verified ·
1 Parent(s): b3d9d60

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -0
app.py CHANGED
@@ -12,6 +12,10 @@ qwen_tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')
12
  ruadapt_tokenizer = AutoTokenizer.from_pretrained('msu-rcc-lair/RuadaptQwen2.5-32B-instruct')
13
  aya_tokenizer = AutoTokenizer.from_pretrained('mlx-community/aya-expanse-32b-8bit')
14
  claude_tokenizer = GPT2TokenizerFast.from_pretrained('Xenova/claude-tokenizer')
 
 
 
 
15
 
16
  ### User Interface ###
17
  st.title("Tokenization")
@@ -34,6 +38,8 @@ tokenizer = st.selectbox(
34
  "Aya-Expanse Tokenizer",
35
  "Open AI GPT-4o Tokenizer",
36
  "Anthropic Claude Tokenizer",
 
 
37
  ),
38
  index=None,
39
  placeholder="Select a tokenizer",
@@ -166,4 +172,25 @@ elif tokenizer == "Anthropic Claude Tokenizer":
166
  color = itertools.cycle(colors)
167
  st.write(stream_wp_token_ids)
168
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  st.write(num_tokens(txt))
 
12
  ruadapt_tokenizer = AutoTokenizer.from_pretrained('msu-rcc-lair/RuadaptQwen2.5-32B-instruct')
13
  aya_tokenizer = AutoTokenizer.from_pretrained('mlx-community/aya-expanse-32b-8bit')
14
  claude_tokenizer = GPT2TokenizerFast.from_pretrained('Xenova/claude-tokenizer')
15
+ xlmv_tokenizer = AutoTokenizer.from_pretrained('facebook/xlm-v-base')
16
+ nllb_tokenizer = AutoTokenizer.from_pretrained('facebook/nllb-200-distilled-600M')
17
+
18
+
19
 
20
  ### User Interface ###
21
  st.title("Tokenization")
 
38
  "Aya-Expanse Tokenizer",
39
  "Open AI GPT-4o Tokenizer",
40
  "Anthropic Claude Tokenizer",
41
+ "XLM-V Tokenizer",
42
+ "NLLB-200 Tokenizer",
43
  ),
44
  index=None,
45
  placeholder="Select a tokenizer",
 
172
  color = itertools.cycle(colors)
173
  st.write(stream_wp_token_ids)
174
 
175
+ elif tokenizer == "XLM-V Tokenizer":
176
+ with st.expander("About XLM-V Tokenizer"):
177
+ st.write('')
178
+ ids = xlmv_tokenizer.encode(txt)
179
+ split_tokens = [xlmv_tokenizer.decode([t]) for t in ids]
180
+ st.write(stream_data)
181
+ if token_id == True:
182
+ color = itertools.cycle(colors)
183
+ st.write(stream_wp_token_ids)
184
+
185
+
186
+ elif tokenizer == "NLLB-200 Tokenizer":
187
+ with st.expander("About NLLB-200 Tokenizer"):
188
+ st.write('')
189
+ ids = nllb_tokenizer.encode(txt)
190
+ split_tokens = [nllb_tokenizer.decode([t]) for t in ids]
191
+ st.write(stream_data)
192
+ if token_id == True:
193
+ color = itertools.cycle(colors)
194
+ st.write(stream_wp_token_ids)
195
+
196
  st.write(num_tokens(txt))