Update app.py
app.py CHANGED
@@ -12,6 +12,10 @@ qwen_tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')
 ruadapt_tokenizer = AutoTokenizer.from_pretrained('msu-rcc-lair/RuadaptQwen2.5-32B-instruct')
 aya_tokenizer = AutoTokenizer.from_pretrained('mlx-community/aya-expanse-32b-8bit')
 claude_tokenizer = GPT2TokenizerFast.from_pretrained('Xenova/claude-tokenizer')
+xlmv_tokenizer = AutoTokenizer.from_pretrained('facebook/xlm-v-base')
+nllb_tokenizer = AutoTokenizer.from_pretrained('facebook/nllb-200-distilled-600M')
+
+
 
 ### User Interface ###
 st.title("Tokenization")
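Note (not part of the commit): both checkpoints added above ship SentencePiece-based tokenizers and can be smoke-tested outside the app. A minimal sketch, assuming transformers and sentencepiece are installed and using an arbitrary sample string:

from transformers import AutoTokenizer

# Standalone check of the two tokenizers loaded in the hunk above
# (same Hub ids as the diff).
xlmv_tokenizer = AutoTokenizer.from_pretrained('facebook/xlm-v-base')
nllb_tokenizer = AutoTokenizer.from_pretrained('facebook/nllb-200-distilled-600M')

sample = "Tokenization splits text into subword units."
for name, tok in (("XLM-V", xlmv_tokenizer), ("NLLB-200", nllb_tokenizer)):
    ids = tok.encode(sample)  # text -> token ids
    print(name, len(ids), tok.convert_ids_to_tokens(ids)[:8])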
@@ -34,6 +38,8 @@ tokenizer = st.selectbox(
         "Aya-Expanse Tokenizer",
         "Open AI GPT-4o Tokenizer",
         "Anthropic Claude Tokenizer",
+        "XLM-V Tokenizer",
+        "NLLB-200 Tokenizer",
     ),
     index=None,
     placeholder="Select a tokenizer",
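Note (not part of the commit): the two option labels added here must match, character for character, the strings compared in the new elif branches further down. A minimal sketch of that selectbox-to-branch pattern, where everything except the two labels is assumed for illustration:

import streamlit as st

# The label chosen in the selectbox is compared verbatim in the branches.
tokenizer = st.selectbox(
    "Choose a tokenizer",
    (
        "XLM-V Tokenizer",
        "NLLB-200 Tokenizer",
    ),
    index=None,  # no default selection
    placeholder="Select a tokenizer",
)
if tokenizer == "XLM-V Tokenizer":
    st.write("XLM-V selected")
elif tokenizer == "NLLB-200 Tokenizer":
    st.write("NLLB-200 selected")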
@@ -166,4 +172,25 @@ elif tokenizer == "Anthropic Claude Tokenizer":
         color = itertools.cycle(colors)
         st.write(stream_wp_token_ids)
 
+elif tokenizer == "XLM-V Tokenizer":
+    with st.expander("About XLM-V Tokenizer"):
+        st.write('')
+    ids = xlmv_tokenizer.encode(txt)
+    split_tokens = [xlmv_tokenizer.decode([t]) for t in ids]
+    st.write(stream_data)
+    if token_id == True:
+        color = itertools.cycle(colors)
+        st.write(stream_wp_token_ids)
+
+
+elif tokenizer == "NLLB-200 Tokenizer":
+    with st.expander("About NLLB-200 Tokenizer"):
+        st.write('')
+    ids = nllb_tokenizer.encode(txt)
+    split_tokens = [nllb_tokenizer.decode([t]) for t in ids]
+    st.write(stream_data)
+    if token_id == True:
+        color = itertools.cycle(colors)
+        st.write(stream_wp_token_ids)
+
 st.write(num_tokens(txt))
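Note (not part of the commit): both new branches follow the app's existing pattern of encoding txt and then decoding each id separately to recover the visible subword pieces. The same step in isolation, with the sample text assumed:

from transformers import AutoTokenizer

# Encode, then decode one id at a time, as the new branches do.
nllb_tokenizer = AutoTokenizer.from_pretrained('facebook/nllb-200-distilled-600M')
txt = "Tokenizers differ in how they split rare words."
ids = nllb_tokenizer.encode(txt)
split_tokens = [nllb_tokenizer.decode([t]) for t in ids]
print(list(zip(ids, split_tokens)))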