mponty committed on
Commit
8456741
·
verified ·
1 Parent(s): 5d72c5f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -0
app.py CHANGED
@@ -9,6 +9,7 @@ from nltk.tokenize import TreebankWordTokenizer, wordpunct_tokenize, TweetTokeni
9
 
10
  qwen_tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')
11
  ruadapt_tokenizer = AutoTokenizer.from_pretrained('msu-rcc-lair/RuadaptQwen2.5-32B-instruct')
 
12
 
13
  ### User Interface ###
14
  st.title("Tokenization")
@@ -128,6 +129,16 @@ elif tokenizer == "RuAdapt Tokenizer":
128
  color = itertools.cycle(colors)
129
  st.write(stream_wp_token_ids)
130
 
 
 
 
 
 
 
 
 
 
 
131
  elif tokenizer == "Byte Pair Encoding (Open AI GPT-4o)":
132
  with st.expander("About Byte Pair Encoding (BPE)"):
133
  st.write(bpe_desc)
 
9
 
10
  qwen_tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')
11
  ruadapt_tokenizer = AutoTokenizer.from_pretrained('msu-rcc-lair/RuadaptQwen2.5-32B-instruct')
12
+ aya_tokenizer = AutoTokenizer.from_pretrained('mlx-community/aya-expanse-32b-8bit')
13
 
14
  ### User Interface ###
15
  st.title("Tokenization")
 
129
  color = itertools.cycle(colors)
130
  st.write(stream_wp_token_ids)
131
 
132
+ elif tokenizer == "Aya-Expanse Tokenizer":
133
+ with st.expander("About Aya-Expanse Tokenizer"):
134
+ st.write('')
135
+ ids = aya_tokenizer.encode(txt)
136
+ split_tokens = [aya_tokenizer.decode([t]) for t in ids]
137
+ st.write(stream_data)
138
+ if token_id == True:
139
+ color = itertools.cycle(colors)
140
+ st.write(stream_wp_token_ids)
141
+
142
  elif tokenizer == "Byte Pair Encoding (Open AI GPT-4o)":
143
  with st.expander("About Byte Pair Encoding (BPE)"):
144
  st.write(bpe_desc)