mponty committed · Commit 96b0faa · verified · Parent: 7f914e8

Update app.py

Files changed (1):
  1. app.py +17 -24
app.py CHANGED
@@ -3,9 +3,13 @@ import streamlit as st
 import itertools
 from word_piece_tokenizer import WordPieceTokenizer
 import tiktoken
+from transformers import AutoTokenizer
 
 from nltk.tokenize import TreebankWordTokenizer, wordpunct_tokenize, TweetTokenizer
 
+qwen_tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')
+ruadapt_tokenizer = AutoTokenizer.from_pretrained('msu-rcc-lair/RuadaptQwen2.5-32B-instruct')
+
 ### User Interface ###
 st.title("Tokenization")
 
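Review note on this hunk: both `AutoTokenizer.from_pretrained` calls sit at module scope, and Streamlit re-executes the whole script on every widget interaction, so the tokenizers get rebuilt on each rerun. A minimal sketch of the usual mitigation, assuming the module-level names stay as committed; `load_tokenizers` is a hypothetical helper, not part of this commit:

```python
import streamlit as st
from transformers import AutoTokenizer

# Sketch: st.cache_resource keeps one tokenizer pair alive across reruns.
# load_tokenizers is a hypothetical helper, not in the committed code.
@st.cache_resource
def load_tokenizers():
    qwen = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')
    ruadapt = AutoTokenizer.from_pretrained('msu-rcc-lair/RuadaptQwen2.5-32B-instruct')
    return qwen, ruadapt

qwen_tokenizer, ruadapt_tokenizer = load_tokenizers()
```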
@@ -22,9 +26,8 @@ tokenizer = st.selectbox(
     "Tokenizer",
     (
         "White Space",
-        "Penn Treebank (NLTK Default)",
-        "Tweet Tokenizer (NLTK)",
-        "WordPiece (BERT)",
+        "Qwen2.5 Tokenizer",
+        "RuAdapt Tokenizer",
         "Byte Pair Encoding (Open AI GPT-4o)",
     ),
     index=None,
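Since the selectbox keeps `index=None`, it starts empty and `tokenizer` is `None` until the user picks an option; the `elif` chain below simply does nothing in that state. If an explicit guard were wanted, a sketch (illustrative only, not in this commit):

```python
# Sketch: stop the rerun early while no tokenizer is selected.
# This guard is illustrative and not part of the committed code.
if tokenizer is None:
    st.info("Pick a tokenizer to begin.")
    st.stop()
```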
@@ -96,31 +99,21 @@ if tokenizer == "White Space":
         unique_tokens = unique_list(split_tokens)
         st.write(stream_token_ids)
 
-elif tokenizer == "Penn Treebank (NLTK Default)":
-    with st.expander("About Penn Treebank Tokenizer"):
-        st.write(treebank_desc)
-    split_tokens = TreebankWordTokenizer().tokenize(txt)
-    st.write(stream_data)
-    if token_id == True:
-        color = itertools.cycle(colors)
-        unique_tokens = unique_list(split_tokens)
-        st.write(stream_token_ids)
-
-elif tokenizer == "Tweet Tokenizer (NLTK)":
-    with st.expander("About Tweet Tokenizer"):
-        st.write(tweet_desc)
-    split_tokens = TweetTokenizer().tokenize(txt)
+elif tokenizer == "Qwen2.5 Tokenizer":
+    with st.expander("About Qwen2.5 Tokenizer"):
+        st.write('')
+    ids = qwen_tokenizer.encode(txt)
+    split_tokens = qwen_tokenizer.tokenize(txt)
     st.write(stream_data)
     if token_id == True:
         color = itertools.cycle(colors)
-        unique_tokens = unique_list(split_tokens)
-        st.write(stream_token_ids)
+        st.write(stream_wp_token_ids)
 
-elif tokenizer == "WordPiece (BERT)":
-    with st.expander("About WordPiece Tokenizer"):
-        st.write(wordpiece_desc)
-    ids = WordPieceTokenizer().tokenize(txt)
-    split_tokens = WordPieceTokenizer().convert_ids_to_tokens(ids)
+elif tokenizer == "RuAdapt Tokenizer":
+    with st.expander("About RuAdapt Tokenizer"):
+        st.write('')
+    ids = ruadapt_tokenizer.encode(txt)
+    split_tokens = ruadapt_tokenizer.tokenize(txt)
     st.write(stream_data)
     if token_id == True:
         color = itertools.cycle(colors)
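One thing worth flagging in the new branches: `encode(txt)` and `tokenize(txt)` are two independent passes, and depending on the tokenizer's configuration `encode` may add special tokens, so `ids` and `split_tokens` can end up with different lengths. A sketch of a tighter pairing using standard `transformers` tokenizer methods (variable names follow the diff):

```python
# Sketch: derive ids and surface tokens from one pass so they stay aligned.
# add_special_tokens=False keeps encode() from injecting special markers.
ids = qwen_tokenizer.encode(txt, add_special_tokens=False)
split_tokens = qwen_tokenizer.convert_ids_to_tokens(ids)
```

The same applies to the RuAdapt branch. Note also that both `st.expander` descriptions are committed as `st.write('')`, so the "About" panels render empty until descriptions are filled in.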
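For checking the two new options outside Streamlit, a standalone sketch; the model IDs come from the diff, while the sample sentence and the expectation that the Russian-adapted vocabulary produces fewer pieces are assumptions, not claims made by this commit:

```python
# Sketch: compare how the stock and Russian-adapted tokenizers split one text.
from transformers import AutoTokenizer

qwen = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')
ruadapt = AutoTokenizer.from_pretrained('msu-rcc-lair/RuadaptQwen2.5-32B-instruct')

text = "Токенизация разбивает текст на токены."
for name, tok in (("Qwen2.5", qwen), ("RuAdapt", ruadapt)):
    pieces = tok.tokenize(text)
    print(f"{name}: {len(pieces)} tokens -> {pieces}")
```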