Update app.py

Replaces the WordPiece (BERT) and NLTK-based tokenizer options with Qwen2.5 and RuAdapt tokenizers loaded through transformers.AutoTokenizer.
app.py CHANGED
@@ -3,9 +3,13 @@ import streamlit as st
 import itertools
 from word_piece_tokenizer import WordPieceTokenizer
 import tiktoken
+from transformers import AutoTokenizer
 
 from nltk.tokenize import TreebankWordTokenizer, wordpunct_tokenize, TweetTokenizer
 
+qwen_tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')
+ruadapt_tokenizer = AutoTokenizer.from_pretrained('msu-rcc-lair/RuadaptQwen2.5-32B-instruct')
+
 ### User Interface ###
 st.title("Tokenization")
 
@@ -22,9 +26,8 @@ tokenizer = st.selectbox(
     "Tokenizer",
     (
         "White Space",
-        "
-        "
-        "WordPiece (BERT)",
+        "Qwen2.5 Tokenizer",
+        "RuAdapt Tokenizer",
         "Byte Pair Encoding (Open AI GPT-4o)",
     ),
     index=None,
@@ -96,31 +99,21 @@ if tokenizer == "White Space":
         unique_tokens = unique_list(split_tokens)
         st.write(stream_token_ids)
 
-elif tokenizer == "
-    with st.expander("About
-        st.write(
-
-
-    if token_id == True:
-        color = itertools.cycle(colors)
-        unique_tokens = unique_list(split_tokens)
-        st.write(stream_token_ids)
-
-elif tokenizer == "Tweet Tokenizer (NLTK)":
-    with st.expander("About Tweet Tokenizer"):
-        st.write(tweet_desc)
-    split_tokens = TweetTokenizer().tokenize(txt)
+elif tokenizer == "Qwen2.5 Tokenizer":
+    with st.expander("About Qwen2.5 Tokenizer"):
+        st.write('')
+    ids = qwen_tokenizer.encode(txt)
+    split_tokens = qwen_tokenizer.tokenize(txt)
     st.write(stream_data)
     if token_id == True:
         color = itertools.cycle(colors)
-
-        st.write(stream_token_ids)
+        st.write(stream_wp_token_ids)
 
-elif tokenizer == "
-    with st.expander("About
-        st.write(
-        ids =
-        split_tokens =
+elif tokenizer == "RuAdapt Tokenizer":
+    with st.expander("About RuAdapt Tokenizer"):
+        st.write('')
+    ids = ruadapt_tokenizer.encode(txt)
+    split_tokens = ruadapt_tokenizer.tokenize(txt)
     st.write(stream_data)
     if token_id == True:
         color = itertools.cycle(colors)
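
One design note on the new module-level loads: Streamlit re-executes app.py on every widget interaction, so both from_pretrained calls run again on each rerun. A common workaround, sketched here as an assumption rather than as part of the commit, is to wrap the loads in st.cache_resource so each tokenizer is constructed only once per process:

import streamlit as st
from transformers import AutoTokenizer

@st.cache_resource  # cached across reruns; each tokenizer loads once
def load_tokenizer(model_id: str):
    return AutoTokenizer.from_pretrained(model_id)

qwen_tokenizer = load_tokenizer('Qwen/Qwen2.5-0.5B-Instruct')
ruadapt_tokenizer = load_tokenizer('msu-rcc-lair/RuadaptQwen2.5-32B-instruct')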