### Import Libraries ###
import streamlit as st
import itertools
from word_piece_tokenizer import WordPieceTokenizer
import tiktoken
from transformers import AutoTokenizer
from transformers import GPT2TokenizerFast

from nltk.tokenize import TreebankWordTokenizer, wordpunct_tokenize, TweetTokenizer

# Cache the Hugging Face tokenizers so they are loaded once rather than on every Streamlit rerun.
@st.cache_resource
def load_tokenizers():
    return (
        AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct'),
        AutoTokenizer.from_pretrained('msu-rcc-lair/RuadaptQwen2.5-32B-instruct'),
        AutoTokenizer.from_pretrained('mlx-community/aya-expanse-32b-8bit'),
        GPT2TokenizerFast.from_pretrained('Xenova/claude-tokenizer'),
        AutoTokenizer.from_pretrained('facebook/xlm-v-base'),
        AutoTokenizer.from_pretrained('facebook/nllb-200-distilled-600M'),
    )


qwen_tokenizer, ruadapt_tokenizer, aya_tokenizer, claude_tokenizer, xlmv_tokenizer, nllb_tokenizer = load_tokenizers()


### User Interface ###
st.title("Tokenization")

st.write(
    """Tokenization is the first step of many natural language processing tasks. A tokenizer breaks text into smaller pieces
         called tokens. A token may be an entire word or a sub-word made up of a shorter sequence of characters. Once the tokens are created, they are
         mapped to numerical IDs so that a model can process them. The choice of tokenizer affects both the speed and the quality of your results. When working with a large language model (LLM),
         use the tokenizer the model was trained with; a mismatched tokenizer produces IDs the model cannot interpret correctly."""
)
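# Illustrative example only (the exact split depends on the tokenizer's vocabulary): a sub-word
# tokenizer might break "tokenization" into pieces such as "token" and "ization", and each piece
# is then mapped to an integer ID.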

txt = st.text_area("Paste text to tokenize", max_chars=1000)

tokenizer = st.selectbox(
    "Tokenizer",
    (
        "White Space",
        "Qwen2.5 Tokenizer",
        "RuAdapt Tokenizer",
        "Aya-Expanse Tokenizer",
        "Open AI GPT-4o Tokenizer",
        "Anthropic Claude Tokenizer",
        "XLM-V Tokenizer",
        "NLLB-200 Tokenizer",
    ),
    index=None,
    placeholder="Select a tokenizer",
)

token_id = st.checkbox("Translate tokens into IDs", value=False)

### Helper Functions ###


def white_space_tokenizer(txt):
    return txt.split()


def treebank_tokenizer(txt):
    return TreebankWordTokenizer().tokenize(txt)
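
# Note: treebank_tokenizer (and the WordPieceTokenizer / TweetTokenizer / wordpunct_tokenize
# imports above) are not currently wired to an option in the selectbox; only the tokenizers
# listed there are used below.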


## Write tokenized output to screen ##

# Output colors to cycle through
colors = ["blue", "green", "orange", "red", "violet"]
color = itertools.cycle(colors)
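
# The stream_* generators below yield Streamlit markdown of the form ":<color>-background[token]",
# which renders each token on a colored background. They read `color`, `split_tokens`,
# `unique_tokens`, and `ids` as module-level globals set in the branch for the selected tokenizer.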


# Stream data to screen
def stream_data():
    for token in split_tokens:
        yield f":{next(color)}-background[{token}]  "


def unique_list(token_list):
    token_set = set(token_list)
    return list(token_set)
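
# Note: sets are unordered, so the positions assigned via unique_list() (shown as IDs for the
# white-space tokenizer) are arbitrary per-run indices, not stable vocabulary IDs.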


def stream_token_ids():
    st.write(f"Unique tokens: {len(unique_tokens)}")
    for token in split_tokens:
        yield f":{next(color)}-background[{unique_tokens.index(token)}]  "


def stream_wp_token_ids():
    st.write(f"Unique tokens: {len(unique_list(ids))}")
    for id in ids:
        yield f":{next(color)}-background[{id}]  "

def num_tokens(txt):
    # Report the token count and the fertility rate (tokens per whitespace-separated word).
    # `ids` is only defined once a sub-word tokenizer has run, so fall back to an empty string otherwise.
    words = white_space_tokenizer(txt)
    n_words = len(words) if words else 1
    try:
        return f'Token count: {len(ids)}, fertility rate: {len(ids) / n_words:.2f}'
    except NameError:
        return ''
        
    

### Tokenizer Descriptions ###

white_space_desc = """A basic word-level tokenizer that splits text on white space. It is simple and fast, but it does not separate punctuation or special characters from the words they are attached to."""
treebank_desc = """The Penn Treebank tokenizer is the default word-level tokenizer in the Natural Language Toolkit (NLTK). It is a more advanced tokenizer that handles punctuation and special characters."""
tweet_desc = """The TweetTokenizer is a specialized word-level tokenizer designed for text from social media platforms. It handles hashtags, mentions, and emojis."""
wordpiece_desc = """WordPiece is a sub-word tokenizer used in BERT and other transformer models. It breaks words into smaller sub-word units, which helps with rare or out-of-vocabulary words."""
bpe_desc = """Byte Pair Encoding (BPE) is a sub-word tokenizer used in models such as OpenAI's GPT-4o. It breaks words into smaller sub-word units based on how frequently character pairs occur in the training text."""
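
# A quick way to inspect sub-word behaviour outside the app (illustrative only; the exact pieces
# depend on the vocabulary):
#   >>> enc = tiktoken.encoding_for_model("gpt-4o")
#   >>> [enc.decode_single_token_bytes(t) for t in enc.encode("untranslatability")]
# Long or rare words usually come back as several byte-string pieces, while common words are often a single token.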


## Tokenizer Selection ##

if tokenizer == "White Space":
    with st.expander("About White Space Tokenizer"):
        st.write(white_space_desc)
    split_tokens = white_space_tokenizer(txt)
    st.write_stream(stream_data)
    if token_id:
        color = itertools.cycle(colors)
        unique_tokens = unique_list(split_tokens)
        st.write_stream(stream_token_ids)

elif tokenizer == "Qwen2.5 Tokenizer":
    with st.expander("About Qwen2.5 Tokenizer"):
        st.write('')
    ids = qwen_tokenizer.encode(txt)
    split_tokens = [qwen_tokenizer.decode([t]) for t in ids]
    st.write_stream(stream_data)
    if token_id:
        color = itertools.cycle(colors)
        st.write_stream(stream_wp_token_ids)

elif tokenizer == "RuAdapt Tokenizer":
    with st.expander("About RuAdapt Tokenizer"):
        st.write('')
    ids = ruadapt_tokenizer.encode(txt)
    split_tokens = [ruadapt_tokenizer.decode([t]) for t in ids]
    st.write_stream(stream_data)
    if token_id:
        color = itertools.cycle(colors)
        st.write_stream(stream_wp_token_ids)

elif tokenizer == "Aya-Expanse Tokenizer":
    with st.expander("About Aya-Expanse Tokenizer"):
        st.write('')
    ids = aya_tokenizer.encode(txt)
    split_tokens = [aya_tokenizer.decode([t]) for t in ids]
    st.write_stream(stream_data)
    if token_id:
        color = itertools.cycle(colors)
        st.write_stream(stream_wp_token_ids)

elif tokenizer == "Open AI GPT-4o Tokenizer":
    with st.expander("About Open AI GPT-4o Tokenizer"):
        st.write(bpe_desc)
    encoding = tiktoken.encoding_for_model("gpt-4o")
    ids = encoding.encode(txt)
    split_tokens = [
        encoding.decode_single_token_bytes(id).decode("utf-8", errors='ignore') for id in ids
    ]
    st.write_stream(stream_data)
    if token_id:
        color = itertools.cycle(colors)
        st.write_stream(stream_wp_token_ids)

elif tokenizer == "Anthropic Claude Tokenizer":
    with st.expander("About Anthropic Claude Tokenizer"):
        st.write('')
    ids = claude_tokenizer.encode(txt)
    split_tokens = [claude_tokenizer.decode([t]) for t in ids]
    st.write_stream(stream_data)
    if token_id:
        color = itertools.cycle(colors)
        st.write_stream(stream_wp_token_ids)

elif tokenizer == "XLM-V Tokenizer":
    with st.expander("About XLM-V Tokenizer"):
        st.write('')
    ids = xlmv_tokenizer.encode(txt)
    split_tokens = [xlmv_tokenizer.decode([t]) for t in ids]
    st.write_stream(stream_data)
    if token_id:
        color = itertools.cycle(colors)
        st.write_stream(stream_wp_token_ids)


elif tokenizer == "NLLB-200 Tokenizer":
    with st.expander("About NLLB-200 Tokenizer"):
        st.write('')
    ids = nllb_tokenizer.encode(txt)
    split_tokens = [nllb_tokenizer.decode([t]) for t in ids]
    st.write_stream(stream_data)
    if token_id:
        color = itertools.cycle(colors)
        st.write_stream(stream_wp_token_ids)

st.write(num_tokens(txt))