### Import Libraries ###
import streamlit as st
import itertools
from word_piece_tokenizer import WordPieceTokenizer
import tiktoken
from transformers import AutoTokenizer

from nltk.tokenize import TreebankWordTokenizer, wordpunct_tokenize, TweetTokenizer

qwen_tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')
ruadapt_tokenizer = AutoTokenizer.from_pretrained('msu-rcc-lair/RuadaptQwen2.5-32B-instruct')
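# Note: the two Hugging Face tokenizers above are re-loaded on every Streamlit
# rerun, which can be slow. An optional sketch (not part of the original app;
# the helper name load_tokenizer is illustrative) would cache them with
# st.cache_resource:
#
#     @st.cache_resource
#     def load_tokenizer(repo_id):
#         return AutoTokenizer.from_pretrained(repo_id)
#
#     qwen_tokenizer = load_tokenizer('Qwen/Qwen2.5-0.5B-Instruct')
#     ruadapt_tokenizer = load_tokenizer('msu-rcc-lair/RuadaptQwen2.5-32B-instruct')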

### User Interface ###
st.title("Tokenization")

st.write(
    """Tokenization is the first step in many natural language processing tasks. A tokenizer breaks text into smaller pieces,
         called tokens. A token may be an entire word or a sub-word made up of a sequence of characters. Once the tokens are created, they are
         mapped to numerical IDs so that they can be processed by a model. The choice of tokenizer affects both the speed and the quality of your results. When using a large language model (LLM),
         use the tokenizer the model was trained with to ensure compatibility."""
)
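# Worked example (illustrative): with the white-space tokenizer below, the text
# "to be or not to be" becomes the tokens ["to", "be", "or", "not", "to", "be"];
# the "Translate tokens into IDs" option then numbers each unique token, so the
# repeated "to" and "be" map to the same ID.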

txt = st.text_area("Paste text to tokenize", max_chars=1000)

tokenizer = st.selectbox(
    "Tokenizer",
    (
        "White Space",
        "Qwen2.5 Tokenizer",
        "RuAdapt Tokenizer",
        "Byte Pair Encoding (Open AI GPT-4o)",
    ),
    index=None,
    placeholder="Select a tokenizer",
)

token_id = st.checkbox("Translate tokens into IDs", value=False)

### Helper Functions ###


def white_space_tokenizer(txt):
    return txt.split()


def treebank_tokenizer(txt):
    return TreebankWordTokenizer().tokenize(txt)


## Write tokenized output to screen ##

# Output colors to cycle through
colors = ["blue", "green", "orange", "red", "violet"]
color = itertools.cycle(colors)
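
# The generators below rely on Streamlit's colored-background Markdown
# directive: ":blue-background[token]" renders "token" on a blue badge.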


# Stream data to screen
def stream_data():
    for token in split_tokens:
        yield f":{next(color)}-background[{token}]  "


def unique_list(token_list):
    # Build the list of unique tokens. Note that set order is arbitrary, so the
    # ID assigned to a given token can differ between reruns.
    token_set = set(token_list)
    return list(token_set)


def stream_token_ids():
    # IDs for word-level tokenizers: each token's ID is its index in the
    # unique-token list built from the current input.
    st.write(f"Unique tokens: {len(unique_tokens)}")
    for token in split_tokens:
        yield f":{next(color)}-background[{unique_tokens.index(token)}]  "


def stream_wp_token_ids():
    # IDs for sub-word tokenizers: stream the vocabulary IDs returned by the
    # tokenizer itself (the "wp" prefix is a holdover from WordPiece).
    st.write(f"Unique tokens: {len(unique_list(ids))}")
    for id in ids:
        yield f":{next(color)}-background[{id}]  "


### Tokenizer Descriptions ###

white_space_desc = """A basic word-level tokenizer that splits text on white space. It is simple and fast, but it does not separate punctuation or special characters from the words they are attached to."""
treebank_desc = """The Penn Treebank tokenizer is the default word-level tokenizer in the Natural Language Toolkit (NLTK). It is a more advanced tokenizer that handles punctuation and special characters."""
tweet_desc = """The TweetTokenizer is a specialized word-level tokenizer designed for text from social media platforms. It handles hashtags, mentions, and emojis."""
wordpiece_desc = """WordPiece is a sub-word tokenizer used in BERT and other transformer models. It breaks words into smaller sub-word units, which helps with rare or out-of-vocabulary words."""
bpe_desc = """Byte Pair Encoding (BPE) is a sub-word tokenizer used in models such as OpenAI's GPT-4o. It breaks words into smaller sub-word units based on the frequency of character pairs in the training text."""
qwen_desc = """The Qwen2.5 models use a byte-level Byte Pair Encoding (BPE) tokenizer with a large multilingual vocabulary (roughly 150K tokens) shared across the model family."""
ruadapt_desc = """RuadaptQwen2.5 (msu-rcc-lair) is an adaptation of Qwen2.5 for Russian. Its vocabulary is adapted toward Russian, so Russian text is typically split into fewer tokens than with the original Qwen2.5 tokenizer."""
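
# Illustrative sketch (not used by the UI; the name bpe_pieces is ours): shows
# how a BPE tokenizer splits a string into sub-word pieces; a rare or made-up
# word usually ends up as several shorter pieces. The exact split depends on the
# gpt-4o vocabulary, so no specific output is hard-coded here.
def bpe_pieces(text):
    enc = tiktoken.encoding_for_model("gpt-4o")
    return [enc.decode([i]) for i in enc.encode(text)]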


## Tokenizer Selection ##

if tokenizer == "White Space":
    with st.expander("About White Space Tokenizer"):
        st.write(white_space_desc)
    split_tokens = white_space_tokenizer(txt)
    st.write(stream_data)
    if token_id == True:
        color = itertools.cycle(colors)
        unique_tokens = unique_list(split_tokens)
        st.write(stream_token_ids)

elif tokenizer == "Qwen2.5 Tokenizer":
    with st.expander("About Qwen2.5 Tokenizer"):
        st.write('')
    ids = qwen_tokenizer.encode(txt)
    split_tokens = qwen_tokenizer.tokenize(txt)
    st.write(stream_data)
    if token_id == True:
        color = itertools.cycle(colors)
        st.write(stream_wp_token_ids)

elif tokenizer == "RuAdapt Tokenizer":
    with st.expander("About RuAdapt Tokenizer"):
        st.write('')
    ids = ruadapt_tokenizer.encode(txt)
    split_tokens = ruadapt_tokenizer.tokenize(txt)
    st.write(stream_data)
    if token_id == True:
        color = itertools.cycle(colors)
        st.write(stream_wp_token_ids)

elif tokenizer == "Byte Pair Encoding (Open AI GPT-4o)":
    with st.expander("About Byte Pair Encoding (BPE)"):
        st.write(bpe_desc)
    encoding = tiktoken.encoding_for_model("gpt-4o")
    ids = encoding.encode(txt)
    split_tokens = [
        encoding.decode_single_token_bytes(id).decode("utf-8") for id in ids
    ]
    st.write(stream_data)
    if token_id == True:
        color = itertools.cycle(colors)
        st.write(stream_wp_token_ids)
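
# To run the app locally (assuming this file is saved as app.py and streamlit,
# transformers, tiktoken, nltk, and word_piece_tokenizer are installed):
#
#     streamlit run app.py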