### Import Libraries ###
import streamlit as st
import itertools
from word_piece_tokenizer import WordPieceTokenizer
import tiktoken
from transformers import AutoTokenizer
from transformers import GPT2TokenizerFast
from nltk.tokenize import TreebankWordTokenizer, wordpunct_tokenize, TweetTokenizer
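# Pre-load the pretrained tokenizers used below. Note: at module level these are
# re-instantiated on every Streamlit rerun; wrapping the loading in a function
# decorated with @st.cache_resource would avoid the repeated setup cost.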
qwen_tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')
ruadapt_tokenizer = AutoTokenizer.from_pretrained('msu-rcc-lair/RuadaptQwen2.5-32B-instruct')
aya_tokenizer = AutoTokenizer.from_pretrained('mlx-community/aya-expanse-32b-8bit')
claude_tokenizer = GPT2TokenizerFast.from_pretrained('Xenova/claude-tokenizer')
xlmv_tokenizer = AutoTokenizer.from_pretrained('facebook/xlm-v-base')
nllb_tokenizer = AutoTokenizer.from_pretrained('facebook/nllb-200-distilled-600M')
### User Interface ###
st.title("Tokenization")
st.write(
"""Tokenization is the first step of many natural language processing tasks. A tokenizer breaks down the text into smaller parts,
called tokens. For example, a token could be an entire word or a sub-word made of a sequence of letters. After the tokens are created, they are
translated into a set of numerical IDs in order to be processed. Choosing a tokenizer affects the speed and quality of your results. When using a large language model (LLM),
the tokenizer used to train the model should be used to ensure compatibility."""
)
txt = st.text_area("Paste text to tokenize", max_chars=1000)
tokenizer = st.selectbox(
    "Tokenizer",
    (
        "White Space",
        "Qwen2.5 Tokenizer",
        "RuAdapt Tokenizer",
        "Aya-Expanse Tokenizer",
        "OpenAI GPT-4o Tokenizer",
        "Anthropic Claude Tokenizer",
        "XLM-V Tokenizer",
        "NLLB-200 Tokenizer",
    ),
    index=None,
    placeholder="Select a tokenizer",
)
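# When checked, the colored output shows each token's numerical ID instead of its text
# (for the white-space tokenizer, the ID is simply an index into the list of unique tokens).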
token_id = st.checkbox("Translate tokens into IDs", value=False)
### Helper Functions ###
def white_space_tokenizer(txt):
    return txt.split()
def treebank_tokenizer(txt):
    return TreebankWordTokenizer().tokenize(txt)
## Write tokenized output to screen ##
# Output colors to cycle through
colors = ["blue", "green", "orange", "red", "violet"]
color = itertools.cycle(colors)
# Stream data to screen
def stream_data():
    for token in split_tokens:
        yield f":{next(color)}-background[{token}] "
# Deduplicate a token list (used to assign one index per unique token)
def unique_list(token_list):
    token_set = set(token_list)
    return list(token_set)
# Stream each token's index in the unique-token list (white-space tokenizer)
def stream_token_ids():
    st.write(f"Unique tokens: {len(unique_tokens)}")
    for token in split_tokens:
        yield f":{next(color)}-background[{unique_tokens.index(token)}] "
# Stream each token's vocabulary ID (pretrained sub-word tokenizers)
def stream_wp_token_ids():
    st.write(f"Unique tokens: {len(unique_list(ids))}")
    for id in ids:
        yield f":{next(color)}-background[{id}] "
def num_tokens(txt):
    words = white_space_tokenizer(txt)
    n_words = len(words) if len(words) else 1
    try:
        # Fertility rate: average number of tokens produced per whitespace-separated word
        return f'Token count: {len(ids)}, fertility rate: {len(ids)/n_words:.2f}'
    except NameError:
        # `ids` is undefined until a pretrained tokenizer has been selected
        return ''
### Tokenizer Descriptions ###
white_space_desc = """A basic word-level tokenizer that splits text on white space. It is simple and fast, but punctuation and special characters are not separated from the surrounding words."""
treebank_desc = """The Penn Treebank tokenizer is the default word-level tokenizer in the Natural Language Toolkit (NLTK). It is a more advanced tokenizer that can handle punctuation and special characters."""
tweet_desc = """The TweetTokenizer is a specialized word-level tokenizer that is designed to handle text from social media platforms. It is able to handle hashtags, mentions, and emojis."""
wordpiece_desc = """WordPiece is a sub-word tokenizer used in BERT and other transformer models. It breaks words down into smaller sub-word units, which helps handle rare or out-of-vocabulary words."""
bpe_desc = """Byte Pair Encoding (BPE) is a sub-word tokenizer used in models such as OpenAI's GPT-4o. It breaks words down into sub-word units based on the frequency of adjacent character pairs in the training data."""
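# Short descriptions for the remaining tokenizers offered in the selectbox
qwen_desc = """Qwen2.5 uses a byte-level BPE tokenizer with a vocabulary of roughly 152K tokens, shared across the Qwen2 and Qwen2.5 model families."""
ruadapt_desc = """RuadaptQwen2.5 is a version of Qwen2.5 adapted for Russian, with a vocabulary adjusted so that Russian text is tokenized more efficiently than with the original Qwen2.5 tokenizer."""
aya_desc = """Aya Expanse is a multilingual model family from Cohere. Its BPE tokenizer has a large multilingual vocabulary, which helps keep token counts low across many languages."""
claude_desc = """A community conversion of the Anthropic Claude tokenizer (Xenova/claude-tokenizer), distributed in a GPT-2-compatible format. It is a BPE tokenizer and is not an official Anthropic release."""
xlmv_desc = """XLM-V is a multilingual masked language model from Meta AI with a one-million-token SentencePiece vocabulary, designed to reduce over-segmentation of low-resource languages."""
nllb_desc = """NLLB-200 (No Language Left Behind) is Meta AI's translation model covering 200 languages. Its SentencePiece tokenizer uses a shared multilingual vocabulary of roughly 256K tokens."""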
## Tokenizer Selection ##
# Tokenize the input with the selected tokenizer and stream the colored tokens to the screen
if tokenizer == "White Space":
with st.expander("About White Space Tokenizer"):
st.write(white_space_desc)
split_tokens = white_space_tokenizer(txt)
st.write(stream_data)
if token_id == True:
color = itertools.cycle(colors)
unique_tokens = unique_list(split_tokens)
st.write(stream_token_ids)
elif tokenizer == "Qwen2.5 Tokenizer":
with st.expander("About Qwen2.5 Tokenizer"):
st.write('')
ids = qwen_tokenizer.encode(txt)
split_tokens = [qwen_tokenizer.decode([t]) for t in ids]
st.write(stream_data)
if token_id == True:
color = itertools.cycle(colors)
st.write(stream_wp_token_ids)
elif tokenizer == "RuAdapt Tokenizer":
with st.expander("About RuAdapt Tokenizer"):
st.write('')
ids = ruadapt_tokenizer.encode(txt)
split_tokens = [ruadapt_tokenizer.decode([t]) for t in ids]
st.write(stream_data)
if token_id == True:
color = itertools.cycle(colors)
st.write(stream_wp_token_ids)
elif tokenizer == "Aya-Expanse Tokenizer":
with st.expander("About Aya-Expanse Tokenizer"):
st.write('')
ids = aya_tokenizer.encode(txt)
split_tokens = [aya_tokenizer.decode([t]) for t in ids]
st.write(stream_data)
if token_id == True:
color = itertools.cycle(colors)
st.write(stream_wp_token_ids)
elif tokenizer == "Open AI GPT-4o Tokenizer":
with st.expander("About Open AI GPT-4o Tokenizer"):
st.write(bpe_desc)
encoding = tiktoken.encoding_for_model("gpt-4o")
ids = encoding.encode(txt)
split_tokens = [
encoding.decode_single_token_bytes(id).decode("utf-8", errors='ignore') for id in ids
]
st.write(stream_data)
if token_id == True:
color = itertools.cycle(colors)
st.write(stream_wp_token_ids)
elif tokenizer == "Anthropic Claude Tokenizer":
with st.expander("About Anthropic Claude Tokenizer"):
st.write('')
ids = claude_tokenizer.encode(txt)
split_tokens = [claude_tokenizer.decode([t]) for t in ids]
st.write(stream_data)
if token_id == True:
color = itertools.cycle(colors)
st.write(stream_wp_token_ids)
elif tokenizer == "XLM-V Tokenizer":
with st.expander("About XLM-V Tokenizer"):
st.write('')
ids = xlmv_tokenizer.encode(txt)
split_tokens = [xlmv_tokenizer.decode([t]) for t in ids]
st.write(stream_data)
if token_id == True:
color = itertools.cycle(colors)
st.write(stream_wp_token_ids)
elif tokenizer == "NLLB-200 Tokenizer":
with st.expander("About NLLB-200 Tokenizer"):
st.write('')
ids = nllb_tokenizer.encode(txt)
split_tokens = [nllb_tokenizer.decode([t]) for t in ids]
st.write(stream_data)
if token_id == True:
color = itertools.cycle(colors)
st.write(stream_wp_token_ids)
st.write(num_tokens(txt))