### Import Libraries ###
import streamlit as st
import itertools
from word_piece_tokenizer import WordPieceTokenizer
import tiktoken
from transformers import AutoTokenizer
from nltk.tokenize import TreebankWordTokenizer, wordpunct_tokenize, TweetTokenizer
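# Load the model-specific tokenizers from the Hugging Face Hub (downloaded and cached on first use)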
qwen_tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')
ruadapt_tokenizer = AutoTokenizer.from_pretrained('msu-rcc-lair/RuadaptQwen2.5-32B-instruct')
### User Interface ###
st.title("Tokenization")
st.write(
    """Tokenization is the first step of many natural language processing tasks. A tokenizer breaks text down into smaller parts,
called tokens. For example, a token could be an entire word or a sub-word made up of a sequence of characters. Once the tokens are created, they are
translated into numerical IDs so that they can be processed by a model. The choice of tokenizer affects both the speed and the quality of your results. When using a large language model (LLM),
use the tokenizer that the model was trained with to ensure compatibility."""
)
txt = st.text_area("Paste text to tokenize", max_chars=1000)
tokenizer = st.selectbox(
    "Tokenizer",
    (
        "White Space",
        "Qwen2.5 Tokenizer",
        "RuAdapt Tokenizer",
        "Byte Pair Encoding (OpenAI GPT-4o)",
    ),
    index=None,
    placeholder="Select a tokenizer",
)
token_id = st.checkbox("Translate tokens into IDs", value=False)
### Helper Functions ###
def white_space_tokenizer(txt):
    return txt.split()


def treebank_tokenizer(txt):
    return TreebankWordTokenizer().tokenize(txt)
## Write tokenized output to screen ##
# Output colors to cycle through
colors = ["blue", "green", "orange", "red", "violet"]
color = itertools.cycle(colors)
# Stream data to screen
def stream_data():
    # Yield each token wrapped in a colored-background markdown badge
    for token in split_tokens:
        yield f":{next(color)}-background[{token}] "


def unique_list(token_list):
    # Deduplicate the token list (order is not preserved)
    token_set = set(token_list)
    return list(token_set)


def stream_token_ids():
    # Yield an ID for each token, using its index in the list of unique tokens
    st.write(f"Unique tokens: {len(unique_tokens)}")
    for token in split_tokens:
        yield f":{next(color)}-background[{unique_tokens.index(token)}] "


def stream_wp_token_ids():
    # Yield the numerical IDs produced by the selected sub-word tokenizer
    st.write(f"Unique tokens: {len(unique_list(ids))}")
    for id in ids:
        yield f":{next(color)}-background[{id}] "
### Tokenizer Descriptions ###
white_space_desc = """A basic word-level tokenizer that splits text based on white space. This tokenizer is simple and fast, but it will not handle punctuation or special characters."""
treebank_desc = """The Penn Treebank tokenizer is the default word-level tokenizer in the Natural Language Toolkit (NLTK). It is a more advanced tokenizer that can handle punctuation and special characters."""
tweet_desc = """The TweetTokenizer is a specialized word-level tokenizer that is designed to handle text from social media platforms. It is able to handle hashtags, mentions, and emojis."""
wordpiece_desc = """Word Piece is a sub-word tokenizer that is used in BERT and other transformer models. It breaks down words into smaller sub-word units, which can be useful for handling rare or out-of-vocabulary words."""
bpe_desc = """Byte Pair Encoding (BPE) is a sub-word tokenizer that is used in models like OpenAI's GPT-4o. It breaks down words into smaller sub-word units based on the frequency of character pairs in the text."""
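# Note: the Treebank, Tweet, and WordPiece descriptions above are kept for tokenizers
# that are not currently offered in the selectbox.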
## Tokenizer Selection ##
# Tokenize the input with the selected tokenizer and stream the colored tokens (and optional IDs) to the screen
if tokenizer == "White Space":
    with st.expander("About White Space Tokenizer"):
        st.write(white_space_desc)
    split_tokens = white_space_tokenizer(txt)
    st.write_stream(stream_data)
    if token_id:
        color = itertools.cycle(colors)
        unique_tokens = unique_list(split_tokens)
        st.write_stream(stream_token_ids)
elif tokenizer == "Qwen2.5 Tokenizer":
with st.expander("About Qwen2.5 Tokenizer"):
st.write('')
ids = qwen_tokenizer.encode(txt)
split_tokens = qwen_tokenizer.tokenize(txt)
st.write(stream_data)
if token_id == True:
color = itertools.cycle(colors)
st.write(stream_wp_token_ids)
elif tokenizer == "RuAdapt Tokenizer":
with st.expander("About RuAdapt Tokenizer"):
st.write('')
ids = ruadapt_tokenizer.encode(txt)
split_tokens = ruadapt_tokenizer.tokenize(txt)
st.write(stream_data)
if token_id == True:
color = itertools.cycle(colors)
st.write(stream_wp_token_ids)
elif tokenizer == "Byte Pair Encoding (Open AI GPT-4o)":
with st.expander("About Byte Pair Encoding (BPE)"):
st.write(bpe_desc)
encoding = tiktoken.encoding_for_model("gpt-4o")
ids = encoding.encode(txt)
split_tokens = [
encoding.decode_single_token_bytes(id).decode("utf-8") for id in ids
]
st.write(stream_data)
if token_id == True:
color = itertools.cycle(colors)
st.write(stream_wp_token_ids)