import logging
# import os
import tiktoken
from transformers import AutoTokenizer
import gradio as gr
logger = logging.getLogger(__name__) # noqa
# hugging face
# hf_token = os.getenv('HUGGINGFACE_TOKEN')
# HfApi().login(token=hf_token)
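
# Gradio Space that compares how different tokenizers split Assamese text:
# Hugging Face tokenizers are loaded with AutoTokenizer, while the OpenAI
# models ("gpt-3.5-turbo", "gpt-4o") are handled with tiktoken.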

def load_test_phrases(filename):
    with open(f"./data/{filename}", "r", encoding="utf-8") as file:
        return file.read().splitlines()
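
# Example: load_test_phrases('multilingualphrases01-as.txt') returns one
# phrase per line from ./data/multilingualphrases01-as.txt.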

models = ["Xenova/claude-tokenizer",                # Anthropic
          "meta-llama/Llama-2-7b-chat-hf",          # LLAMA-2
          "beomi/llama-2-ko-7b",                    # LLAMA-2-ko
          "ai4bharat/Airavata",                     # AIRAVATA
          "openaccess-ai-collective/tiny-mistral",  # Mistral
          "gpt-3.5-turbo",                          # GPT3.5
          "meta-llama/Meta-Llama-3-8B-Instruct",    # LLAMA-3
          "CohereForAI/aya-23-8B",                  # AYA
          "google/gemma-1.1-2b-it",                 # GEMMA
          "gpt-4o",                                 # GPT4o
          "TWO/sutra-mlt256-v2",                    # SUTRA
          "tamang0000/assamese-tokenizer-50k"       # Assamese
          ]
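
# Short Assamese test sentences offered as one-click examples in the
# sentence-inspector tab below.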
test_phrase_set = [
    "মই আজিৰ পাছত হ’ব লগা হাঁহিৰ বাবে ওলাই থাকিম",
    "আমি চন্দ্ৰলৈ ৰকেট যাত্ৰাত আছোঁ",
    "পাঁচখন বাক্যৰে নিউট্ৰন বিকিৰণৰ বৰ্ণনা দিয়ক",  # Assamese
    "আমাক পাঁচখন বাক্যৰে নিউট্ৰন বিকিৰণৰ বৰ্ণনা দিয়ক",
    "মোৰ বন্ধুটোৱে চাৰিটা পুথি পঢ়িছে",  # Assamese
    "মোৰ ঘৰখন গাঁওখনৰ আটাইতকৈ বেছি ডাঙৰ",  # Assamese
    "আজিৰে পৰা মই সৰু সৰু কামবোৰ কৰি থাকিম",  # Assamese
    "তেওঁৰ মাতবোৰ আৰু শাৰীবোৰ সলনি হোৱা দেখি চমক লাগিল",  # Assamese
]
test_phrase_set_long_1 = load_test_phrases('multilingualphrases01-as.txt')
test_phrase_set_long_2 = load_test_phrases('multilingualphrases02-as.txt')
# test_phrase_set_long_3 = load_test_phrases('multilingualphrases03.txt')

def generate_tokens_as_table(text):
    table = []
    for model in models:
        if 'gpt' not in model:
            # Hugging Face tokenizers; skip special tokens so only the text itself is shown.
            tokenizer = AutoTokenizer.from_pretrained(model)
            tokens = tokenizer.encode(text, add_special_tokens=False)
        else:
            # OpenAI models are tokenized with tiktoken rather than transformers.
            tokenizer = tiktoken.encoding_for_model(model)
            tokens = tokenizer.encode(text)
        decoded = [tokenizer.decode([t]) for t in tokens]
        table.append([model] + decoded)
    return table
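
# Each row is [model_name, token_1, token_2, ...]; row lengths differ because
# each tokenizer splits the same text into a different number of pieces.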

def generate_tokenizer_table(text):
    if not text:
        return []
    token_counts = {model: 0 for model in models}
    vocab_size = {model: 0 for model in models}
    for model in models:
        if 'gpt' not in model:
            tokenizer = AutoTokenizer.from_pretrained(model)
            vocab_size[model] = tokenizer.vocab_size
        else:
            tokenizer = tiktoken.encoding_for_model(model)
            vocab_size[model] = tokenizer.n_vocab
        token_counts[model] += len(tokenizer.encode(text))
    # Tokens per whitespace-separated word: lower means the tokenizer covers this text more compactly.
    word_count = len(text.split(' '))
    output = []
    for m in models:
        row = [m, vocab_size[m], word_count, token_counts[m], f"{token_counts[m] / word_count:0.2f}"]
        output.append(row)
    return output
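
# Each row is [model, vocab size, word count, token count, tokens/word];
# generate_split_token_table below renders these rows with matching headers.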

def generate_split_token_table(text):
    if not text:
        return gr.Dataframe()
    table = generate_tokenizer_table(text)
    return gr.Dataframe(
        table,
        headers=['tokenizer', 'vocab size', '#words', '#tokens', '#tokens/word'],
        datatype=["str", "number", "number", "number", "str"],
        row_count=len(models),
        col_count=(5, "fixed"),
    )

with gr.Blocks() as sutra_token_count:
    gr.Markdown(
        """
        # Assamese Tokenizer Specs & Stats.
        ## Tokenize paragraphs in multiple languages and compare token counts.
        Space inspired by [SUTRA](https://huggingface.co/spaces/TWO/sutra-tokenizer-comparison).
        Number of Tokens (the fewer, the better)
        """)
    textbox = gr.Textbox(label="Input Text")
    submit_button = gr.Button("Submit")
    output = gr.Dataframe()
    examples = [
        [' '.join(test_phrase_set_long_1)],
        [' '.join(test_phrase_set_long_2)],
        # [' '.join(test_phrase_set_long_3)],
    ]
    gr.Examples(examples=examples, inputs=[textbox])
    submit_button.click(generate_split_token_table, inputs=[textbox], outputs=[output])

def generate_tokens_table(text):
    table = generate_tokens_as_table(text)
    cols = len(table[0])
    return gr.Dataframe(
        table,
        headers=['model'] + [str(i) for i in range(cols - 1)],
        row_count=2,
        col_count=(cols, "fixed"),
    )

with gr.Blocks() as sutra_tokenize:
    gr.Markdown(
        """
        # Assamese Tokenizer Sentence Inspector.
        ## Tokenize a sentence with various tokenizers and inspect how it's broken down.
        Space inspired by [SUTRA](https://huggingface.co/spaces/TWO/sutra-tokenizer-comparison).
        Number of Tokens (the fewer, the better)
        """)
    textbox = gr.Textbox(label="Input Text")
    submit_button = gr.Button("Submit")
    output = gr.Dataframe()
    examples = test_phrase_set
    gr.Examples(examples=examples, inputs=[textbox])
    submit_button.click(generate_tokens_table, inputs=[textbox], outputs=[output])

if __name__ == '__main__':
    with gr.Blocks(analytics_enabled=False) as demo:
        with gr.Row():
            gr.Markdown(
                """
                ## <img src="https://sagartamang.com/img/favicon.png" height="100%"/>
                """
            )
        with gr.Row():
            gr.TabbedInterface(
                interface_list=[sutra_tokenize, sutra_token_count],
                tab_names=["Tokenize Text", "Tokenize Paragraphs"]
            )
    demo.queue(default_concurrency_limit=5).launch(
        server_name="0.0.0.0",
        allowed_paths=["/"],
    )
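
# The server binds to 0.0.0.0 so the demo is reachable inside the Hugging Face
# Space container; when run locally (e.g. `python app.py`, filename assumed) it
# serves on Gradio's default port (7860) unless configured otherwise.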