tamang0000 committed · Commit 80caa24 · Parent(s): eec601a
added assamese

app.py CHANGED
@@ -15,7 +15,7 @@ def load_test_phrases(filename):
 
 models = ["Xenova/claude-tokenizer", # Anthropic
           "meta-llama/Llama-2-7b-chat-hf", # LLAMA-2
-          "beomi/llama-2-ko-7b", # LLAMA-2-ko
+          # "beomi/llama-2-ko-7b", # LLAMA-2-ko
           "ai4bharat/Airavata", # ARIVATA
           "openaccess-ai-collective/tiny-mistral", # Mistral
           "gpt-3.5-turbo", # GPT3.5
@@ -23,7 +23,9 @@ models = ["Xenova/claude-tokenizer", # Anthropic
           "CohereForAI/aya-23-8B", # AYA
           "google/gemma-1.1-2b-it", # GEMMA
           "gpt-4o", # GPT4o
-          "TWO/sutra-mlt256-v2"]
+          "TWO/sutra-mlt256-v2", # SUTRA
+          "tamang0000/assamese-tokenizer-50k" # Assamese
+          ]
 
 test_phrase_set = [
     "I am going for a walk later today",
@@ -111,8 +113,9 @@ def generate_split_token_table(text):
 with gr.Blocks() as sutra_token_count:
     gr.Markdown(
         """
-        #
+        # Multilingual Tokenizer Specs & Stats.
         ## Tokenize paragraphs in multiple languages and compare token counts.
+        Space inspired from [SUTRA](https://huggingface.co/spaces/TWO/sutra-tokenizer-comparison)
         """)
     textbox = gr.Textbox(label="Input Text")
     submit_button = gr.Button("Submit")
@@ -140,9 +143,10 @@ def generate_tokens_table(text):
 with gr.Blocks() as sutra_tokenize:
     gr.Markdown(
         """
-        #
+        # Multilingual Tokenizer Sentence Inspector.
         ## Tokenize a sentence with various tokenizers and inspect how it's broken down.
-        """)
+        Space inspired from [SUTRA](https://huggingface.co/spaces/TWO/sutra-tokenizer-comparison)
+        """)
     textbox = gr.Textbox(label="Input Text")
     submit_button = gr.Button("Submit")
     output = gr.Dataframe()
@@ -156,7 +160,7 @@ if __name__ == '__main__':
     with gr.Row():
         gr.Markdown(
             """
-            ## <img src="https://
+            ## <img src="https://raw.githubusercontent.com/SAGAR-TAMANG/sagar-tamang-official-website-new/master/img/pi.jpg" height="30"/>
             """
         )
     with gr.Row():