llm-jp-tokenizer-100k.ver3.0b1

Sleeping

App Files Community

gojiteji committed on Sep 9

Commit

222fca2

•

1 Parent(s): dfca87c

Update app.py

Browse files

Files changed (1) hide show

app.py +23 -32

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import gradio as gr
-import tiktoken
 import random
 # License Information
@@ -10,15 +10,13 @@ import random
 #    - Copyright: 2020-2023, Gradio contributors
 #    - Full License: http://www.apache.org/licenses/LICENSE-2.0
 #
-# 2. tiktoken:
-#    - License: MIT License
-#    - Copyright: 2022, OpenAI, Shantanu Jain
-#    - Full License: https://opensource.org/licenses/MIT
-# Load the tokenizers
-enc_gpt4o = tiktoken.encoding_for_model("gpt-4o")
-enc_gpt3_5turbo = tiktoken.encoding_for_model("gpt-3.5-turbo")
 def get_color_mapping(tokens):
     unique_tokens = list(set(tokens))
@@ -26,9 +24,9 @@ def get_color_mapping(tokens):
     color_mapping = dict(zip(unique_tokens, colors))
     return color_mapping
-def process_model(text, encoder, model_name):
-    token_ids = encoder.encode(text)
-    tokens = [encoder.decode([id]) for id in token_ids]
     num_tokens = len(tokens)
     color_mapping = get_color_mapping(tokens)
@@ -45,24 +43,21 @@ def process_model(text, encoder, model_name):
     return modelname_html + num_tokens_html + tokens_html + token_ids_html
 def tokenize_input(text):
-    gpt4o_result = process_model(text, enc_gpt4o, "GPT-4o")
-    gpt35turbo_result = process_model(text, enc_gpt3_5turbo, "GPT-3.5-turbo")
     num_chars = len(text)
     num_chars_html = f'<h2>Number of Characters: <span style="font-size: 20px; font-weight: bold;">{num_chars}</span></h2>'
-    return num_chars_html, gpt4o_result, gpt35turbo_result
 with gr.Blocks() as demo:
-    gr.Markdown("## ChatGPT Token Comparison App")
     with gr.Row():
-        input_text = gr.Textbox(lines=2, placeholder="Enter text here...", label="Enter text to tokenize and compare results between GPT-4o and GPT-3.5-turbo tokenizers.")
         num_chars_output = gr.HTML()
     with gr.Row():
-        gpt4o_output = gr.HTML(label="GPT-4o")
-        gpt35turbo_output = gr.HTML(label="GPT-3.5-turbo")
-    input_text.change(tokenize_input, inputs=[input_text], outputs=[num_chars_output, gpt4o_output, gpt35turbo_output])
-    input_text.submit(tokenize_input, inputs=[input_text], outputs=[num_chars_output, gpt4o_output, gpt35turbo_output])
     gr.Markdown("""
         <hr>
@@ -75,16 +70,12 @@ with gr.Blocks() as demo:
        - Copyright: 2020-2023, Gradio contributors
        - Full License: [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0)
        - Repository: [Gradio GitHub](https://github.com/gradio-app/gradio/)
-        2. **tiktoken**:
-       - License: MIT License
-       - Copyright: 2022, OpenAI, Shantanu Jain
-       - Full License: [MIT License](https://opensource.org/licenses/MIT)
-       - Repository: [tiktoken GitHub](https://github.com/openai/tiktoken)
         """)
 # Launch the app
-demo.launch()

 import gradio as gr
+from sentencepiece import SentencePieceProcessor
 import random
 # License Information
 #    - Copyright: 2020-2023, Gradio contributors
 #    - Full License: http://www.apache.org/licenses/LICENSE-2.0
 #
+# 2. SentencePiece:
+#    - License: Apache License 2.0
+#    - Copyright: 2018 Google Inc.
+#    - Full License: http://www.apache.org/licenses/LICENSE-2.0
+# Load the tokenizer
+sp = SentencePieceProcessor("models/ver3.0/llm-jp-tokenizer-100k.ver3.0b1.model")
 def get_color_mapping(tokens):
     unique_tokens = list(set(tokens))
     color_mapping = dict(zip(unique_tokens, colors))
     return color_mapping
+def process_model(text, model_name):
+    token_ids = sp.encode(text)
+    tokens = [sp.id_to_piece(id) for id in token_ids]
     num_tokens = len(tokens)
     color_mapping = get_color_mapping(tokens)
     return modelname_html + num_tokens_html + tokens_html + token_ids_html
 def tokenize_input(text):
+    result = process_model(text, "SentencePiece Tokenizer")
     num_chars = len(text)
     num_chars_html = f'<h2>Number of Characters: <span style="font-size: 20px; font-weight: bold;">{num_chars}</span></h2>'
+    return num_chars_html, result
 with gr.Blocks() as demo:
+    gr.Markdown("## SentencePiece Tokenizer App")
     with gr.Row():
+        input_text = gr.Textbox(lines=2, placeholder="Enter text here...", label="Enter text to tokenize using SentencePiece tokenizer.")
         num_chars_output = gr.HTML()
     with gr.Row():
+        tokenizer_output = gr.HTML(label="SentencePiece Tokenizer")
+    input_text.change(tokenize_input, inputs=[input_text], outputs=[num_chars_output, tokenizer_output])
+    input_text.submit(tokenize_input, inputs=[input_text], outputs=[num_chars_output, tokenizer_output])
     gr.Markdown("""
         <hr>
        - Copyright: 2020-2023, Gradio contributors
        - Full License: [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0)
        - Repository: [Gradio GitHub](https://github.com/gradio-app/gradio/)
+        2. **SentencePiece**:
+       - License: Apache License 2.0
+       - Copyright: 2018 Google Inc.
+       - Full License: [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0)
+       - Repository: [SentencePiece GitHub](https://github.com/google/sentencepiece)
         """)
 # Launch the app
+demo.launch()