gojiteji committed on
Commit
222fca2
1 Parent(s): dfca87c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -32
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import gradio as gr
2
- import tiktoken
3
  import random
4
 
5
  # License Information
@@ -10,15 +10,13 @@ import random
10
  # - Copyright: 2020-2023, Gradio contributors
11
  # - Full License: http://www.apache.org/licenses/LICENSE-2.0
12
  #
13
- # 2. tiktoken:
14
- # - License: MIT License
15
- # - Copyright: 2022, OpenAI, Shantanu Jain
16
- # - Full License: https://opensource.org/licenses/MIT
17
-
18
 
19
- # Load the tokenizers
20
- enc_gpt4o = tiktoken.encoding_for_model("gpt-4o")
21
- enc_gpt3_5turbo = tiktoken.encoding_for_model("gpt-3.5-turbo")
22
 
23
  def get_color_mapping(tokens):
24
  unique_tokens = list(set(tokens))
@@ -26,9 +24,9 @@ def get_color_mapping(tokens):
26
  color_mapping = dict(zip(unique_tokens, colors))
27
  return color_mapping
28
 
29
- def process_model(text, encoder, model_name):
30
- token_ids = encoder.encode(text)
31
- tokens = [encoder.decode([id]) for id in token_ids]
32
  num_tokens = len(tokens)
33
 
34
  color_mapping = get_color_mapping(tokens)
@@ -45,24 +43,21 @@ def process_model(text, encoder, model_name):
45
  return modelname_html + num_tokens_html + tokens_html + token_ids_html
46
 
47
  def tokenize_input(text):
48
- gpt4o_result = process_model(text, enc_gpt4o, "GPT-4o")
49
- gpt35turbo_result = process_model(text, enc_gpt3_5turbo, "GPT-3.5-turbo")
50
  num_chars = len(text)
51
  num_chars_html = f'<h2>Number of Characters: <span style="font-size: 20px; font-weight: bold;">{num_chars}</span></h2>'
52
- return num_chars_html, gpt4o_result, gpt35turbo_result
53
-
54
 
55
  with gr.Blocks() as demo:
56
- gr.Markdown("## ChatGPT Token Comparison App")
57
  with gr.Row():
58
- input_text = gr.Textbox(lines=2, placeholder="Enter text here...", label="Enter text to tokenize and compare results between GPT-4o and GPT-3.5-turbo tokenizers.")
59
  num_chars_output = gr.HTML()
60
  with gr.Row():
61
- gpt4o_output = gr.HTML(label="GPT-4o")
62
- gpt35turbo_output = gr.HTML(label="GPT-3.5-turbo")
63
 
64
- input_text.change(tokenize_input, inputs=[input_text], outputs=[num_chars_output, gpt4o_output, gpt35turbo_output])
65
- input_text.submit(tokenize_input, inputs=[input_text], outputs=[num_chars_output, gpt4o_output, gpt35turbo_output])
66
 
67
  gr.Markdown("""
68
  <hr>
@@ -75,16 +70,12 @@ with gr.Blocks() as demo:
75
  - Copyright: 2020-2023, Gradio contributors
76
  - Full License: [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0)
77
  - Repository: [Gradio GitHub](https://github.com/gradio-app/gradio/)
78
-
79
- 2. **tiktoken**:
80
- - License: MIT License
81
- - Copyright: 2022, OpenAI, Shantanu Jain
82
- - Full License: [MIT License](https://opensource.org/licenses/MIT)
83
- - Repository: [tiktoken GitHub](https://github.com/openai/tiktoken)
84
  """)
85
 
86
-
87
  # Launch the app
88
- demo.launch()
89
-
90
-
 
1
  import gradio as gr
2
+ from sentencepiece import SentencePieceProcessor
3
  import random
4
 
5
  # License Information
 
10
  # - Copyright: 2020-2023, Gradio contributors
11
  # - Full License: http://www.apache.org/licenses/LICENSE-2.0
12
  #
13
+ # 2. SentencePiece:
14
+ # - License: Apache License 2.0
15
+ # - Copyright: 2018 Google Inc.
16
+ # - Full License: http://www.apache.org/licenses/LICENSE-2.0
 
17
 
18
+ # Load the tokenizer
19
+ sp = SentencePieceProcessor("models/ver3.0/llm-jp-tokenizer-100k.ver3.0b1.model")
 
20
 
21
  def get_color_mapping(tokens):
22
  unique_tokens = list(set(tokens))
 
24
  color_mapping = dict(zip(unique_tokens, colors))
25
  return color_mapping
26
 
27
+ def process_model(text, model_name):
28
+ token_ids = sp.encode(text)
29
+ tokens = [sp.id_to_piece(id) for id in token_ids]
30
  num_tokens = len(tokens)
31
 
32
  color_mapping = get_color_mapping(tokens)
 
43
  return modelname_html + num_tokens_html + tokens_html + token_ids_html
44
 
45
  def tokenize_input(text):
46
+ result = process_model(text, "SentencePiece Tokenizer")
 
47
  num_chars = len(text)
48
  num_chars_html = f'<h2>Number of Characters: <span style="font-size: 20px; font-weight: bold;">{num_chars}</span></h2>'
49
+ return num_chars_html, result
 
50
 
51
  with gr.Blocks() as demo:
52
+ gr.Markdown("## SentencePiece Tokenizer App")
53
  with gr.Row():
54
+ input_text = gr.Textbox(lines=2, placeholder="Enter text here...", label="Enter text to tokenize using SentencePiece tokenizer.")
55
  num_chars_output = gr.HTML()
56
  with gr.Row():
57
+ tokenizer_output = gr.HTML(label="SentencePiece Tokenizer")
 
58
 
59
+ input_text.change(tokenize_input, inputs=[input_text], outputs=[num_chars_output, tokenizer_output])
60
+ input_text.submit(tokenize_input, inputs=[input_text], outputs=[num_chars_output, tokenizer_output])
61
 
62
  gr.Markdown("""
63
  <hr>
 
70
  - Copyright: 2020-2023, Gradio contributors
71
  - Full License: [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0)
72
  - Repository: [Gradio GitHub](https://github.com/gradio-app/gradio/)
73
+ 2. **SentencePiece**:
74
+ - License: Apache License 2.0
75
+ - Copyright: 2018 Google Inc.
76
+ - Full License: [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0)
77
+ - Repository: [SentencePiece GitHub](https://github.com/google/sentencepiece)
 
78
  """)
79
 
 
80
  # Launch the app
81
+ demo.launch()