import gradio as gr
from sentencepiece import SentencePieceProcessor
import random
# License Information
# This application uses the following open-source libraries:
#
# 1. Gradio:
# - License: Apache License 2.0
# - Copyright: 2020-2023, Gradio contributors
# - Full License: http://www.apache.org/licenses/LICENSE-2.0
#
# 2. SentencePiece:
# - License: Apache License 2.0
# - Copyright: 2018 Google Inc.
# - Full License: http://www.apache.org/licenses/LICENSE-2.0
# Load the tokenizer
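# The model file is expected at this relative path; adjust it if the llm-jp
# tokenizer model is stored elsewhere.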
sp = SentencePieceProcessor("models/ver3.0/llm-jp-tokenizer-100k.ver3.0b1.model")
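
# Assign a random hex color to each unique token so that repeated tokens are
# highlighted consistently in both the token and token-ID views.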
def get_color_mapping(tokens):
    unique_tokens = list(set(tokens))
    colors = ["#" + ''.join([random.choice('0123456789ABCDEF') for _ in range(6)]) for _ in unique_tokens]
    color_mapping = dict(zip(unique_tokens, colors))
    return color_mapping
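
# Note: purely random colors can land near the white page background and become
# hard to read; sampling from a darker hex range would be a simple improvement.

# Tokenize the text and render its tokens and token IDs as color-coded HTML,
# using matching colors for matching tokens.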
def process_model(text, model_name):
    token_ids = sp.encode(text)
    tokens = [sp.id_to_piece(token_id) for token_id in token_ids]
    num_tokens = len(tokens)
    color_mapping = get_color_mapping(tokens)
    modelname_html = f'<h2>{model_name}</h2>'
    tokens_colored = [f'<span style="color:{color_mapping[token]}; font-weight: bold;">{token}</span>' for token in tokens]
    token_ids_colored = [f'<span style="color:{color_mapping[token]}; font-weight: bold;">{token_id}</span>' for token, token_id in zip(tokens, token_ids)]
    tokens_html = f'<h3>{model_name} Tokens</h3>' + ' '.join(tokens_colored)
    num_tokens_html = f'<h3>Number of Tokens: <span style="font-size: 20px; font-weight: bold;">{num_tokens}</span></h3>'
    token_ids_html = f'<h3>{model_name} Token IDs</h3>' + ' '.join(token_ids_colored)
    return modelname_html + num_tokens_html + tokens_html + token_ids_html
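
# Gradio callback: returns the character count and the tokenization result,
# both as HTML fragments.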
def tokenize_input(text):
    result = process_model(text, "SentencePiece Tokenizer")
    num_chars = len(text)
    num_chars_html = f'<h2>Number of Characters: <span style="font-size: 20px; font-weight: bold;">{num_chars}</span></h2>'
    return num_chars_html, result
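
# Build the UI: a text box next to the character count, with the color-coded
# tokenizer output below.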
with gr.Blocks() as demo:
    gr.Markdown("## SentencePiece Tokenizer App")
    with gr.Row():
        input_text = gr.Textbox(lines=2, placeholder="Enter text here...", label="Enter text to tokenize using the SentencePiece tokenizer.")
        num_chars_output = gr.HTML()
    with gr.Row():
        tokenizer_output = gr.HTML(label="SentencePiece Tokenizer")
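    # Re-tokenize on every edit (change) and when the user presses Enter (submit).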
    input_text.change(tokenize_input, inputs=[input_text], outputs=[num_chars_output, tokenizer_output])
    input_text.submit(tokenize_input, inputs=[input_text], outputs=[num_chars_output, tokenizer_output])
    gr.Markdown("""
<hr>
### License Information
This application uses the following open-source libraries:
1. **Gradio**:
    - License: Apache License 2.0
    - Copyright: 2020-2023, Gradio contributors
    - Full License: [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0)
    - Repository: [Gradio GitHub](https://github.com/gradio-app/gradio/)
2. **SentencePiece**:
    - License: Apache License 2.0
    - Copyright: 2018 Google Inc.
    - Full License: [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0)
    - Repository: [SentencePiece GitHub](https://github.com/google/sentencepiece)
""")
# Launch the app
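# (demo.launch(share=True) would additionally create a temporary public URL.)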
demo.launch()