File size: 3,352 Bytes
d798650
222fca2
d798650
 
1d1780a
 
 
 
 
 
 
 
222fca2
 
 
 
1d1780a
222fca2
 
d798650
 
 
 
 
 
 
222fca2
 
 
d798650
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222fca2
d798650
 
222fca2
1d1780a
d798650
222fca2
d798650
222fca2
d798650
 
222fca2
d798650
222fca2
 
d798650
2adc116
8a4996e
85a1a43
2adc116
 
 
 
 
 
 
 
222fca2
 
 
 
 
2adc116
 
d798650
222fca2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import colorsys
import html
import random

import gradio as gr
from sentencepiece import SentencePieceProcessor

# License Information
# This application uses the following open-source libraries:
#
# 1. Gradio:
#    - License: Apache License 2.0
#    - Copyright: 2020-2023, Gradio contributors
#    - Full License: http://www.apache.org/licenses/LICENSE-2.0
#
# 2. SentencePiece:
#    - License: Apache License 2.0
#    - Copyright: 2018 Google Inc.
#    - Full License: http://www.apache.org/licenses/LICENSE-2.0

# Load the tokenizer.
# NOTE(review): path is relative to the process working directory at launch
# time — the .model file must exist there or SentencePieceProcessor raises.
sp = SentencePieceProcessor("models/ver3.0/llm-jp-tokenizer-100k.ver3.0b1.model")

def get_color_mapping(tokens):
    """Assign a distinct, readable color to each unique token.

    Hues are spaced evenly around the color wheel at fixed lightness and
    saturation, so every unique token gets a visually distinct color that
    stays legible on a light background. (The previous random-hex scheme
    could produce duplicate colors for different tokens, near-white
    unreadable colors, and a different mapping on every call.)

    Args:
        tokens: Iterable of token strings; duplicates are allowed.

    Returns:
        Dict mapping each unique token to a "#RRGGBB" hex color string.
        The mapping is deterministic for a given set of tokens.
    """
    unique_tokens = sorted(set(tokens))  # sorted -> stable, reproducible mapping
    n = len(unique_tokens)
    color_mapping = {}
    for i, token in enumerate(unique_tokens):
        hue = i / n  # n > 0 here since the loop body only runs when tokens exist
        # lightness 0.40 / saturation 0.75 keeps text readable on white.
        r, g, b = colorsys.hls_to_rgb(hue, 0.40, 0.75)
        color_mapping[token] = "#{:02X}{:02X}{:02X}".format(
            int(r * 255), int(g * 255), int(b * 255)
        )
    return color_mapping

def process_model(text, model_name, tokenizer=None):
    """Tokenize *text* and render the tokens and token IDs as colored HTML.

    Args:
        text: Input string to tokenize.
        model_name: Display name used in the section headings.
        tokenizer: Optional SentencePiece-like object exposing ``encode``
            and ``id_to_piece``; defaults to the module-level ``sp``.

    Returns:
        An HTML string containing the model name, the token count, the
        color-coded tokens, and their matching color-coded token IDs.
    """
    if tokenizer is None:
        tokenizer = sp
    token_ids = tokenizer.encode(text)
    # Avoid shadowing the builtin `id` with the loop variable.
    tokens = [tokenizer.id_to_piece(token_id) for token_id in token_ids]
    num_tokens = len(tokens)

    color_mapping = get_color_mapping(tokens)

    modelname_html = f'<h2>{html.escape(model_name)}</h2>'

    # Escape token text so pieces containing '<', '>' or '&' cannot break
    # the markup (or inject HTML) when rendered by the gr.HTML component.
    tokens_colored = [
        f'<span style="color:{color_mapping[token]}; font-weight: bold;">{html.escape(token)}</span>'
        for token in tokens
    ]
    token_ids_colored = [
        f'<span style="color:{color_mapping[token]}; font-weight: bold;">{token_id}</span>'
        for token, token_id in zip(tokens, token_ids)
    ]

    tokens_html = f'<h3>{html.escape(model_name)} Tokens</h3>' + ' '.join(tokens_colored)
    num_tokens_html = f'<h3>Number of Tokens: <span style="font-size: 20px; font-weight: bold;">{num_tokens}</span></h3>'
    # token_ids_colored already holds strings; no str() mapping needed.
    token_ids_html = f'<h3>{html.escape(model_name)} Token IDs</h3>' + ' '.join(token_ids_colored)

    return modelname_html + num_tokens_html + tokens_html + token_ids_html

def tokenize_input(text):
    """Return (character-count HTML, tokenizer-output HTML) for *text*.

    Used as the callback for both the textbox's ``change`` and ``submit``
    events; the two return values feed the two gr.HTML output components.
    """
    char_count = len(text)
    header = (
        '<h2>Number of Characters: '
        f'<span style="font-size: 20px; font-weight: bold;">{char_count}</span></h2>'
    )
    return header, process_model(text, "SentencePiece Tokenizer")

# Build the Gradio UI: one textbox that re-tokenizes on every keystroke
# (change) and on Enter (submit), plus two HTML panes for the character
# count and the color-coded token output.
with gr.Blocks() as demo:
    gr.Markdown("## SentencePiece Tokenizer App")
    with gr.Row():
        input_text = gr.Textbox(lines=2, placeholder="Enter text here...", label="Enter text to tokenize using SentencePiece tokenizer.")
        num_chars_output = gr.HTML()
    with gr.Row():
        tokenizer_output = gr.HTML(label="SentencePiece Tokenizer")

    input_text.change(tokenize_input, inputs=[input_text], outputs=[num_chars_output, tokenizer_output])
    input_text.submit(tokenize_input, inputs=[input_text], outputs=[num_chars_output, tokenizer_output])

    # Markdown is kept flush-left with a consistent 3-space indent for the
    # nested bullets: the original indented the sub-bullets *less* than
    # their parent numbered items, which breaks Markdown list nesting.
    gr.Markdown("""
<hr>

### License Information
This application uses the following open-source libraries:

1. **Gradio**:
   - License: Apache License 2.0
   - Copyright: 2020-2023, Gradio contributors
   - Full License: [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0)
   - Repository: [Gradio GitHub](https://github.com/gradio-app/gradio/)
2. **SentencePiece**:
   - License: Apache License 2.0
   - Copyright: 2018 Google Inc.
   - Full License: [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0)
   - Repository: [SentencePiece GitHub](https://github.com/google/sentencepiece)
""")

# Launch the app
demo.launch()