Spaces:
Sleeping
Sleeping
Taranosaurus
committed on
Commit
•
2488d19
1
Parent(s):
bbb587f
Formatting and legibility changes
Browse files
app.py
CHANGED
@@ -80,17 +80,23 @@ def de_tokenize_er(pairs):
|
|
80 |
with gr.Blocks() as frontend:
|
81 |
with gr.Row():
|
82 |
with gr.Column(scale=3):
|
83 |
-
gr.Markdown("# π Tokenizaminer\n
|
|
|
|
|
84 |
with gr.Group():
|
85 |
-
input_checkpoint = gr.Dropdown(
|
86 |
btn_load_tokenizer = gr.Button(value="Load Tokenizer")
|
87 |
with gr.Row():
|
88 |
-
|
|
|
|
|
89 |
with gr.Row():
|
90 |
btn_tokenize = gr.Button(value="Tokenize!")
|
91 |
btn_random_seq = gr.Button(value="Randomize!")
|
92 |
with gr.Row():
|
93 |
-
|
|
|
|
|
94 |
with gr.Row():
|
95 |
btn_decode = gr.Button(value="Decode")
|
96 |
with gr.Row():
|
@@ -101,6 +107,7 @@ with gr.Blocks() as frontend:
|
|
101 |
output_decoded_ids = gr.TextArea(label="Decoded IDs", interactive=False)
|
102 |
with gr.Column(scale=1):
|
103 |
with gr.Group():
|
|
|
104 |
output_vocab_count = gr.Number(label="Vocab Size", interactive=False)
|
105 |
output_unknown_token = gr.Textbox(label="Unknown Token", interactive=False)
|
106 |
output_vocab = gr.Code(label="Vocabulary")
|
|
|
80 |
with gr.Blocks() as frontend:
|
81 |
with gr.Row():
|
82 |
with gr.Column(scale=3):
|
83 |
+
gr.Markdown("# π Tokenizaminer\n### The Tokenizer Examiner... 🕵️🕳️\nThe purpose of this tool is to examine the vocabulary and tokens of a models tokenizer and play with the results.")
|
84 |
+
with gr.Row():
|
85 |
+
gr.Markdown("\n#### 1. Load Tokenizer\nSelect from the list or enter any model from 🤗 Hugging Face Models, it will only download the Tokenizer data! Image models won't work here.")
|
86 |
with gr.Group():
|
87 |
+
input_checkpoint = gr.Dropdown(choices=checkpoints, value=checkpoint, allow_custom_value=True, container=False)
|
88 |
btn_load_tokenizer = gr.Button(value="Load Tokenizer")
|
89 |
with gr.Row():
|
90 |
+
gr.Markdown("\n#### 2. Sequence & Tokenize")
|
91 |
+
with gr.Row():
|
92 |
+
input_sequence = gr.TextArea(value=sequence, placeholder=placeholder, lines=3, interactive=True, container=False)
|
93 |
with gr.Row():
|
94 |
btn_tokenize = gr.Button(value="Tokenize!")
|
95 |
btn_random_seq = gr.Button(value="Randomize!")
|
96 |
with gr.Row():
|
97 |
+
gr.Markdown("\n#### 3. Decode\nYou can select and edit each cell individually - then hit Decode!")
|
98 |
+
with gr.Row():
|
99 |
+
token_id_pair = gr.DataFrame(col_count=(2,"fixed"), headers=["Token","ID"], type="array", datatype=["str", "number"], height=400, interactive=True)
|
100 |
with gr.Row():
|
101 |
btn_decode = gr.Button(value="Decode")
|
102 |
with gr.Row():
|
|
|
107 |
output_decoded_ids = gr.TextArea(label="Decoded IDs", interactive=False)
|
108 |
with gr.Column(scale=1):
|
109 |
with gr.Group():
|
110 |
+
gr.Markdown("\n#### Tokenizer Data")
|
111 |
output_vocab_count = gr.Number(label="Vocab Size", interactive=False)
|
112 |
output_unknown_token = gr.Textbox(label="Unknown Token", interactive=False)
|
113 |
output_vocab = gr.Code(label="Vocabulary")
|