Spaces:

Taranosaurus
/

Tokenizaminer

Sleeping

App Files Community

Taranosaurus committed on Jan 3, 2024

Commit

2488d19

1 Parent(s): bbb587f

Formatting and legibility changes

Browse files

Files changed (1) hide show

app.py +11 -4

app.py CHANGED Viewed

@@ -80,17 +80,23 @@ def de_tokenize_er(pairs):
 with gr.Blocks() as frontend:
     with gr.Row():
         with gr.Column(scale=3):
-            gr.Markdown("# 🐇 Tokenizaminer\n\n### The Tokenizer Examiner... 🕵️🕳️\n\nThe purpose of this tool is to examine the vocabulary and tokens of a models tokenizer and play with the results.\n\n## Instructions\n\n1. Load a tokenizer\n2. Type and Tokenize a sequence\n3. Manipulate it to see what happens!")
             with gr.Group():
-                input_checkpoint = gr.Dropdown(label="1. Tokenizer", choices=checkpoints, value=checkpoint, allow_custom_value=True, info="Select from the list or enter any model from 🤗 Hugging Face Models, it will only download the Tokenizer data! Image models won't work here.")
                 btn_load_tokenizer = gr.Button(value="Load Tokenizer")
             with gr.Row():
-                input_sequence = gr.TextArea(label="2. Sequence", value=sequence, placeholder=placeholder, lines=3, interactive=True)
             with gr.Row():
                     btn_tokenize = gr.Button(value="Tokenize!")
                     btn_random_seq = gr.Button(value="Randomize!")
             with gr.Row():
-                token_id_pair = gr.DataFrame(label="3. Decode", col_count=(2,"fixed"), headers=["Token","ID"], type="array", datatype=["str", "number"], height=400, interactive=True)
             with gr.Row():
                 btn_decode = gr.Button(value="Decode")
             with gr.Row():
@@ -101,6 +107,7 @@ with gr.Blocks() as frontend:
                     output_decoded_ids = gr.TextArea(label="Decoded IDs", interactive=False)
         with gr.Column(scale=1):
             with gr.Group():
                 output_vocab_count = gr.Number(label="Vocab Size", interactive=False)
                 output_unknown_token = gr.Textbox(label="Unknown Token", interactive=False)
                 output_vocab = gr.Code(label="Vocabulary")

 with gr.Blocks() as frontend:
     with gr.Row():
         with gr.Column(scale=3):
+            gr.Markdown("# 🐇 Tokenizaminer\n### The Tokenizer Examiner... 🕵️🕳️\nThe purpose of this tool is to examine the vocabulary and tokens of a models tokenizer and play with the results.")
+            with gr.Row():
+                gr.Markdown("\n#### 1. Load Tokenizer\nSelect from the list or enter any model from 🤗 Hugging Face Models, it will only download the Tokenizer data! Image models won't work here.")
             with gr.Group():
+                input_checkpoint = gr.Dropdown(choices=checkpoints, value=checkpoint, allow_custom_value=True, container=False)
                 btn_load_tokenizer = gr.Button(value="Load Tokenizer")
             with gr.Row():
+                gr.Markdown("\n#### 2. Sequence & Tokenize")
+            with gr.Row():
+                input_sequence = gr.TextArea(value=sequence, placeholder=placeholder, lines=3, interactive=True, container=False)
             with gr.Row():
                     btn_tokenize = gr.Button(value="Tokenize!")
                     btn_random_seq = gr.Button(value="Randomize!")
             with gr.Row():
+                gr.Markdown("\n#### 3. Decode\nYou can select and edit each cell individually - then hit Decode!")
+            with gr.Row():
+                token_id_pair = gr.DataFrame(col_count=(2,"fixed"), headers=["Token","ID"], type="array", datatype=["str", "number"], height=400, interactive=True)
             with gr.Row():
                 btn_decode = gr.Button(value="Decode")
             with gr.Row():
                     output_decoded_ids = gr.TextArea(label="Decoded IDs", interactive=False)
         with gr.Column(scale=1):
             with gr.Group():
+                gr.Markdown("\n#### Tokenizer Data")
                 output_vocab_count = gr.Number(label="Vocab Size", interactive=False)
                 output_unknown_token = gr.Textbox(label="Unknown Token", interactive=False)
                 output_vocab = gr.Code(label="Vocabulary")