Spaces:

vpkprasanna
/

TokenizerViz

Running

App Files Files Community

prasanna kumar committed on Sep 1, 2024

Commit

6e43644

1 Parent(s): b9a925b

text box added and decoding issue fixed

Browse files

Files changed (1) hide show

app.py +18 -11

app.py CHANGED Viewed

@@ -25,23 +25,29 @@ def create_vertical_histogram(data, title):
     )
     return fig
-def process_input(input_type, input_value, model_name):
     tokenizer = AutoTokenizer.from_pretrained(model_path + model_name)
     if input_type == "Text":
-        text = input_value
     elif input_type == "Token IDs":
-        try:
-            token_ids = ast.literal_eval(input_value)
-            text = tokenizer.decode(token_ids)
-        except ValueError:
-            return "Error", "Invalid input", "", "", "", None, None, None
     character_count = len(text)
     word_count = len(text.split())
-    token_ids = tokenizer.encode(text, add_special_tokens=True)
-    tokens = tokenizer.convert_ids_to_tokens(token_ids)
     space_count = sum(1 for token in tokens if token == '▁')
     special_char_count = sum(1 for token in tokens if not token.isalnum() and token != '▁')
@@ -65,7 +71,7 @@ def process_input(input_type, input_value, model_name):
     analysis += f"Special character tokens: {special_char_count}\n"
     analysis += f"Other tokens: {len(tokens) - space_count - special_char_count}"
-    return analysis, " ".join(tokens), str(token_ids), words_hist, special_chars_hist, numbers_hist
 def text_example():  # Return a fixed sample sentence (takes no arguments).
     return "Hello, world! This is an example text input for tokenization."
@@ -90,6 +96,7 @@ with gr.Blocks() as iface:
     submit_button = gr.Button("Process")
     analysis_output = gr.Textbox(label="Analysis", lines=6)
     tokens_output = gr.Textbox(label="Tokens", lines=3)
     token_ids_output = gr.Textbox(label="Token IDs", lines=2)
@@ -111,7 +118,7 @@ with gr.Blocks() as iface:
     submit_button.click(
         process_input,
         inputs=[input_type, input_text, model_name],
-        outputs=[analysis_output, tokens_output, token_ids_output, words_plot, special_chars_plot, numbers_plot]
     )
 if __name__ == "__main__":

     )
     return fig
+def process_text(text:str,model_name):  # Tokenize `text` with the tokenizer loaded for `model_name`; returns (text, tokens, token_ids).
+    tokenizer = AutoTokenizer.from_pretrained(model_path + model_name)  # Loaded on every call; `model_path` is defined outside this hunk.
+    token_ids = tokenizer.encode(text, add_special_tokens=True)  # Encode with special tokens requested.
+    tokens = tokenizer.convert_ids_to_tokens(token_ids)  # Token strings corresponding to the encoded ids.
+    return text,tokens,token_ids,  # The trailing comma is redundant: this is still a 3-tuple, with `text` passed through unchanged.
+def process_ids(ids:str,model_name):  # Parse `ids` (a string holding a Python literal) and decode it with the tokenizer for `model_name`; returns (text, tokens, token_ids).
     tokenizer = AutoTokenizer.from_pretrained(model_path + model_name)  # Loaded on every call; `model_path` is defined outside this hunk.
+    token_ids = ast.literal_eval(ids)  # NOTE(review): malformed input raises here (e.g. ValueError/SyntaxError) and nothing in this function catches it; presumably `ids` is a list of ints such as "[1, 2, 3]" -- confirm against the caller.
+    text = tokenizer.decode(token_ids)  # Decode the parsed ids back into a string.
+    tokens = tokenizer.convert_ids_to_tokens(token_ids)  # Token strings for the same ids, without re-encoding `text`.
+    return text,tokens,token_ids  # Same (text, tokens, token_ids) ordering that process_text returns.
+def process_input(input_type, input_value, model_name):
     if input_type == "Text":
+        text,tokens,token_ids = process_text(text=input_value,model_name=model_name)
     elif input_type == "Token IDs":
+        text,tokens,token_ids = process_ids(ids=input_value,model_name=model_name)
     character_count = len(text)
     word_count = len(text.split())
     space_count = sum(1 for token in tokens if token == '▁')
     special_char_count = sum(1 for token in tokens if not token.isalnum() and token != '▁')
     analysis += f"Special character tokens: {special_char_count}\n"
     analysis += f"Other tokens: {len(tokens) - space_count - special_char_count}"
+    return analysis, text,tokens, str(token_ids), words_hist, special_chars_hist, numbers_hist
 def text_example():  # Return a fixed sample sentence (takes no arguments).
     return "Hello, world! This is an example text input for tokenization."
     submit_button = gr.Button("Process")
     analysis_output = gr.Textbox(label="Analysis", lines=6)
+    text_output = gr.Textbox(label="Text",lines=6)
     tokens_output = gr.Textbox(label="Tokens", lines=3)
     token_ids_output = gr.Textbox(label="Token IDs", lines=2)
     submit_button.click(
         process_input,
         inputs=[input_type, input_text, model_name],
+        outputs=[analysis_output,text_output ,tokens_output, token_ids_output, words_plot, special_chars_plot, numbers_plot]
     )
 if __name__ == "__main__":