# TokenizerViz / app.py
# Author: prasanna kumar
# Commit: "Final commit for llama based model" (c6a1e30)
import gradio as gr
from transformers import AutoTokenizer
import ast

# Directory that holds the locally stored model folders; the selected model
# name from the UI is appended to this path when loading a tokenizer.
model_path = "models/"

# Models selectable in the UI (folder names expected under ``model_path``).
MODELS = ["Meta-Llama-3.1-8B"]
def process_input(input_type, input_value, model_name):
    """Tokenize text or decode token IDs with the selected model's tokenizer.

    Parameters
    ----------
    input_type : str
        Either "Text" or "Token IDs" (value of the Radio component).
    input_value : str
        The text to tokenize, or token IDs given either as a Python list
        literal (e.g. "[1, 2, 3]") or as space/comma-separated integers.
    model_name : str
        Name of a model directory under ``model_path``.

    Returns
    -------
    tuple
        Four values matching the four Gradio output Textboxes:
        (token count, character count, tokens/decoded text, token IDs).
    """
    # Loaded per call so switching models in the dropdown takes effect.
    tokenizer = AutoTokenizer.from_pretrained(model_path + model_name)

    if input_type == "Text":
        character_count = len(input_value)
        # Tokenize the text, including the model's special tokens.
        token_ids = tokenizer.encode(input_value, add_special_tokens=True)
        tokens = tokenizer.convert_ids_to_tokens(token_ids)
        return len(tokens), character_count, tokens, token_ids

    if input_type == "Token IDs":
        try:
            try:
                # Python literal form, e.g. "[1, 2, 3]".
                token_ids = list(ast.literal_eval(input_value))
            except (ValueError, SyntaxError):
                # Fall back to space/comma-separated integers, e.g. "1 2 3".
                # (literal_eval raises SyntaxError for this form, which the
                # original code failed to catch.)
                token_ids = [int(tok) for tok in input_value.replace(",", " ").split()]
            # Convert the token IDs back to text.
            text = tokenizer.decode(token_ids)
        except (ValueError, SyntaxError, TypeError):
            # Return FOUR values so every output Textbox gets a result
            # (the interface declares four outputs).
            return ("Error", "Error",
                    "Invalid input. Please enter space-separated integers for Token IDs.",
                    "")
        # NOTE(review): "Character Count" mirrors the token count here, as in
        # the original implementation — confirm whether len(input_value) was
        # intended instead.
        return len(token_ids), len(token_ids), text, input_value
# Build the Gradio interface: three inputs (input type, raw text, model
# choice) mapped onto process_input's four return values.
iface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.Radio(["Text", "Token IDs"], label="Input Type", value="Text"),
        gr.Textbox(lines=5, label="Input"),
        gr.Dropdown(choices=MODELS, label="Select Model"),
    ],
    outputs=[
        gr.Textbox(label="Token Count"),
        gr.Textbox(label="Character Count"),
        gr.Textbox(label="Tokens", lines=10),
        # Label casing fixed ("Token IDS" -> "Token IDs") to match the
        # input Radio choice and the rest of the UI text.
        gr.Textbox(label="Token IDs", lines=5),
    ],
    title="LLM Tokenization - Convert Text to tokens and vice versa!",
    description="Enter text or token IDs and select a model to see the results.",
)

if __name__ == "__main__":
    # Enable request queuing so slow tokenizer loads don't drop requests.
    iface.queue()
    iface.launch()