import ast

import gradio as gr
from transformers import AutoTokenizer

# Directory containing the locally downloaded model files
model_path = "models/"

# Available models
MODELS = ["Meta-Llama-3.1-8B"]


def process_input(input_type, input_value, model_name):
    # Load the tokenizer for the selected model
    tokenizer = AutoTokenizer.from_pretrained(model_path + model_name)

    if input_type == "Text":
        # Tokenize the text and encode it into token IDs
        tokens = tokenizer.tokenize(input_value)
        token_ids = tokenizer.encode(input_value)
        return f"Total tokens: {len(tokens)}", tokens, token_ids
    elif input_type == "Token IDs":
        try:
            # Parse the input as a Python list of integers, e.g. [128000, 9906, 1917]
            token_ids = ast.literal_eval(input_value)
            # Decode the token IDs back into text
            text = tokenizer.decode(token_ids)
            return f"Total tokens: {len(token_ids)}", text, input_value
        except (ValueError, SyntaxError):
            return (
                "Error",
                "Invalid input. Please enter a list of integers for Token IDs, e.g. [128000, 9906, 1917].",
                "",
            )


# Create the Gradio interface
iface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.Radio(["Text", "Token IDs"], label="Input Type", value="Text"),
        gr.Textbox(lines=5, label="Input"),
        gr.Dropdown(choices=MODELS, label="Select Model"),
    ],
    outputs=[
        gr.Textbox(label="Token Count"),
        gr.Textbox(label="Tokens", lines=10),
        gr.Textbox(label="Token IDs", lines=5),
    ],
    title="LLM Tokenization and Token ID Converter",
    description="Enter text or token IDs and select a model to see the conversion results.",
)

if __name__ == "__main__":
    iface.launch()