import gradio as gr
from tokenizers import Tokenizer
import json
from huggingface_hub import hf_hub_download


# Download tokenizer files from the HF Hub
def get_tokenizer():
    try:
        # Download tokenizer.json
        tokenizer_path = hf_hub_download(
            repo_id="Saiteja/telugu-bpe",
            filename="tokenizer.json",
            repo_type="model"
        )
        # Download examples.json
        examples_path = hf_hub_download(
            repo_id="Saiteja/telugu-bpe",
            filename="examples.json",
            repo_type="model"
        )
        return tokenizer_path, examples_path
    except Exception as e:
        print(f"Error downloading files: {e}")
        return None, None


# Get tokenizer and examples
tokenizer_path, examples_path = get_tokenizer()

# The tokenizer is required; fail early with a clear message if the download failed
if tokenizer_path is None:
    raise RuntimeError("Could not download tokenizer.json from the HF Hub.")

# Load the tokenizer
tokenizer = Tokenizer.from_file(tokenizer_path)

# Hardcoded fallback examples, used if examples.json cannot be downloaded
FALLBACK_EXAMPLES = [
    "నమస్కారం",  # Hello
    "తెలుగు భాష చాలా అందమైనది",  # Telugu language is very beautiful
    "భారతదేశం నా దేశం",  # India is my country
    "తెలుగు సాహిత్యం చాలా సమృద్ధిగా ఉంది",  # Telugu literature is very rich
    "నేను తెలుగు భాషను ప్రేమిస్తున్నాను",  # I love Telugu language
]

# Load examples; each entry in examples.json is expected to look like {"text": "..."}
if examples_path is not None:
    with open(examples_path, "r", encoding="utf-8") as f:
        examples_data = json.load(f)
    example_texts = [example["text"] for example in examples_data]
else:
    example_texts = FALLBACK_EXAMPLES


def tokenize_text(text):
    """Tokenize the input text and return tokens, token IDs, and the compression ratio."""
    if not text.strip():
        return "Please enter some text."
    try:
        encoding = tokenizer.encode(text)
        # Compression ratio: characters of input per produced token
        compression_ratio = len(text) / len(encoding.ids)
        result = f"""Tokens: {encoding.tokens}
Token IDs: {encoding.ids}
Number of tokens: {len(encoding.ids)}
Text length: {len(text)}
Compression ratio: {compression_ratio:.2f}"""
        return result
    except Exception as e:
        return f"Error: {str(e)}"


# Create the Gradio interface
iface = gr.Interface(
    fn=tokenize_text,
    inputs=gr.Textbox(
        lines=5,
        placeholder="Enter Telugu text here...",
        label="Input Text"
    ),
    outputs=gr.Textbox(
        label="Tokenization Results",
        lines=10
    ),
    title="Telugu Tokenizer Demo",
    description="""This demo uses a custom Telugu tokenizer trained on a large corpus of Telugu text.
The tokenizer has a vocabulary of 50,000+ tokens and achieves a compression ratio above 3.0.
Try entering some Telugu text to see how it is tokenized!

Tokenizer: https://huggingface.co/Saiteja/telugu-bpe""",
    examples=example_texts,
    theme=gr.themes.Soft()
)

# Launch the app
if __name__ == "__main__":
    iface.launch()
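
# A minimal usage note, as comments (the filename app.py is an assumption;
# the source does not name the file). The imported packages can be installed with:
#
#   pip install gradio tokenizers huggingface_hub
#
# and the demo started with:
#
#   python app.py
#
# then opened at the local URL Gradio prints (http://127.0.0.1:7860 by default).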