"""Gradio demo for the Hindi BPE tokenizer.

At import time this module builds a ``HindiBPE`` tokenizer, restores its
``vocab``, ``inverse_vocab`` and ``bpe_ranks`` from a pickled state file, and
constructs a ``gr.Interface`` that lets a user encode text or run an
encode/decode round trip. Running the module as a script launches the UI.
"""

import os
import pickle

import gradio as gr

from src.hindi_bpe import HindiBPE

# Initialize the tokenizer
tokenizer = HindiBPE(max_vocab_size=5000, target_compression=3.2)

# Load production model state
model_file = 'hindi_bpe_model.pkl'

# Fail fast at import: the UI is useless without the trained state.
if not os.path.exists(model_file):
    raise FileNotFoundError(
        "Production model not found! Please run train_bpe.py first and copy the model file."
    )

print("Loading production model...")
with open(model_file, 'rb') as f:
    # NOTE(security): unpickling can execute arbitrary code embedded in the
    # file. Only load a model file that comes from a trusted source.
    state = pickle.load(f)

tokenizer.vocab = state['vocab']
tokenizer.inverse_vocab = state['inverse_vocab']
tokenizer.bpe_ranks = state['bpe_ranks']
print("Model loaded successfully!")
print(f"Vocabulary size: {len(tokenizer.vocab)} tokens")


def process_text(text: str, mode: str) -> str:
    """Run the tokenizer on *text* and format the result for display.

    Args:
        text: The text entered by the user. ``None``, empty, or
            whitespace-only input yields a prompt message instead of
            being tokenized.
        mode: ``"Encode"`` to show only the encoded tokens; any other value
            performs an encode followed by a decode.

    Returns:
        A human-readable string: the prompt message, the encoded tokens, or
        the original text, the decoded text, and whether the two match.
    """
    # The textbox component can pass None instead of a string; guard before
    # calling .strip() so that case gets the same prompt as empty input.
    if not text or not text.strip():
        return "Please enter some text."

    # Both modes need the encoded form, so compute it once.
    encoded = tokenizer.encode(text)
    if mode == "Encode":
        return f"Encoded tokens: {encoded}"

    # Decode the tokens again to show the round trip
    decoded = tokenizer.decode(encoded)
    return f"Original: {text}\nDecoded: {decoded}\nMatches: {'✓' if text == decoded else '✗'}"


# Create the interface
iface = gr.Interface(
    fn=process_text,
    inputs=[
        gr.Textbox(label="Enter Hindi Text", placeholder="नमस्ते भारत"),
        gr.Radio(["Encode", "Encode & Decode"], label="Operation", value="Encode & Decode"),
    ],
    outputs=gr.Textbox(label="Result"),
    title="Hindi BPE Tokenizer (Production Model)",
    # Adjacent literals are concatenated; the text is kept exactly as it was.
    description=(
        "This is a production-grade Byte Pair Encoding (BPE) tokenizer "
        "trained on 1 million Hindi sentences. \n"
        "Features: - Vocabulary size: < 5000 tokens "
        "- Compression ratio: ≥ 3.2 "
        "- Trained on 1M sentences "
        "- Proper handling of Hindi Unicode characters and combining marks"
    ),
    examples=[
        ["नमस्ते भारत", "Encode & Decode"],
        ["मैं हिंदी सीख रहा हूं", "Encode & Decode"],
        ["यह एक परीक्षण वाक्य है", "Encode & Decode"],
        ["भारत एक विशाल देश है", "Encode & Decode"],
        ["मुझे हिंदी भाषा बहुत पसंद है", "Encode & Decode"],
    ],
)

if __name__ == "__main__":
    iface.launch()