# Hindi-BPE / app.py
# jatingocodeo's picture
# Update app.py
# f61c187 verified
import gradio as gr
from src.hindi_bpe import HindiBPE
import pickle
import os
# Create the tokenizer; its vocab, inverse_vocab and bpe_ranks are filled in
# from the saved model state below.
tokenizer = HindiBPE(max_vocab_size=5000, target_compression=3.2)

# Load production model state.
# SECURITY: pickle.load can execute arbitrary code while unpickling, so this
# file must only ever come from a trusted source (e.g. the train_bpe.py run
# named in the error message below) -- never from user-supplied data.
model_file = 'hindi_bpe_model.pkl'
try:
    # Open directly rather than testing os.path.exists() first: that avoids a
    # race between the check and the open.
    f = open(model_file, 'rb')
except FileNotFoundError as err:
    # Same exception type and message as before, with the original error
    # chained so the traceback still shows the path that was tried.
    raise FileNotFoundError("Production model not found! Please run train_bpe.py first and copy the model file.") from err

with f:
    print("Loading production model...")
    state = pickle.load(f)

# Copy the three saved tables onto the tokenizer instance.
tokenizer.vocab = state['vocab']
tokenizer.inverse_vocab = state['inverse_vocab']
tokenizer.bpe_ranks = state['bpe_ranks']
print("Model loaded successfully!")
print(f"Vocabulary size: {len(tokenizer.vocab)} tokens")
def process_text(text: str, mode: str) -> str:
    """Encode text with the tokenizer, optionally decoding it back.

    Args:
        text: Text to tokenize. ``None`` or whitespace-only input is treated
            as empty.
        mode: ``"Encode"`` returns the encoded tokens; any other value runs
            the encode-then-decode round trip.

    Returns:
        A message for display: a prompt when the input is empty, the encoded
        tokens, or the original text next to its decoded form with a
        match indicator.
    """
    # Guard against None as well as blank strings, so a missing value yields
    # the prompt instead of an AttributeError on .strip().
    if text is None or not text.strip():
        return "Please enter some text."

    # Both modes start by encoding, so do it once up front.
    encoded = tokenizer.encode(text)
    if mode == "Encode":
        return f"Encoded tokens: {encoded}"

    # Round trip: decode what was just encoded and report whether it matches
    # the input exactly.
    decoded = tokenizer.decode(encoded)
    matches = '✓' if text == decoded else '✗'
    return f"Original: {text}\nDecoded: {decoded}\nMatches: {matches}"
# Build the UI pieces first, then hand them to gr.Interface in one call.

# Inputs: the text to tokenize and the operation to run on it.
_text_box = gr.Textbox(label="Enter Hindi Text", placeholder="नमस्ते भारत")
_mode_radio = gr.Radio(
    ["Encode", "Encode & Decode"],
    label="Operation",
    value="Encode & Decode",
)

# Description text displayed in the interface.
_description = """This is a production-grade Byte Pair Encoding (BPE) tokenizer trained on 1 million Hindi sentences.
Features:
- Vocabulary size: < 5000 tokens
- Compression ratio: ≥ 3.2
- Trained on 1M sentences
- Proper handling of Hindi Unicode characters and combining marks"""

# Each sample sentence is offered with the round-trip operation selected.
_sample_sentences = (
    "नमस्ते भारत",
    "मैं हिंदी सीख रहा हूं",
    "यह एक परीक्षण वाक्य है",
    "भारत एक विशाल देश है",
    "मुझे हिंदी भाषा बहुत पसंद है",
)
_examples = [[sentence, "Encode & Decode"] for sentence in _sample_sentences]

iface = gr.Interface(
    fn=process_text,
    inputs=[_text_box, _mode_radio],
    outputs=gr.Textbox(label="Result"),
    title="Hindi BPE Tokenizer (Production Model)",
    description=_description,
    examples=_examples,
)

if __name__ == "__main__":
    # Start the app only when this file is run as a script, not on import.
    iface.launch()