Spaces:
Running
Running
File size: 2,482 Bytes
f081412 f61c187 f081412 f61c187 f081412 f61c187 f081412 f61c187 f081412 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
import gradio as gr
from src.hindi_bpe import HindiBPE
import pickle
import os
# Initialize the tokenizer. Its vocab tables are overwritten below with the
# state saved in the production model file.
tokenizer = HindiBPE(max_vocab_size=5000, target_compression=3.2)

# Load production model state
model_file = 'hindi_bpe_model.pkl'

# Fail fast at import time: the app cannot serve requests without the
# trained tables, so a missing file is reported before any UI is built.
if not os.path.exists(model_file):
    raise FileNotFoundError("Production model not found! Please run train_bpe.py first and copy the model file.")

print("Loading production model...")
with open(model_file, 'rb') as f:
    # SECURITY NOTE: pickle.load can execute arbitrary code embedded in the
    # file. Only deploy a model file produced by your own training run;
    # never load a pickle obtained from an untrusted source.
    state = pickle.load(f)

# The saved state is read as a mapping with exactly these three keys; a
# missing key raises KeyError here rather than failing later during encoding.
tokenizer.vocab = state['vocab']
tokenizer.inverse_vocab = state['inverse_vocab']
tokenizer.bpe_ranks = state['bpe_ranks']
print("Model loaded successfully!")
print(f"Vocabulary size: {len(tokenizer.vocab)} tokens")
def process_text(text: str, mode: str) -> str:
    """Run the module-level tokenizer on *text* and format the result.

    Args:
        text: Text entered in the UI. ``None`` and whitespace-only input are
            both treated as "nothing entered".
        mode: ``"Encode"`` returns the encoded tokens; any other value
            performs an encode/decode round trip.

    Returns:
        A display string: a prompt for empty input, the encoded tokens, or
        the original text, the decoded text and a match indicator.
    """
    # The textbox value can arrive as None instead of "", and None has no
    # .strip(); check for it first so the UI shows the prompt, not an error.
    if text is None or not text.strip():
        return "Please enter some text."

    # Both modes need the encoded form, so compute it once.
    encoded = tokenizer.encode(text)
    if mode == "Encode":
        return f"Encoded tokens: {encoded}"

    # Round trip: decode what was just encoded and compare with the input.
    decoded = tokenizer.decode(encoded)
    verdict = '✓' if text == decoded else '✗'
    return f"Original: {text}\nDecoded: {decoded}\nMatches: {verdict}"
# Build the UI pieces separately, then wire them into the interface.
_ROUND_TRIP = "Encode & Decode"

_text_input = gr.Textbox(label="Enter Hindi Text", placeholder="नमस्ते भारत")
_mode_input = gr.Radio(["Encode", _ROUND_TRIP], label="Operation", value=_ROUND_TRIP)
_result_output = gr.Textbox(label="Result")

# Text shown under the title; continuation lines stay at column 0 so the
# string content is unchanged.
_DESCRIPTION = """This is a production-grade Byte Pair Encoding (BPE) tokenizer trained on 1 million Hindi sentences.
Features:
- Vocabulary size: < 5000 tokens
- Compression ratio: ≥ 3.2
- Trained on 1M sentences
- Proper handling of Hindi Unicode characters and combining marks"""

# Every sample sentence is offered with the round-trip operation selected.
_SAMPLE_SENTENCES = (
    "नमस्ते भारत",
    "मैं हिंदी सीख रहा हूं",
    "यह एक परीक्षण वाक्य है",
    "भारत एक विशाल देश है",
    "मुझे हिंदी भाषा बहुत पसंद है",
)
_examples = [[sentence, _ROUND_TRIP] for sentence in _SAMPLE_SENTENCES]

# Create the interface
iface = gr.Interface(
    fn=process_text,
    inputs=[_text_input, _mode_input],
    outputs=_result_output,
    title="Hindi BPE Tokenizer (Production Model)",
    description=_DESCRIPTION,
    examples=_examples,
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()