File size: 2,482 Bytes
f081412
 
f61c187
 
f081412
 
 
 
f61c187
 
 
 
 
 
 
 
 
 
 
 
 
 
f081412
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f61c187
 
f081412
 
 
f61c187
f081412
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import gradio as gr
from src.hindi_bpe import HindiBPE
import pickle
import os

# Construct the tokenizer; its vocab, inverse_vocab and bpe_ranks tables are
# overwritten below with the state saved in the model file.
tokenizer = HindiBPE(max_vocab_size=5000, target_compression=3.2)

# Load production model state. The app cannot run without it, so a missing
# file is a hard error at import time.
model_file = 'hindi_bpe_model.pkl'
if not os.path.exists(model_file):
    raise FileNotFoundError("Production model not found! Please run train_bpe.py first and copy the model file.")

print("Loading production model...")
# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only deploy a model file produced by a trusted training run; never point
# this at user-supplied or downloaded-from-unknown-source data.
with open(model_file, 'rb') as f:
    state = pickle.load(f)

# The file is closed at this point; everything needed lives in `state`.
# A missing key raises KeyError, which signals an incompatible model file.
tokenizer.vocab = state['vocab']
tokenizer.inverse_vocab = state['inverse_vocab']
tokenizer.bpe_ranks = state['bpe_ranks']
print("Model loaded successfully!")
print(f"Vocabulary size: {len(tokenizer.vocab)} tokens")

def process_text(text: str, mode: str) -> str:
    """Run the tokenizer on ``text`` and format the result for display.

    Args:
        text: Text to tokenize. ``None``, empty, or whitespace-only input is
            treated as "nothing entered".
        mode: ``"Encode"`` to show the encoded tokens; any other value
            performs an encode/decode round trip.

    Returns:
        A prompt asking for text when the input is blank; the encoded tokens
        for ``"Encode"``; otherwise the original text, the decoded text, and
        a check or cross mark telling whether the two are equal.
    """
    # The UI layer may hand over None instead of "" for an empty text box;
    # guard before calling .strip() so the user sees the prompt rather than
    # an AttributeError.
    if text is None or not text.strip():
        return "Please enter some text."

    # Both operations start from the encoded form.
    encoded = tokenizer.encode(text)
    if mode == "Encode":
        return f"Encoded tokens: {encoded}"

    # Round trip: decode what was just encoded and compare with the input.
    decoded = tokenizer.decode(encoded)
    match_mark = '✓' if text == decoded else '✗'
    return f"Original: {text}\nDecoded: {decoded}\nMatches: {match_mark}"

# Build the Gradio UI: a text box and an operation selector feed
# process_text, and the returned string is shown in one result box.
_text_input = gr.Textbox(label="Enter Hindi Text", placeholder="नमस्ते भारत")
_operation_input = gr.Radio(["Encode", "Encode & Decode"], label="Operation", value="Encode & Decode")
_result_output = gr.Textbox(label="Result")

# Shown under the title; continuation lines keep their leading spaces.
_description = """This is a production-grade Byte Pair Encoding (BPE) tokenizer trained on 1 million Hindi sentences.
    Features:
    - Vocabulary size: < 5000 tokens
    - Compression ratio: ≥ 3.2
    - Trained on 1M sentences
    - Proper handling of Hindi Unicode characters and combining marks"""

# Every sample sentence is offered with the round-trip operation selected.
_examples = [
    [sentence, "Encode & Decode"]
    for sentence in (
        "नमस्ते भारत",
        "मैं हिंदी सीख रहा हूं",
        "यह एक परीक्षण वाक्य है",
        "भारत एक विशाल देश है",
        "मुझे हिंदी भाषा बहुत पसंद है",
    )
]

iface = gr.Interface(
    fn=process_text,
    inputs=[_text_input, _operation_input],
    outputs=_result_output,
    title="Hindi BPE Tokenizer (Production Model)",
    description=_description,
    examples=_examples,
)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()