File size: 1,816 Bytes
f081412
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import gradio as gr
from src.hindi_bpe import HindiBPE

# Module-level tokenizer instance, built once at import time with a
# 5000-entry vocabulary cap and a 3.2 target compression setting.
# NOTE(review): no vocabulary load/train call is visible in this file --
# confirm that HindiBPE's constructor yields a ready-to-use tokenizer.
tokenizer = HindiBPE(
    max_vocab_size=5000,
    target_compression=3.2,
)

def process_text(text: str, mode: str) -> str:
    """Encode the input text, or run an encode/decode round trip on it.

    Args:
        text: Raw text from the input box. ``None``, empty and
            whitespace-only values are all treated as "nothing entered".
        mode: Operation selected in the UI. ``"Encode"`` returns the
            encoded tokens; any other value performs the round trip.

    Returns:
        A human-readable message: a prompt to enter text when the input is
        blank, the encoded tokens in ``"Encode"`` mode, or otherwise the
        original text, the decoded text and a check mark / cross telling
        whether the two are equal.
    """
    # The UI callback may receive ``None`` instead of a string (e.g. when
    # no value is supplied); guard before calling ``str.strip``.
    if not text or not text.strip():
        return "Please enter some text."

    # Both modes need the encoded form, so compute it once up front.
    encoded = tokenizer.encode(text)
    if mode == "Encode":
        return f"Encoded tokens: {encoded}"

    # Round trip: decode what was just encoded and compare with the input.
    decoded = tokenizer.decode(encoded)
    verdict = "✓" if text == decoded else "✗"
    return f"Original: {text}\nDecoded: {decoded}\nMatches: {verdict}"

# Build the demo UI: a text input plus an operation selector feeding
# process_text, with a single text box for the result.
_ROUND_TRIP = "Encode & Decode"

# Sentences offered as clickable examples; each one runs the round trip.
_sample_sentences = (
    "नमस्ते भारत",
    "मैं हिंदी सीख रहा हूं",
    "यह एक परीक्षण वाक्य है",
    "भारत एक विशाल देश है",
    "मुझे हिंदी भाषा बहुत पसंद है",
)

# Components are created in the same order as they appear in the UI:
# input text, operation selector, then the output box.
_text_input = gr.Textbox(label="Enter Hindi Text", placeholder="नमस्ते भारत")
_mode_choice = gr.Radio(
    ["Encode", _ROUND_TRIP],
    label="Operation",
    value=_ROUND_TRIP,
)
_result_box = gr.Textbox(label="Result")

# Description text shown under the title (line breaks and indentation are
# part of the string).
_description = (
    "This is a Byte Pair Encoding (BPE) tokenizer specifically designed for Hindi text.\n"
    "    Features:\n"
    "    - Vocabulary size: < 5000 tokens\n"
    "    - Compression ratio: ≥ 3.2\n"
    "    - Proper handling of Hindi Unicode characters and combining marks"
)

iface = gr.Interface(
    fn=process_text,
    inputs=[_text_input, _mode_choice],
    outputs=_result_box,
    title="Hindi BPE Tokenizer",
    description=_description,
    examples=[[sentence, _ROUND_TRIP] for sentence in _sample_sentences],
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()