Spaces:

jatingocodeo
/

Hindi-BPE

Running

App Files Files Community

jatingocodeo committed 4 days ago

Commit

f081412

verified ·

1 Parent(s): fe890e3

Create app.py

Browse files

Files changed (1) hide show

app.py +46 -0

app.py ADDED Viewed

	@@ -0,0 +1,46 @@

+import gradio as gr
+from src.hindi_bpe import HindiBPE
+# Module-level tokenizer instance used by process_text for every request.
+# NOTE(review): no training or vocabulary-loading call is visible here;
+# presumably HindiBPE builds or loads its vocabulary internally — confirm.
+tokenizer = HindiBPE(max_vocab_size=5000, target_compression=3.2)
+def process_text(text: str, mode: str) -> str:
+    """Process text using the tokenizer"""
+    if not text.strip():
+        return "Please enter some text."
+    if mode == "Encode":
+        # Encode the text
+        encoded = tokenizer.encode(text)
+        return f"Encoded tokens: {encoded}"
+    else:
+        # First encode then decode to show the round trip
+        encoded = tokenizer.encode(text)
+        decoded = tokenizer.decode(encoded)
+        return f"Original: {text}\nDecoded: {decoded}\nMatches: {'✓' if text == decoded else '✗'}"
+# Create the interface
+iface = gr.Interface(
+    fn=process_text,
+    inputs=[
+        gr.Textbox(label="Enter Hindi Text", placeholder="नमस्ते भारत"),
+        gr.Radio(["Encode", "Encode & Decode"], label="Operation", value="Encode & Decode")
+    ],
+    outputs=gr.Textbox(label="Result"),
+    title="Hindi BPE Tokenizer",
+    description="""This is a Byte Pair Encoding (BPE) tokenizer specifically designed for Hindi text.
+    Features:
+    - Vocabulary size: < 5000 tokens
+    - Compression ratio: ≥ 3.2
+    - Proper handling of Hindi Unicode characters and combining marks""",
+    examples=[
+        ["नमस्ते भारत", "Encode & Decode"],
+        ["मैं हिंदी सीख रहा हूं", "Encode & Decode"],
+        ["यह एक परीक्षण वाक्य है", "Encode & Decode"],
+        ["भारत एक विशाल देश है", "Encode & Decode"],
+        ["मुझे हिंदी भाषा बहुत पसंद है", "Encode & Decode"]
+    ]
+)
+if __name__ == "__main__":
+    iface.launch()