Spaces:
Sleeping
Sleeping
import gradio as gr | |
from src.hindi_bpe import HindiBPE | |
# Shared tokenizer instance used by every request handled by this app.
tokenizer = HindiBPE(
    max_vocab_size=5000,
    target_compression=3.2,
)
def process_text(text: str, mode: str) -> str:
    """Run the module-level tokenizer on *text* and describe the result.

    Args:
        text: Text entered by the user. ``None``, empty, or
            whitespace-only input is rejected with a prompt message.
        mode: ``"Encode"`` returns the encoded tokens; any other value
            performs an encode/decode round trip.

    Returns:
        A human-readable string: the prompt message, the encoded tokens,
        or the original text, its decoded form and a match indicator.
    """
    # The UI layer may pass None instead of "" (e.g. a cleared textbox or
    # a direct API call), so check for it before calling str.strip().
    if text is None or not text.strip():
        return "Please enter some text."

    # Both modes start by encoding the input, so do it once.
    encoded = tokenizer.encode(text)
    if mode == "Encode":
        return f"Encoded tokens: {encoded}"

    # Round trip: decode the tokens again and report whether the original
    # text was reproduced exactly.
    decoded = tokenizer.decode(encoded)
    matches = "✓" if text == decoded else "✗"
    return f"Original: {text}\nDecoded: {decoded}\nMatches: {matches}"
# Text shown under the title of the demo page.
_DESCRIPTION = (
    "This is a Byte Pair Encoding (BPE) tokenizer specifically designed for Hindi text.\n"
    "Features:\n"
    "- Vocabulary size: < 5000 tokens\n"
    "- Compression ratio: ≥ 3.2\n"
    "- Proper handling of Hindi Unicode characters and combining marks"
)

# Sample sentences offered as one-click examples; each one is paired with
# the round-trip operation.
_EXAMPLE_SENTENCES = (
    "नमस्ते भारत",
    "मैं हिंदी सीख रहा हूं",
    "यह एक परीक्षण वाक्य है",
    "भारत एक विशाल देश है",
    "मुझे हिंदी भाषा बहुत पसंद है",
)

# Build the Gradio UI: a text box and an operation selector feed
# process_text, whose string result is shown in a single output box.
iface = gr.Interface(
    fn=process_text,
    inputs=[
        gr.Textbox(label="Enter Hindi Text", placeholder="नमस्ते भारत"),
        gr.Radio(
            ["Encode", "Encode & Decode"],
            label="Operation",
            value="Encode & Decode",
        ),
    ],
    outputs=gr.Textbox(label="Result"),
    title="Hindi BPE Tokenizer",
    description=_DESCRIPTION,
    examples=[[sentence, "Encode & Decode"] for sentence in _EXAMPLE_SENTENCES],
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()