# Spaces: Sleeping
# File size: 1,816 Bytes
# f081412
import gradio as gr
from src.hindi_bpe import HindiBPE
# Initialize the tokenizer once at import time; process_text() uses this
# shared module-level instance for every request.
# NOTE(review): max_vocab_size / target_compression presumably correspond to
# the "< 5000 tokens" and ">= 3.2" figures quoted in the UI description --
# confirm against src.hindi_bpe.
tokenizer = HindiBPE(max_vocab_size=5000, target_compression=3.2)
def process_text(text: str, mode: str) -> str:
    """Run the module-level tokenizer on ``text`` and format the result.

    Args:
        text: Text entered by the user. ``None``, an empty string, or
            whitespace-only input is treated as "nothing entered".
        mode: ``"Encode"`` to show the encoded tokens; any other value
            performs an encode/decode round trip.

    Returns:
        A prompt asking for input, the encoded tokens, or a three-line
        round-trip report (original text, decoded text, match indicator).
    """
    # The UI layer may hand over None instead of "" (Gradio types the Textbox
    # input as ``str | None``); guard before calling .strip() so the user gets
    # the prompt rather than an AttributeError.
    if text is None or not text.strip():
        return "Please enter some text."

    # Both modes start from the same encoding, so compute it once.
    encoded = tokenizer.encode(text)
    if mode == "Encode":
        return f"Encoded tokens: {encoded}"

    # Round trip: decode what was just encoded and compare with the input.
    decoded = tokenizer.decode(encoded)
    matches = "✓" if text == decoded else "✗"
    return f"Original: {text}\nDecoded: {decoded}\nMatches: {matches}"
# Assemble the web UI: a text box and an operation selector feed
# process_text, and its return value is shown in a single result box.
iface = gr.Interface(
    fn=process_text,
    inputs=[
        gr.Textbox(label="Enter Hindi Text", placeholder="नमस्ते भारत"),
        gr.Radio(
            ["Encode", "Encode & Decode"],
            label="Operation",
            value="Encode & Decode",
        ),
    ],
    outputs=gr.Textbox(label="Result"),
    title="Hindi BPE Tokenizer",
    description="""This is a Byte Pair Encoding (BPE) tokenizer specifically designed for Hindi text.
Features:
- Vocabulary size: < 5000 tokens
- Compression ratio: ≥ 3.2
- Proper handling of Hindi Unicode characters and combining marks""",
    # Every sample sentence is paired with the round-trip operation.
    examples=[
        [sentence, "Encode & Decode"]
        for sentence in (
            "नमस्ते भारत",
            "मैं हिंदी सीख रहा हूं",
            "यह एक परीक्षण वाक्य है",
            "भारत एक विशाल देश है",
            "मुझे हिंदी भाषा बहुत पसंद है",
        )
    ],
)
if __name__ == "__main__":
    # Start the Gradio server only when executed as a script, not on import.
    iface.launch()