Spaces:

jatingocodeo
/

Hindi-BPE

Sleeping

App Files Files Community

Hindi-BPE / app.py

jatingocodeo

Create app.py

f081412 verified 4 days ago

raw

history blame

1.82 kB

	import gradio as gr
	from src.hindi_bpe import HindiBPE

	# Build the tokenizer once at import time; process_text reuses this instance.
	tokenizer = HindiBPE(
	    max_vocab_size=5000,
	    target_compression=3.2,
	)

	def process_text(text: str, mode: str) -> str:
	    """Run the module-level tokenizer on ``text`` and describe the result.

	    Args:
	        text: Input to tokenize. ``None``, empty, or whitespace-only input
	            is answered with a prompt instead of being tokenized.
	        mode: ``"Encode"`` reports the encoded tokens only; any other
	            value performs an encode-then-decode round trip.

	    Returns:
	        A human-readable message: the prompt for blank input, the encoded
	        tokens, or the original/decoded pair with a match indicator.
	    """
	    # Guard against None as well as blank strings: None has no .strip(),
	    # so the bare `text.strip()` check would raise AttributeError.
	    if text is None or not text.strip():
	        return "Please enter some text."

	    # Both modes start from the same encoding, so compute it once.
	    encoded = tokenizer.encode(text)
	    if mode == "Encode":
	        return f"Encoded tokens: {encoded}"

	    # Decode the tokens again to show whether the round trip reproduces the input.
	    decoded = tokenizer.decode(encoded)
	    match_mark = "✓" if text == decoded else "✗"
	    return f"Original: {text}\nDecoded: {decoded}\nMatches: {match_mark}"

	# Assemble the Gradio UI: a text box and an operation selector feed
	# process_text, whose returned string is shown in a single result box.
	iface = gr.Interface(
	    fn=process_text,
	    inputs=[
	        gr.Textbox(label="Enter Hindi Text", placeholder="नमस्ते भारत"),
	        gr.Radio(
	            ["Encode", "Encode & Decode"],
	            label="Operation",
	            value="Encode & Decode",
	        ),
	    ],
	    outputs=gr.Textbox(label="Result"),
	    title="Hindi BPE Tokenizer",
	    description="""This is a Byte Pair Encoding (BPE) tokenizer specifically designed for Hindi text.
	Features:
	- Vocabulary size: < 5000 tokens
	- Compression ratio: ≥ 3.2
	- Proper handling of Hindi Unicode characters and combining marks""",
	    # Every bundled example uses the round-trip operation.
	    examples=[
	        [sentence, "Encode & Decode"]
	        for sentence in (
	            "नमस्ते भारत",
	            "मैं हिंदी सीख रहा हूं",
	            "यह एक परीक्षण वाक्य है",
	            "भारत एक विशाल देश है",
	            "मुझे हिंदी भाषा बहुत पसंद है",
	        )
	    ],
	)

	# Start the Gradio server only when this file is executed as a script.
	if __name__ == "__main__":
	    iface.launch()