jatingocodeo committed on
Commit
f081412
·
verified ·
1 Parent(s): fe890e3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -0
app.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from src.hindi_bpe import HindiBPE
3
+
4
# Module-level tokenizer instance shared by process_text.
# NOTE(review): nothing visible here trains or loads a vocabulary — confirm
# that HindiBPE is usable for encode/decode straight after construction.
tokenizer = HindiBPE(
    max_vocab_size=5000,
    target_compression=3.2,
)
6
+
7
def process_text(text: str, mode: str) -> str:
    """Run the module-level tokenizer over *text* and format the result.

    Args:
        text: Text entered by the user. ``None`` or whitespace-only input
            is treated as empty.
        mode: ``"Encode"`` to show only the encoded tokens; any other value
            performs an encode/decode round trip.

    Returns:
        A human-readable message: a prompt for input when *text* is empty,
        the encoded tokens in "Encode" mode, or otherwise the original
        text, its decoded form and whether the two match.
    """
    # The UI callback may receive None instead of "" for an empty textbox;
    # None has no .strip(), so treat it the same as empty input.
    if text is None or not text.strip():
        return "Please enter some text."

    # Both modes need the encoded form, so compute it once up front.
    encoded = tokenizer.encode(text)
    if mode == "Encode":
        return f"Encoded tokens: {encoded}"

    # Round trip: decode what was just encoded and compare with the input.
    decoded = tokenizer.decode(encoded)
    matches = "✓" if text == decoded else "✗"
    return f"Original: {text}\nDecoded: {decoded}\nMatches: {matches}"
21
+
22
# Assemble the Gradio UI: a text box and an operation selector feed
# process_text, whose string result is shown in a single output text box.
iface = gr.Interface(
    title="Hindi BPE Tokenizer",
    description="""This is a Byte Pair Encoding (BPE) tokenizer specifically designed for Hindi text.
Features:
- Vocabulary size: < 5000 tokens
- Compression ratio: ≥ 3.2
- Proper handling of Hindi Unicode characters and combining marks""",
    fn=process_text,
    inputs=[
        gr.Textbox(placeholder="नमस्ते भारत", label="Enter Hindi Text"),
        gr.Radio(
            ["Encode", "Encode & Decode"],
            value="Encode & Decode",
            label="Operation",
        ),
    ],
    outputs=gr.Textbox(label="Result"),
    # Every sample sentence is paired with the full round-trip operation.
    examples=[
        [sentence, "Encode & Decode"]
        for sentence in (
            "नमस्ते भारत",
            "मैं हिंदी सीख रहा हूं",
            "यह एक परीक्षण वाक्य है",
            "भारत एक विशाल देश है",
            "मुझे हिंदी भाषा बहुत पसंद है",
        )
    ],
)

# Launch the app only when executed as a script, not when imported.
if __name__ == "__main__":
    iface.launch()