jatingocodeo committed on
Commit
f61c187
·
verified ·
1 Parent(s): ca6a8e2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -2
app.py CHANGED
@@ -1,9 +1,25 @@
1
  import gradio as gr
2
  from src.hindi_bpe import HindiBPE
 
 
3
 
4
  # Initialize the tokenizer
5
  tokenizer = HindiBPE(max_vocab_size=5000, target_compression=3.2)
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  def process_text(text: str, mode: str) -> str:
8
  """Process text using the tokenizer"""
9
  if not text.strip():
@@ -27,11 +43,12 @@ iface = gr.Interface(
27
  gr.Radio(["Encode", "Encode & Decode"], label="Operation", value="Encode & Decode")
28
  ],
29
  outputs=gr.Textbox(label="Result"),
30
- title="Hindi BPE Tokenizer",
31
- description="""This is a Byte Pair Encoding (BPE) tokenizer specifically designed for Hindi text.
32
  Features:
33
  - Vocabulary size: < 5000 tokens
34
  - Compression ratio: ≥ 3.2
 
35
  - Proper handling of Hindi Unicode characters and combining marks""",
36
  examples=[
37
  ["नमस्ते भारत", "Encode & Decode"],
 
1
  import gradio as gr
2
  from src.hindi_bpe import HindiBPE
3
+ import pickle
4
+ import os
5
 
6
  # Initialize the tokenizer
7
  tokenizer = HindiBPE(max_vocab_size=5000, target_compression=3.2)
8
 
9
+ # Load production model state
10
+ model_file = 'hindi_bpe_model.pkl'
11
+ if os.path.exists(model_file):
12
+ print("Loading production model...")
13
+ with open(model_file, 'rb') as f:
14
+ state = pickle.load(f)
15
+ tokenizer.vocab = state['vocab']
16
+ tokenizer.inverse_vocab = state['inverse_vocab']
17
+ tokenizer.bpe_ranks = state['bpe_ranks']
18
+ print(f"Model loaded successfully!")
19
+ print(f"Vocabulary size: {len(tokenizer.vocab)} tokens")
20
+ else:
21
+ raise FileNotFoundError("Production model not found! Please run train_bpe.py first and copy the model file.")
22
+
23
  def process_text(text: str, mode: str) -> str:
24
  """Process text using the tokenizer"""
25
  if not text.strip():
 
43
  gr.Radio(["Encode", "Encode & Decode"], label="Operation", value="Encode & Decode")
44
  ],
45
  outputs=gr.Textbox(label="Result"),
46
+ title="Hindi BPE Tokenizer (Production Model)",
47
+ description="""This is a production-grade Byte Pair Encoding (BPE) tokenizer trained on 1 million Hindi sentences.
48
  Features:
49
  - Vocabulary size: < 5000 tokens
50
  - Compression ratio: ≥ 3.2
51
+ - Trained on 1M sentences
52
  - Proper handling of Hindi Unicode characters and combining marks""",
53
  examples=[
54
  ["नमस्ते भारत", "Encode & Decode"],