kavyamanohar committed on
Commit
54b4405
1 Parent(s): 58f1c79

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -60
app.py CHANGED
@@ -1,14 +1,16 @@
1
  import gradio as gr
 
2
  import numpy as np
 
3
  from tensorflow.keras.preprocessing.text import Tokenizer
4
  from tensorflow.keras.preprocessing.sequence import pad_sequences
 
5
  import re
6
  from huggingface_hub import from_pretrained_keras
7
 
8
- # Load the model from Hugging Face
9
  model = from_pretrained_keras("vrclc/transliteration")
10
 
11
- # Define source and target tokenizers
12
  source_tokens = list('abcdefghijklmnopqrstuvwxyz ')
13
  source_tokenizer = Tokenizer(char_level=True, filters='')
14
  source_tokenizer.fit_on_texts(source_tokens)
@@ -16,15 +18,19 @@ source_tokenizer.fit_on_texts(source_tokens)
16
  malayalam_tokens = [
17
  # Independent vowels
18
  'അ', 'ആ', 'ഇ', 'ഈ', 'ഉ', 'ഊ', 'ഋ', 'ൠ', 'ഌ', 'ൡ', 'എ', 'ഏ', 'ഐ', 'ഒ', 'ഓ', 'ഔ',
 
19
  # Consonants
20
  'ക', 'ഖ', 'ഗ', 'ഘ', 'ങ', 'ച', 'ഛ', 'ജ', 'ഝ', 'ഞ',
21
  'ട', 'ഠ', 'ഡ', 'ഢ', 'ണ', 'ത', 'ഥ', 'ദ', 'ധ', 'ന',
22
  'പ', 'ഫ', 'ബ', 'ഭ', 'മ', 'യ', 'ര', 'ല', 'വ', 'ശ',
23
  'ഷ', 'സ', 'ഹ', 'ള', 'ഴ', 'റ',
 
24
  # Chillu letters
25
  'ൺ', 'ൻ', 'ർ', 'ൽ', 'ൾ',
 
26
  # Additional characters
27
  'ം', 'ഃ', '്',
 
28
  # Vowel modifiers / Signs
29
  'ാ', 'ി', 'ീ', 'ു', 'ൂ', 'ൃ', 'ൄ', 'െ', 'േ', 'ൈ', 'ൊ', 'ോ', 'ൌ', 'ൗ', ' '
30
  ]
@@ -33,28 +39,22 @@ malayalam_tokens = [
33
  target_tokenizer = Tokenizer(char_level=True, filters='')
34
  target_tokenizer.fit_on_texts(malayalam_tokens)
35
 
36
- # Get max sequence length from the model
37
  max_seq_length = model.get_layer("encoder_input").input_shape[0][1]
38
 
39
  def transliterate_with_split_tokens(input_text, model, source_tokenizer, target_tokenizer, max_seq_length):
40
  """
41
  Transliterates input text, preserving non-token characters.
42
  """
43
- # Handle empty input
44
- if not input_text:
45
- return ""
46
-
47
  # Regular expression to split the text into tokens and non-tokens
48
  tokens_and_non_tokens = re.findall(r"([a-zA-Z]+)|([^a-zA-Z]+)", input_text)
 
49
  transliterated_text = ""
50
-
51
  for token_or_non_token in tokens_and_non_tokens:
52
  token = token_or_non_token[0]
53
  non_token = token_or_non_token[1]
54
-
55
  if token:
56
- # Convert to lowercase to handle mixed case
57
- token = token.lower()
58
  input_sequence = source_tokenizer.texts_to_sequences([token])[0]
59
  input_sequence_padded = pad_sequences([input_sequence], maxlen=max_seq_length, padding='post')
60
  predicted_sequence = model.predict(input_sequence_padded)
@@ -63,54 +63,23 @@ def transliterate_with_split_tokens(input_text, model, source_tokenizer, target_
63
  transliterated_text += transliterated_word
64
  elif non_token:
65
  transliterated_text += non_token
66
-
67
  return transliterated_text
68
 
69
- # Create Gradio interface with enhanced features
70
- def create_transliteration_interface():
71
- # Define input and output components with more details
72
- input_textbox = gr.Textbox(
73
- lines=3,
74
- placeholder="Enter English text to transliterate to Malayalam...",
75
- label="Input Text"
76
- )
77
-
78
- output_textbox = gr.Textbox(
79
- lines=3,
80
- label="Transliterated Malayalam Text"
81
- )
82
-
83
- # Create the Gradio interface with more comprehensive configuration
84
- interface = gr.Interface(
85
- fn=transliterate_with_split_tokens,
86
- inputs=[
87
- gr.Textbox(
88
- lines=3,
89
- placeholder="Enter English text to transliterate to Malayalam...",
90
- label="Input Text"
91
- )
92
- ],
93
- outputs=[
94
- gr.Textbox(
95
- lines=3,
96
- label="Transliterated Malayalam Text"
97
- )
98
- ],
99
- title="🌟 English to Malayalam Transliterator",
100
- description="Transliterate English text to Malayalam characters. Simply type or paste your English text, and see the Malayalam transliteration instantly!",
101
- article="## How to Use\n1. Enter English text in the input box\n2. The transliteration will appear automatically\n3. Works with words, phrases, and sentences",
102
- examples=[
103
- ["ente veed"],
104
- ["malayalam"],
105
- ["hello world"],
106
- ["njan pranayam"]
107
- ],
108
- theme="huggingface"
109
- )
110
-
111
- return interface
112
-
113
- # Launch the Gradio interface
114
- if __name__ == "__main__":
115
- iface = create_transliteration_interface()
116
- iface.launch()
 
1
  import gradio as gr
2
+
3
  import numpy as np
4
+
5
  from tensorflow.keras.preprocessing.text import Tokenizer
6
  from tensorflow.keras.preprocessing.sequence import pad_sequences
7
+
8
  import re
9
  from huggingface_hub import from_pretrained_keras
10
 
 
11
  model = from_pretrained_keras("vrclc/transliteration")
12
 
13
+ # Define source and target tokenizers (replace with your actual tokenizers)
14
  source_tokens = list('abcdefghijklmnopqrstuvwxyz ')
15
  source_tokenizer = Tokenizer(char_level=True, filters='')
16
  source_tokenizer.fit_on_texts(source_tokens)
 
18
  malayalam_tokens = [
19
  # Independent vowels
20
  'അ', 'ആ', 'ഇ', 'ഈ', 'ഉ', 'ഊ', 'ഋ', 'ൠ', 'ഌ', 'ൡ', 'എ', 'ഏ', 'ഐ', 'ഒ', 'ഓ', 'ഔ',
21
+
22
  # Consonants
23
  'ക', 'ഖ', 'ഗ', 'ഘ', 'ങ', 'ച', 'ഛ', 'ജ', 'ഝ', 'ഞ',
24
  'ട', 'ഠ', 'ഡ', 'ഢ', 'ണ', 'ത', 'ഥ', 'ദ', 'ധ', 'ന',
25
  'പ', 'ഫ', 'ബ', 'ഭ', 'മ', 'യ', 'ര', 'ല', 'വ', 'ശ',
26
  'ഷ', 'സ', 'ഹ', 'ള', 'ഴ', 'റ',
27
+
28
  # Chillu letters
29
  'ൺ', 'ൻ', 'ർ', 'ൽ', 'ൾ',
30
+
31
  # Additional characters
32
  'ം', 'ഃ', '്',
33
+
34
  # Vowel modifiers / Signs
35
  'ാ', 'ി', 'ീ', 'ു', 'ൂ', 'ൃ', 'ൄ', 'െ', 'േ', 'ൈ', 'ൊ', 'ോ', 'ൌ', 'ൗ', ' '
36
  ]
 
39
  target_tokenizer = Tokenizer(char_level=True, filters='')
40
  target_tokenizer.fit_on_texts(malayalam_tokens)
41
 
42
+ # Load your pre-trained model
43
  max_seq_length = model.get_layer("encoder_input").input_shape[0][1]
44
 
45
  def transliterate_with_split_tokens(input_text, model, source_tokenizer, target_tokenizer, max_seq_length):
46
  """
47
  Transliterates input text, preserving non-token characters.
48
  """
 
 
 
 
49
  # Regular expression to split the text into tokens and non-tokens
50
  tokens_and_non_tokens = re.findall(r"([a-zA-Z]+)|([^a-zA-Z]+)", input_text)
51
+
52
  transliterated_text = ""
 
53
  for token_or_non_token in tokens_and_non_tokens:
54
  token = token_or_non_token[0]
55
  non_token = token_or_non_token[1]
56
+
57
  if token:
 
 
58
  input_sequence = source_tokenizer.texts_to_sequences([token])[0]
59
  input_sequence_padded = pad_sequences([input_sequence], maxlen=max_seq_length, padding='post')
60
  predicted_sequence = model.predict(input_sequence_padded)
 
63
  transliterated_text += transliterated_word
64
  elif non_token:
65
  transliterated_text += non_token
66
+
67
  return transliterated_text
68
 
69
+ def transliterate(input_text):
70
+ return transliterate_with_split_tokens(input_text, model, source_tokenizer, target_tokenizer, max_seq_length)
71
+
72
+ input_text = "ente veed "
73
+ transliterated_text = transliterate_with_split_tokens(input_text, model, source_tokenizer, target_tokenizer, max_seq_length)
74
+
75
+ transliterated_text
76
+ # Create a Gradio interface
77
+ iface = gr.Interface(
78
+ fn=transliterate,
79
+ inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
80
+ outputs="text",
81
+ title="English to Malayalam Transliteration",
82
+ description="Transliterate English text to Malayalam.",
83
+ )
84
+
85
+ iface.launch(share=True)