File size: 4,406 Bytes
998926e
54b4405
998926e
54b4405
998926e
 
54b4405
998926e
 
 
 
 
54b4405
998926e
 
 
 
 
 
 
54b4405
998926e
 
 
 
 
54b4405
998926e
 
54b4405
998926e
 
54b4405
998926e
 
 
 
 
 
 
 
54b4405
998926e
 
 
 
 
 
 
 
54b4405
998926e
 
 
 
54b4405
998926e
 
 
 
 
 
 
 
 
54b4405
998926e
 
54b4405
 
 
38ddf7b
 
 
 
 
 
4f8080f
38ddf7b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f8080f
 
38ddf7b
 
f5203de
38ddf7b
 
3ae47c2
38ddf7b
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import gradio as gr

import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import re
from huggingface_hub import from_pretrained_keras

# Load the pre-trained transliteration model from the Hugging Face Hub.
# This runs at import time, so importing the module triggers the model load.
model = from_pretrained_keras("vrclc/transliteration")

# Character-level tokenizer for the Latin-script input, fitted on the
# lowercase letters a-z plus the space character.
# NOTE(review): the index assigned to each character presumably depends on the
# order of this list and must match what the model was trained with -- do not
# reorder or extend it without confirming against the training setup.
source_tokens = list('abcdefghijklmnopqrstuvwxyz ')
source_tokenizer = Tokenizer(char_level=True, filters='')
source_tokenizer.fit_on_texts(source_tokens)

# Malayalam output vocabulary, one character per entry.
# NOTE(review): as with the source tokens, the order of this list presumably
# fixes the index of each character -- keep it in sync with the trained model.
malayalam_tokens = [
    # Independent vowels
    'അ', 'ആ', 'ഇ', 'ഈ', 'ഉ', 'ഊ', 'ഋ', 'ൠ', 'ഌ', 'ൡ', 'എ', 'ഏ', 'ഐ', 'ഒ', 'ഓ', 'ഔ',

    # Consonants
    'ക', 'ഖ', 'ഗ', 'ഘ', 'ങ', 'ച', 'ഛ', 'ജ', 'ഝ', 'ഞ',
    'ട', 'ഠ', 'ഡ', 'ഢ', 'ണ', 'ത', 'ഥ', 'ദ', 'ധ', 'ന',
    'പ', 'ഫ', 'ബ', 'ഭ', 'മ', 'യ', 'ര', 'ല', 'വ', 'ശ',
    'ഷ', 'സ', 'ഹ', 'ള', 'ഴ', 'റ',

    # Chillu letters
    'ൺ', 'ൻ', 'ർ', 'ൽ', 'ൾ',

    # Additional characters (anusvara, visarga, virama)
    'ം', 'ഃ', '്',

    # Dependent vowel signs, plus the space character
    'ാ', 'ി', 'ീ', 'ു', 'ൂ', 'ൃ', 'ൄ', 'െ', 'േ', 'ൈ', 'ൊ', 'ോ', 'ൌ', 'ൗ', ' '
]

# Character-level tokenizer for the Malayalam output; its index_word mapping is
# used to decode the model's predicted indices back into characters.
target_tokenizer = Tokenizer(char_level=True, filters='')
target_tokenizer.fit_on_texts(malayalam_tokens)

# Sequence length read from the model's "encoder_input" layer; every input
# word is padded to this length before prediction.
max_seq_length = model.get_layer("encoder_input").input_shape[0][1]

def transliterate_with_split_tokens(input_text, model, source_tokenizer, target_tokenizer, max_seq_length):
    """Transliterate the Latin-letter words of a text, keeping everything else.

    The text is split into alternating runs of ASCII letters ("words") and
    runs of anything else. Each word is encoded with ``source_tokenizer``,
    padded to ``max_seq_length``, passed through ``model`` and decoded with
    ``target_tokenizer``. Non-letter runs (spaces, digits, punctuation,
    emoji, ...) are copied to the output unchanged and in their original
    position.

    Args:
        input_text: Text to transliterate. ``None`` or an empty string yields
            an empty string.
        model: Object with a ``predict`` method that takes the padded batch of
            index sequences and returns an array whose last axis is arg-maxed
            to obtain one target index per output position.
        source_tokenizer: Tokenizer providing ``texts_to_sequences`` for the
            input words.
        target_tokenizer: Tokenizer whose ``index_word`` mapping turns
            predicted indices into output characters.
        max_seq_length: Length every encoded word is padded to.

    Returns:
        The transliterated text as a single string.
    """
    # Guard against None / empty input (re.findall would raise on None).
    if not input_text:
        return ""

    # Each match is a (word, other) pair in which exactly one side is non-empty.
    pieces = re.findall(r"([a-zA-Z]+)|([^a-zA-Z]+)", input_text)
    words = [word for word, _ in pieces if word]

    transliterated_words = []
    if words:
        # Encode and predict all words as one batch rather than issuing a
        # separate model.predict call for every word.
        sequences = source_tokenizer.texts_to_sequences(words)
        # NOTE(review): words longer than max_seq_length are truncated by
        # pad_sequences -- confirm which end is dropped and whether long words
        # need special handling.
        padded = pad_sequences(sequences, maxlen=max_seq_length, padding='post')
        predicted_indices = np.argmax(model.predict(padded), axis=-1)

        index_word = target_tokenizer.index_word
        # Index 0 is skipped as padding; indices missing from the vocabulary
        # are skipped too instead of raising KeyError.
        transliterated_words = [
            ''.join(index_word.get(int(idx), '') for idx in row if idx != 0)
            for row in predicted_indices
        ]

    # Re-assemble the output in the original order of words and non-words.
    output_parts = []
    remaining_words = iter(transliterated_words)
    for word, other in pieces:
        output_parts.append(next(remaining_words) if word else other)

    return ''.join(output_parts)

def transliterate(input_text):
    """Transliterate ``input_text`` with the module-level model and tokenizers.

    Convenience wrapper around ``transliterate_with_split_tokens`` that
    supplies the globally configured model, tokenizers and sequence length.
    """
    return transliterate_with_split_tokens(
        input_text,
        model=model,
        source_tokenizer=source_tokenizer,
        target_tokenizer=target_tokenizer,
        max_seq_length=max_seq_length,
    )


# Create Gradio interface with enhanced features
def create_transliteration_interface():
    """Build the Gradio interface for the transliteration demo.

    Returns:
        A ``gr.Interface`` that calls ``transliterate`` on the text from a
        three-line input box and shows the result in a three-line output box.
    """
    # Build each component exactly once and pass it to the interface, so no
    # unused duplicate components are created.
    input_textbox = gr.Textbox(
        lines=3,
        placeholder="Enter English text to transliterate to Malayalam...",
        label="Input Text",
    )
    output_textbox = gr.Textbox(
        lines=3,
        label="Transliterated Malayalam Text",
    )

    # NOTE(review): the article text says the result appears automatically,
    # but `live=True` is not passed, so the output presumably only updates
    # after the user submits -- confirm the intended behaviour.
    interface = gr.Interface(
        fn=transliterate,
        inputs=[input_textbox],
        outputs=[output_textbox],
        title="🌟 English to Malayalam Transliterator",
        description="Transliterate Manglish (Romanised Malayalam) text to Malayalam characters. Simply type or paste your Manglish text, and see the Malayalam transliteration instantly!",
        article="## How to Use\n1. Enter Manglish text in the input box\n2. The transliteration will appear automatically\n3. Works with words, phrases, and sentences",
        examples=[
            ["ente veed"],
            ["malayalam  padikkano? 😃"],
            ["india ente rajyamanu"],
        ],
        # Examples are not pre-computed when the interface is built.
        cache_examples=False,
        theme="huggingface",
    )

    return interface

# Script entry point: build and launch the Gradio interface only when this
# file is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    iface = create_transliteration_interface()
    iface.launch()