import gradio as gr
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
from huggingface_hub import from_pretrained_keras
# Load the model from Hugging Face
model = from_pretrained_keras("vrclc/transliteration")
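# The pretrained model is expected to be a character-level seq2seq network that
# takes a padded sequence of Latin-character indices and returns a softmax
# distribution over Malayalam characters at each output timestep (see the
# decoding logic below); this is an assumption based on how it is used here.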
# Define source and target tokenizers
source_tokens = list('abcdefghijklmnopqrstuvwxyz ')
source_tokenizer = Tokenizer(char_level=True, filters='')
source_tokenizer.fit_on_texts(source_tokens)
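# Character-level tokenizer over lowercase Latin letters and space; this should
# match the input vocabulary the model was trained with.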
malayalam_tokens = [
# Independent vowels
'അ', 'ആ', 'ഇ', 'ഈ', 'ഉ', 'ഊ', 'ഋ', 'ൠ', 'ഌ', 'ൡ', 'എ', 'ഏ', 'ഐ', 'ഒ', 'ഓ', 'ഔ',
# Consonants
'ക', 'ഖ', 'ഗ', 'ഘ', 'ങ', 'ച', 'ഛ', 'ജ', 'ഝ', 'ഞ',
'ട', 'ഠ', 'ഡ', 'ഢ', 'ണ', 'ത', 'ഥ', 'ദ', 'ധ', 'ന',
'പ', 'ഫ', 'ബ', 'ഭ', 'മ', 'യ', 'ര', 'ല', 'വ', 'ശ',
'ഷ', 'സ', 'ഹ', 'ള', 'ഴ', 'റ',
# Chillu letters
'ൺ', 'ൻ', 'ർ', 'ൽ', 'ൾ',
# Additional characters
'ം', 'ഃ', '്',
# Vowel modifiers / Signs
'ാ', 'ി', 'ീ', 'ു', 'ൂ', 'ൃ', 'ൄ', 'െ', 'േ', 'ൈ', 'ൊ', 'ോ', 'ൌ', 'ൗ', ' '
]
# Create tokenizer for Malayalam tokens
target_tokenizer = Tokenizer(char_level=True, filters='')
target_tokenizer.fit_on_texts(malayalam_tokens)
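# target_tokenizer.index_word is used below to map predicted indices back to
# Malayalam characters.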
# Get max sequence length from the model
max_seq_length = model.get_layer("encoder_input").input_shape[0][1]
def transliterate_with_split_tokens(input_text, model, source_tokenizer, target_tokenizer, max_seq_length):
"""
Transliterates input text, preserving non-token characters.
"""
# Handle empty input
if not input_text:
return ""
# Regular expression to split the text into tokens and non-tokens
tokens_and_non_tokens = re.findall(r"([a-zA-Z]+)|([^a-zA-Z]+)", input_text)
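    # findall with two groups yields (token, non_token) pairs where exactly one
    # element of each pair is non-empty.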
transliterated_text = ""
    for token, non_token in tokens_and_non_tokens:
if token:
# Convert to lowercase to handle mixed case
token = token.lower()
input_sequence = source_tokenizer.texts_to_sequences([token])[0]
input_sequence_padded = pad_sequences([input_sequence], maxlen=max_seq_length, padding='post')
predicted_sequence = model.predict(input_sequence_padded)
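            # Greedy decoding: take the most probable character index at each
            # timestep and drop padding (index 0).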
predicted_indices = np.argmax(predicted_sequence, axis=-1)[0]
transliterated_word = ''.join([target_tokenizer.index_word[idx] for idx in predicted_indices if idx != 0])
transliterated_text += transliterated_word
elif non_token:
transliterated_text += non_token
return transliterated_text
# Build the Gradio interface
def create_transliteration_interface():
    # Define the input and output components
input_textbox = gr.Textbox(
lines=3,
placeholder="Enter English text to transliterate to Malayalam...",
label="Input Text"
)
output_textbox = gr.Textbox(
lines=3,
label="Transliterated Malayalam Text"
)
    # Gradio passes only the textbox value to fn, so bind the model and
    # tokenizers here with a single-argument wrapper.
    interface = gr.Interface(
        fn=lambda text: transliterate_with_split_tokens(
            text, model, source_tokenizer, target_tokenizer, max_seq_length
        ),
        inputs=input_textbox,
        outputs=output_textbox,
title="🌟 English to Malayalam Transliterator",
description="Transliterate English text to Malayalam characters. Simply type or paste your English text, and see the Malayalam transliteration instantly!",
article="## How to Use\n1. Enter English text in the input box\n2. The transliteration will appear automatically\n3. Works with words, phrases, and sentences",
examples=[
["ente veed"],
["malayalam"],
["hello world"],
["njan pranayam"]
],
theme="huggingface"
)
return interface
# Launch the Gradio interface
if __name__ == "__main__":
iface = create_transliteration_interface()
iface.launch()