Spaces:
Sleeping
Sleeping
File size: 2,516 Bytes
97fe9c2 46cfa5a 97fe9c2 46cfa5a 97fe9c2 46cfa5a 97fe9c2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
import tensorflow as tf
import numpy as np
import re
from config import config
def clean_text(text: str) -> str:
    """Normalize a raw English sentence before tokenization.

    Lowercases, strips leading punctuation and URLs, collapses repeated
    punctuation, removes disallowed symbols, and squeezes whitespace.

    Args:
        text: raw input sentence.

    Returns:
        The cleaned sentence.
    """
    text = text.lower()  # Lowercase the text
    text = re.sub(r'^[^\w\s(]+', '', text)  # Remove any punctuation at the start of the sentence
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r"([*'_.,!?؟،۔()])\1+", r'\1', text)  # Reduce multiple instances of the same punctuation to one
    text = re.sub(r'([^\w\s])\1+', r'\1', text)  # Reduce any other repeated non-alphanumeric character to one
    # Remove special characters and symbols, keeping word chars, whitespace,
    # and the allowed punctuation (, ' - . ? ! ؟ ، ۔ and parentheses).
    # BUGFIX: the original class contained the unescaped range '-. which spans
    # 0x27-0x2E and accidentally whitelisted * and + as well; the dash is now
    # escaped and the intended characters are listed explicitly.
    text = re.sub(r"[^\w,'\-.?!؟،۔()\s]", '', text)
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse runs of whitespace into single spaces
    return text
class Translator(tf.Module):
    """Greedy-decoding English-to-Urdu translator wrapping a Transformer.

    Holds two SentencePiece tokenizer models (English for the encoder input,
    Urdu for the decoder output) and the trained transformer model.
    """

    def __init__(self, sp_model_en, sp_model_ur, transformer):
        # sp_model_en / sp_model_ur: SentencePiece tokenizers — presumably
        # tensorflow_text.SentencepieceTokenizer instances; confirm at call site.
        self.sp_model_en = sp_model_en
        self.sp_model_ur = sp_model_ur
        self.transformer = transformer

    def __call__(self, sentence, max_length=config.sequence_length):
        """Translate one English sentence to Urdu via greedy decoding.

        Args:
            sentence: raw English text (a plain Python string).
            max_length: maximum number of decoding steps; defaults to the
                configured sequence length.

        Returns:
            The decoded Urdu translation as a Python string.
        """
        sentence = clean_text(sentence)
        sentence = tf.constant(sentence)
        # Promote a scalar string tensor to a batch of one so the tokenizer
        # and the transformer both see a batched input.
        if len(sentence.shape) == 0:
            sentence = sentence[tf.newaxis]
        # Tokenize the English sentence; .to_tensor() pads the ragged output
        # into a dense tensor.
        sentence = self.sp_model_en.tokenize(sentence).to_tensor()
        encoder_input = sentence
        # Tokenizing an empty string yields [START, END]: index 0 is the
        # START token id and index 1 the END token id. [tf.newaxis] keeps
        # each as a shape-(1,) tensor.
        start = self.sp_model_ur.tokenize([''])[0][0][tf.newaxis]
        end = self.sp_model_ur.tokenize([''])[0][1][tf.newaxis]
        # Dynamically growing buffer of generated token ids, seeded with START.
        # NOTE(review): start/end are written without a cast — assumes the
        # tokenizer emits tf.int32 (the array dtype); confirm, since the
        # predicted ids below DO require an explicit cast.
        output_array = tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)
        output_array = output_array.write(0, start)
        for i in tf.range(max_length):
            # stack() gives shape (steps, 1); transpose to (1, steps) so the
            # decoder receives a batch-major sequence.
            output = tf.transpose(output_array.stack())
            predictions = self.transformer([encoder_input, output], training=False)
            # Keep only the logits for the most recently generated position.
            predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`
            # Greedy decoding: take the highest-scoring token id.
            predicted_id = tf.argmax(predictions, axis=-1)
            predicted_id = tf.cast(predicted_id, tf.int32)
            output_array = output_array.write(i+1, predicted_id[0])
            # Stop as soon as the END token is produced.
            if predicted_id == end:
                break
        output = tf.transpose(output_array.stack())
        # Detokenize the full generated sequence; [0] selects the single
        # batch element, leaving a scalar string tensor. Shape: `()`
        text = self.sp_model_ur.detokenize(output)[0]
        return text.numpy().decode('utf-8')