huzaifanafees committed on
Commit
46cfa5a
·
1 Parent(s): 12646ff

Update translator.py

Browse files
Files changed (1) hide show
  1. translator.py +12 -0
translator.py CHANGED
@@ -1,7 +1,18 @@
1
  import tensorflow as tf
2
  import numpy as np
 
3
  from config import config
4
 
 
 
 
 
 
 
 
 
 
 
5
  class Translator(tf.Module):
6
  def __init__(self, sp_model_en, sp_model_ur, transformer):
7
  self.sp_model_en = sp_model_en
@@ -9,6 +20,7 @@ class Translator(tf.Module):
9
  self.transformer = transformer
10
 
11
  def __call__(self, sentence, max_length=config.sequence_length):
 
12
  sentence = tf.constant(sentence)
13
  if len(sentence.shape) == 0:
14
  sentence = sentence[tf.newaxis]
 
1
  import tensorflow as tf
2
  import numpy as np
3
+ import re
4
  from config import config
5
 
6
def clean_text(text: str) -> str:
    """Normalize a raw sentence before it is handed to the translator.

    Steps, in order:
      1. Lowercase the text.
      2. Drop URLs (anything starting with ``http`` or ``www`` up to the
         next whitespace).
      3. Drop punctuation at the start of the sentence (an opening
         parenthesis is kept).
      4. Collapse runs of the same punctuation/symbol character to one.
      5. Drop every character that is not a word character, whitespace,
         or one of ``' ( ) * + , - . ? ! ؟ ، ۔``.
      6. Collapse whitespace runs to a single space and trim both ends.

    Args:
        text: The sentence to clean.

    Returns:
        The cleaned, lowercased sentence (possibly empty).
    """
    text = text.lower()

    # URLs go first so punctuation that only becomes "leading" once a URL at
    # the start is removed is still caught by the next step. `http\S+`
    # already covers `https...`, so no separate alternative is needed.
    text = re.sub(r'http\S+|www\S+', '', text)

    # Tolerate leading whitespace: anchoring at position 0 alone would miss
    # "  !!hello" because the first character is a space, not punctuation.
    text = re.sub(r'^\s*[^\w\s(]+', '', text)

    # Collapse repeats of any single non-word, non-space character; the
    # underscore is listed explicitly because it counts as a word character.
    text = re.sub(r'(_|[^\w\s])\1+', r'\1', text)

    # The kept punctuation is spelled out explicitly. The earlier form wrote
    # `'-.` inside the class, which is a *range* (U+0027..U+002E) and so kept
    # ( ) * + as well; that behaviour is preserved here on purpose.
    # NOTE(review): confirm whether ( ) * + are really meant to survive.
    text = re.sub(r"[^\w'()*+,\-.?!؟،۔\s]", '', text)

    return re.sub(r'\s+', ' ', text).strip()
15
+
16
  class Translator(tf.Module):
17
  def __init__(self, sp_model_en, sp_model_ur, transformer):
18
  self.sp_model_en = sp_model_en
 
20
  self.transformer = transformer
21
 
22
  def __call__(self, sentence, max_length=config.sequence_length):
23
+ sentence = clean_text(sentence)
24
  sentence = tf.constant(sentence)
25
  if len(sentence.shape) == 0:
26
  sentence = sentence[tf.newaxis]