eaysu committed on
Commit • b31c836
Parent(s): b5d5b5e
sentences translating seperately
app.py
CHANGED
@@ -1,6 +1,12 @@
 import gradio as gr
 from transformers import MarianMTModel, MarianTokenizer
 import torch
+import nltk
+
+# Download punkt for sentence tokenization
+nltk.download('punkt')
+
+from nltk.tokenize import sent_tokenize
 
 # Cache for storing models and tokenizers
 models_cache = {}
@@ -19,7 +25,7 @@ def load_model(model_name):
 
 def translate_text(model_name, text):
     """
-    Translate input text using the specified model.
+    Translate input text sentence by sentence using the specified model.
     """
     if not model_name or not text:
         return "Please select a model and provide text for translation."
@@ -28,14 +34,23 @@ def translate_text(model_name, text):
         # Load the model and tokenizer
         model, tokenizer = load_model(model_name)
 
-        # Tokenize the text
-        tokens = tokenizer(text, return_tensors="pt", padding=True)
-        if torch.cuda.is_available():
-            tokens = {k: v.to('cuda') for k, v in tokens.items()}
+        # Split text into sentences
+        sentences = sent_tokenize(text)
+        translated_sentences = []
+
+        for sentence in sentences:
+            # Tokenize the sentence
+            tokens = tokenizer(sentence, return_tensors="pt", padding=True)
+            if torch.cuda.is_available():
+                tokens = {k: v.to('cuda') for k, v in tokens.items()}
+
+            # Generate translation for the sentence
+            translated = model.generate(**tokens)
+            translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
+            translated_sentences.append(translated_text)
 
-        # Generate translation
-        translated = model.generate(**tokens)
-        return tokenizer.decode(translated[0], skip_special_tokens=True)
+        # Join translated sentences back into a single string
+        return " ".join(translated_sentences)
 
     except Exception as e:
         return f"Error: {str(e)}"
@@ -44,20 +59,7 @@ def translate_text(model_name, text):
 model_options = [
     ("English to Turkish", "Helsinki-NLP/opus-mt-tc-big-en-tr"),
     ("Turkish to English", "Helsinki-NLP/opus-mt-tc-big-tr-en"),
-
-    ("French to English", "Helsinki-NLP/opus-mt-tc-big-fr-en"),
-    ("English to German", "Helsinki-NLP/opus-mt-en-de"),
-    ("German to English", "Helsinki-NLP/opus-mt-de-en"),
-    ("English to Spanish", "Helsinki-NLP/opus-mt-tc-big-en-es"),
-    ("Spanish to English", "Helsinki-NLP/opus-mt-es-en"),
-    ("English to Arabic", "Helsinki-NLP/opus-mt-tc-big-en-ar"),
-    ("Arabic to English", "Helsinki-NLP/opus-mt-tc-big-ar-en"),
-    ("English to Urdu", "Helsinki-NLP/opus-mt-en-ur"),
-    ("Urdu to English", "Helsinki-NLP/opus-mt-ur-en"),
-    ("English to Hindi", "Helsinki-NLP/opus-mt-en-hi"),
-    ("Hindi to English", "Helsinki-NLP/opus-mt-hi-en"),
-    ("English to Chinese", "Helsinki-NLP/opus-mt-en-zh"),
-    ("Chinese to English", "Helsinki-NLP/opus-mt-zh-en"),
+    # Add other models here...
 ]
 
 # Create Gradio interface
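For reference, the per-sentence flow introduced above can be reproduced outside the Space as a short standalone script. This is a sketch, not part of the commit: it loads one model directly with from_pretrained() instead of going through the app's load_model()/models_cache, and the sample text is made up.

import nltk
import torch
from nltk.tokenize import sent_tokenize
from transformers import MarianMTModel, MarianTokenizer

# One-time tokenizer data download; newer NLTK releases may also need 'punkt_tab'
nltk.download('punkt')

# Model id taken from the model_options list above
model_name = "Helsinki-NLP/opus-mt-tc-big-en-tr"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
if torch.cuda.is_available():
    model = model.to("cuda")

# Split the input into sentences and translate each one independently
text = "The weather is nice today. We are going to the beach."
translated_sentences = []
for sentence in sent_tokenize(text):
    tokens = tokenizer(sentence, return_tensors="pt", padding=True)
    if torch.cuda.is_available():
        tokens = {k: v.to("cuda") for k, v in tokens.items()}
    translated = model.generate(**tokens)
    translated_sentences.append(tokenizer.decode(translated[0], skip_special_tokens=True))

print(" ".join(translated_sentences))

Splitting with sent_tokenize keeps each generate() call on a short input, which helps avoid truncating long texts at the model's maximum sequence length; the trade-off is one generation call per sentence.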
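The diff stops just before the Gradio wiring, which this commit does not touch. Below is a minimal sketch of how translate_text and model_options could be hooked up, assuming a plain Dropdown/Textbox interface; the Space's actual layout may differ.

import gradio as gr

# Hypothetical wiring; the real interface code in app.py is not shown in this diff.
demo = gr.Interface(
    fn=translate_text,  # defined in app.py above
    inputs=[
        # Gradio 4 accepts (label, value) tuples, so model_options can be passed directly
        gr.Dropdown(choices=model_options, label="Translation model"),
        gr.Textbox(lines=5, label="Text to translate"),
    ],
    outputs=gr.Textbox(label="Translation"),
    title="MarianMT Translator",
)

demo.launch()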