davanstrien HF staff committed on
Commit
71ae380
1 Parent(s): 620b306

sliding window

Browse files
Files changed (1) hide show
  1. app.py +30 -41
app.py CHANGED
@@ -1,16 +1,12 @@
1
  import spaces
2
  import gradio as gr
3
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
4
  from flores import code_mapping
5
  import platform
6
- import re
7
 
8
  device = "cpu" if platform.system() == "Darwin" else "cuda"
9
- MODEL_NAME = (
10
- "facebook/nllb-200-distilled-600M"
11
- if platform.system() == "Darwin"
12
- else "facebook/nllb-200-3.3B"
13
- )
14
 
15
  code_mapping = dict(sorted(code_mapping.items(), key=lambda item: item[1]))
16
  flores_codes = list(code_mapping.keys())
@@ -26,49 +22,42 @@ model, tokenizer = load_model()
26
 
27
 
28
  @spaces.GPU
29
- def _translate(text: str, src_lang: str, tgt_lang: str):
30
- source = code_mapping[src_lang]
31
- target = code_mapping[tgt_lang]
32
- translator = pipeline(
33
- "translation",
34
- model=model,
35
- tokenizer=tokenizer,
36
- src_lang=source,
37
- tgt_lang=target,
38
- device=device,
39
- )
40
- output = translator(text, max_length=400)
41
- return output[0]["translation_text"]
42
-
43
-
44
- def translate(text: str, src_lang: str, tgt_lang: str):
45
- # split the input text into smaller chunks
46
- outputs = ""
47
- paragraph_chunks = text.split("\n")
48
- for chunk in paragraph_chunks:
49
- # check if the chunk is too long
50
- if len(chunk) > 500:
51
- # split on full stops, question marks, and exclamation marks
52
- sentence_chunks = re.split(r"(?<=[.!?])\s+", chunk)
53
- for sentence in sentence_chunks:
54
- if sentence.strip(): # check if the sentence is not empty
55
- outputs += f"{_translate(sentence, src_lang, tgt_lang)} "
56
- outputs += "\n\n"
57
- else:
58
- outputs += _translate(chunk, src_lang, tgt_lang) + "\n\n"
59
-
60
- return outputs.strip()
61
 
62
 
63
  description = """
64
- No Language Left Behind (NLLB) is a series of open-source models aiming to provide high-quality translations between 200 language.
65
  This demo application allows you to use the NLLB model to translate text between a source and target language.
66
 
67
  ## Notes
68
 
69
  - Whilst the model supports 200 languages, the quality of translations may vary between languages.
70
  - "Low Resource" languages (languages which are less present on the internet and have a lower amount of investment) may have lower quality translations.
71
- - The demo is not intended to be used for very long texts.
72
  """
73
 
74
  instructions = """
 
1
  import spaces
2
  import gradio as gr
3
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
4
  from flores import code_mapping
5
  import platform
6
+ import torch
7
 
8
  device = "cpu" if platform.system() == "Darwin" else "cuda"
9
+ MODEL_NAME = "facebook/m2m100_1.2B"
 
 
 
 
10
 
11
  code_mapping = dict(sorted(code_mapping.items(), key=lambda item: item[1]))
12
  flores_codes = list(code_mapping.keys())
 
22
 
23
 
24
@spaces.GPU
def translate(
    text: str,
    src_lang: str,
    tgt_lang: str,
    window_size: int = 800,
    overlap_size: int = 200,
):
    """Translate ``text`` from ``src_lang`` to ``tgt_lang`` with a sliding token window.

    Long inputs are tokenized once, then translated in overlapping windows of
    ``window_size`` tokens (stepping by ``window_size - overlap_size``) so the
    model never sees a sequence longer than it can handle.

    Args:
        text: Source text to translate.
        src_lang: Display name of the source language (a key of ``code_mapping``).
        tgt_lang: Display name of the target language (a key of ``code_mapping``).
        window_size: Maximum number of input tokens per generation call.
        overlap_size: Number of tokens shared between consecutive windows.

    Returns:
        The translated chunks joined with single spaces.

    Raises:
        ValueError: If ``overlap_size`` is not smaller than ``window_size``.
        KeyError: If either language name is not present in ``code_mapping``.
    """
    if overlap_size >= window_size:
        # A non-positive step would make range() raise (or loop incorrectly).
        raise ValueError("overlap_size must be smaller than window_size")

    # UI passes display names; the model/tokenizer need language codes.
    # (Matches how the rest of the file resolves languages via code_mapping.)
    src_code = code_mapping[src_lang]
    tgt_code = code_mapping[tgt_lang]

    # Without this the tokenizer encodes with its default source language and
    # the src_lang argument is silently ignored.
    tokenizer.src_lang = src_code

    input_tokens = tokenizer.encode(text, return_tensors="pt")[0].cpu().numpy().tolist()
    translated_chunks = []
    step = window_size - overlap_size

    # NOTE(review): overlapping windows are each translated in full, so the
    # overlap region is translated twice and duplicated in the output — this
    # preserves the committed behavior; de-duplication would change results.
    for i in range(0, len(input_tokens), step):
        window = input_tokens[i : i + window_size]
        with torch.no_grad():  # inference only — skip autograd bookkeeping
            generated = model.generate(
                input_ids=torch.tensor([window]).to(device),
                forced_bos_token_id=tokenizer.lang_code_to_id[tgt_code],
                max_length=window_size,
                num_return_sequences=1,
            )
        translated_chunks.append(
            tokenizer.decode(generated[0], skip_special_tokens=True)
        )

    return " ".join(translated_chunks)
 
 
 
 
 
 
 
50
 
51
 
52
  description = """
53
+ No Language Left Behind (NLLB) is a series of open-source models aiming to provide high-quality translations between 200 languages.
54
  This demo application allows you to use the NLLB model to translate text between a source and target language.
55
 
56
  ## Notes
57
 
58
  - Whilst the model supports 200 languages, the quality of translations may vary between languages.
59
  - "Low Resource" languages (languages which are less present on the internet and have a lower amount of investment) may have lower quality translations.
60
+ - The demo uses a sliding window approach to handle longer texts.
61
  """
62
 
63
  instructions = """