Spaces:
Running
on
Zero
Running
on
Zero
File size: 6,697 Bytes
38742d7 2a62da0 3740b63 71ae380 38742d7 71ae380 5e1003d 6634f63 5e1003d aac03a6 38742d7 74a629f 38742d7 6bcde50 38742d7 61e0458 38742d7 74a629f 38742d7 86d577a 3f23d73 38742d7 3f23d73 2a62da0 38742d7 2a62da0 3740b63 6634f63 5e1003d 5c565ab 6634f63 2a62da0 6634f63 2a62da0 3f23d73 e72a9c0 3740b63 e72a9c0 2a62da0 e72a9c0 d0ffdbf e72a9c0 15ccfd9 29ba4e2 38742d7 e73c7fc 29ba4e2 a47c01b 7dc20b3 e73c7fc 29ba4e2 1473813 38742d7 6634f63 38742d7 74a629f 38742d7 6634f63 29ba4e2 aac03a6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
import spaces
import gradio as gr
from sacremoses import MosesPunctNormalizer
from stopes.pipelines.monolingual.utils.sentence_split import get_split_algo
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from flores import code_mapping
import platform
import torch
import nltk
from functools import lru_cache
# Make sure the "punkt_tab" NLTK resource is available before anything uses it.
nltk.download("punkt_tab")

# Language names that are offered as a source but excluded from the target list.
REMOVED_TARGET_LANGUAGES = {"Ligurian", "Lombard", "Sicilian"}

# On macOS (Darwin) run on the CPU; on every other platform target CUDA.
if platform.system() == "Darwin":
    device = "cpu"
else:
    device = "cuda"

MODEL_NAME = "facebook/nllb-200-3.3B"

# Rebuild the language table with its keys in sorted order, so that every
# list derived from it is sorted as well.
code_mapping = {name: code_mapping[name] for name in sorted(code_mapping)}
flores_codes = list(code_mapping)
target_languages = [
    name for name in flores_codes if name not in REMOVED_TARGET_LANGUAGES
]
def load_model():
    """Load the checkpoint named by ``MODEL_NAME`` and move it to ``device``.

    Prints a short confirmation once the model has been placed.

    Returns:
        The ``AutoModelForSeq2SeqLM`` instance, already moved to ``device``.
    """
    loaded = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    loaded = loaded.to(device)
    print(f"Model loaded in {device}")
    return loaded
# Shared, module-level objects: each one is built exactly once at import time.
punct_normalizer = MosesPunctNormalizer(lang="en")

# The tokenizer is deliberately loaded a single time, because re-loading it
# takes about 1.5 seconds on every call.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = load_model()
@lru_cache(maxsize=202)
def get_language_specific_sentence_splitter(language_code):
    """Return the sentence splitter that ``get_split_algo`` provides for a language.

    Only the first three characters of ``language_code`` are handed to
    ``get_split_algo`` (with ``"default"`` as its second argument). The
    result is memoized per ``language_code`` by ``lru_cache``.
    """
    return get_split_algo(language_code[:3], "default")
# Memoized entry point: up to 100 distinct (text, source, target) requests are
# answered from the cache without calling _translate again.
@lru_cache(maxsize=100)
def translate(text: str, src_lang: str, tgt_lang: str):
    """Check that both languages were chosen, then delegate to ``_translate``.

    Args:
        text: The text to translate.
        src_lang: Selected source language; must not be empty.
        tgt_lang: Selected target language; must not be empty.

    Returns:
        Whatever ``_translate(text, src_lang, tgt_lang)`` returns.

    Raises:
        gr.Error: If the source or the target language is empty. The source
            language is checked first.
    """
    problem = None
    if not src_lang:
        problem = "The source language is empty! Please choose it in the dropdown list."
    elif not tgt_lang:
        problem = "The target language is empty! Please choose it in the dropdown list."

    if problem is not None:
        raise gr.Error(problem)
    return _translate(text, src_lang, tgt_lang)
# Only assign GPU if cache not used: the cached `translate` wrapper calls this
# function only on a cache miss, so the GPU decorator lives here.
@spaces.GPU
def _translate(text: str, src_lang: str, tgt_lang: str):
    """Translate ``text`` paragraph by paragraph and sentence by sentence.

    The text is punctuation-normalized, split into paragraphs on newlines,
    and each paragraph is split into sentences that are translated one at a
    time with beam search.

    Args:
        text: Input text; newline characters separate paragraphs.
        src_lang: Source language name, used as a key into ``code_mapping``.
        tgt_lang: Target language name, used as a key into ``code_mapping``.

    Returns:
        The translated text. Sentences of one paragraph are joined with a
        single space, paragraphs are joined with newlines. Blank paragraphs
        are returned as empty lines.

    Raises:
        KeyError: If ``src_lang`` or ``tgt_lang`` is not in ``code_mapping``.
    """
    src_code = code_mapping[src_lang]
    tgt_code = code_mapping[tgt_lang]
    tokenizer.src_lang = src_code
    tokenizer.tgt_lang = tgt_code

    # Both values depend only on the language pair, so compute them once
    # instead of once per paragraph / per sentence.
    splitter = get_language_specific_sentence_splitter(src_code)
    forced_bos_token_id = tokenizer.convert_tokens_to_ids(tgt_code)

    # normalizing the punctuation first
    text = punct_normalizer.normalize(text)

    translated_paragraphs = []
    for paragraph in text.split("\n"):
        if not paragraph.strip():
            # A blank line is only a paragraph break: keep it in the output
            # but never send empty input to the model.
            translated_paragraphs.append("")
            continue

        translated_sentences = []
        for sentence in splitter(paragraph):
            # Shape (1, sequence_length); moved straight to the model's device.
            input_ids = tokenizer(sentence, return_tensors="pt").input_ids.to(device)
            generated = model.generate(
                input_ids=input_ids,
                forced_bos_token_id=forced_bos_token_id,
                # Allow the output to be up to 50 tokens longer than the input.
                max_length=input_ids.shape[1] + 50,
                num_return_sequences=1,
                num_beams=5,
                no_repeat_ngram_size=4,  # repetition blocking works better if this number is below num_beams
                renormalize_logits=True,  # recompute token probabilities after banning the repetitions
            )
            translated_sentences.append(
                tokenizer.decode(generated[0], skip_special_tokens=True)
            )

        translated_paragraphs.append(" ".join(translated_sentences))

    return "\n".join(translated_paragraphs)
# Markdown/HTML header (banner image, title and introduction) that is passed
# to gr.Markdown as the first element of the page.
description = """
<div style="text-align: center;">
<img src="https://huggingface.co/spaces/UNESCO/nllb/resolve/main/UNESCO_META_HF_BANNER.png" alt="UNESCO Meta Hugging Face Banner" style="max-width: 800px; width: 100%; margin: 0 auto;">
<h1 style="color: #0077be;">UNESCO Language Translator, powered by Meta and Hugging Face</h1>
</div>
UNESCO, Meta, and Hugging Face have come together to create an accessible, high-quality translation experience in 200 languages.
This is made possible through an open approach to AI innovation using Meta's open-sourced No Language Left Behind (NLLB) AI model, hosted on Hugging Face Spaces.
"""
# Markdown disclaimer text; it is passed to gr.Markdown in the last row of the page.
disclaimer = """
## Disclaimer
This translation interface, developed as part of UNESCO's work on Multilingualism and supported by Meta's No Language Left Behind AI model and Hugging Face, is designed to assist with language translation using open-source AI technologies. However, translations generated by the tool may not be accurate or perfect. While we strive to provide accurate translations, the tool may produce inaccuracies due to the complexity and nuances of different languages.
- The tool may not fully capture the context, cultural nuances, idiomatic expressions, or specific terminologies.
- Manual review and adjustment are recommended for important translations.
- The translations are provided "as is" without any warranties of any kind, either expressed or implied.
- Users should not rely solely on the tool for critical or sensitive translations and are responsible for verifying the accuracy and appropriateness of the translations for their specific needs.
- We recommend consulting with professional translators for official, legal, medical, or other critical translations.
- We shall not be liable for any direct, indirect, incidental, special, or consequential damages arising out of or in connection with the use or inability to use the translation tool, including but not limited to errors or omissions in translations.
By using this translation tool, you agree to these terms and acknowledge that the use of the tool is at your own risk.
For any feedback or support, please contact UNESCO World Atlas of Languages Team: WAL.Data@unesco.org.
"""
# Example rows offered in the UI; each row is [input text, source language, target language].
examples_inputs = [
    [
        "The United Nations Educational, Scientific and Cultural Organization is a specialized agency of the United Nations with the aim of promoting world peace and security through international cooperation in education, arts, sciences and culture. ",
        "English",
        "Ayacucho Quechua",
    ],
]
# Page layout, top to bottom: header, language pickers, input box, translate
# button, output box, a cached example, and the disclaimer.
with gr.Blocks() as demo:
    gr.Markdown(description)

    with gr.Row():
        # Every language can be a source; the target list is the filtered one.
        src_lang = gr.Dropdown(label="Source Language", choices=flores_codes)
        target_lang = gr.Dropdown(label="Target Language", choices=target_languages)

    with gr.Row():
        input_text = gr.Textbox(label="Input Text", lines=6)

    with gr.Row():
        btn = gr.Button("Translate text")

    with gr.Row():
        output = gr.Textbox(label="Output Text", lines=6)

    # Clicking the button runs `translate` on the three inputs and writes
    # the result into the output box.
    btn.click(
        translate,
        inputs=[input_text, src_lang, target_lang],
        outputs=output,
    )

    # The examples use the same function and components as the button.
    examples = gr.Examples(
        examples=examples_inputs,
        inputs=[input_text, src_lang, target_lang],
        fn=translate,
        outputs=output,
        cache_examples=True,
    )

    with gr.Row():
        gr.Markdown(disclaimer)

# Start the app once the layout has been built.
demo.launch()