File size: 3,100 Bytes
38742d7
 
71ae380
38742d7
 
71ae380
5e1003d
 
 
38742d7
74a629f
 
 
38742d7
6bcde50
38742d7
 
 
74a629f
38742d7
 
 
3f23d73
38742d7
 
3f23d73
 
 
 
 
 
 
 
38742d7
 
 
5e1003d
3f23d73
 
e72a9c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15ccfd9
 
38742d7
a47c01b
7dc20b3
a47c01b
1473813
38742d7
 
a47c01b
38742d7
 
 
74a629f
38742d7
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import spaces
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from flores import code_mapping
import platform
import torch
import nltk

# Download the "punkt" NLTK resource up front; translate() calls
# nltk.sent_tokenize for sentence splitting.
nltk.download("punkt")

# Language names that stay available as a source but are filtered out of the
# list of selectable target languages below.
REMOVED_TARGET_LANGUAGES = {"Ligurian", "Lombard", "Sicilian"}


# Use the CPU on macOS ("Darwin"); on every other platform assume CUDA.
device = "cpu" if platform.system() == "Darwin" else "cuda"
MODEL_NAME = "facebook/nllb-200-3.3B"

# Rebuild the imported mapping ordered by its values (the language codes that
# are later handed to the tokenizer).
code_mapping = dict(sorted(code_mapping.items(), key=lambda entry: entry[1]))
# NOTE: despite the name, this holds the mapping's keys (the names shown in
# the dropdowns), in the sorted order established above.
flores_codes = list(code_mapping)
target_languages = [
    name for name in flores_codes if name not in REMOVED_TARGET_LANGUAGES
]

def load_model():
    """Load the pretrained ``MODEL_NAME`` seq2seq model and move it to ``device``.

    Returns:
        The object returned by ``AutoModelForSeq2SeqLM.from_pretrained`` after
        ``.to(device)`` has been applied.
    """
    return AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)


# Load the model once at import time; translate() reads this module-level
# instance on every call instead of loading it again.
model = load_model()


def load_tokenizer(src_lang, tgt_lang):
    """Build a ``MODEL_NAME`` tokenizer configured for one language pair.

    Args:
        src_lang: Source language, given as a key of ``code_mapping``.
        tgt_lang: Target language, given as a key of ``code_mapping``.

    Returns:
        The tokenizer returned by ``AutoTokenizer.from_pretrained``.

    Raises:
        KeyError: If either language is not a key of ``code_mapping``.
    """
    source_code = code_mapping[src_lang]
    target_code = code_mapping[tgt_lang]
    return AutoTokenizer.from_pretrained(
        MODEL_NAME, src_lang=source_code, tgt_lang=target_code
    )


@spaces.GPU
def translate(text: str, src_lang: str, tgt_lang: str):
    """Translate ``text`` from ``src_lang`` to ``tgt_lang`` sentence by sentence.

    The input is split on newlines into paragraphs, and each paragraph is split
    into sentences with ``nltk.sent_tokenize``. Every sentence is translated on
    its own; translated sentences are joined with single spaces and paragraphs
    with newlines, so the paragraph layout of the input is preserved (empty
    lines stay empty).

    Args:
        text: Text to translate; may contain several newline-separated
            paragraphs.
        src_lang: Source language, given as a key of ``code_mapping``.
        tgt_lang: Target language, given as a key of ``code_mapping``.

    Returns:
        The translated text as a single string.

    Raises:
        gr.Error: If either language is not a key of ``code_mapping`` (for
            example when a dropdown was left empty).
    """
    # Fail with a readable UI message instead of an opaque KeyError when a
    # language has not been chosen.
    if src_lang not in code_mapping or tgt_lang not in code_mapping:
        raise gr.Error("Please select both a source and a target language.")

    tokenizer = load_tokenizer(src_lang, tgt_lang)

    # The target-language token id is the same for every sentence, so look it
    # up once. convert_tokens_to_ids is used instead of the deprecated
    # lang_code_to_id attribute, which newer tokenizers no longer provide.
    target_token_id = tokenizer.convert_tokens_to_ids(code_mapping[tgt_lang])

    translated_paragraphs = []
    for paragraph in text.split("\n"):
        translated_sentences = []

        for sentence in nltk.sent_tokenize(paragraph):
            # Keep the encoded ids as a tensor and move them straight to the
            # model's device.
            input_ids = tokenizer(sentence, return_tensors="pt").input_ids.to(device)
            generated = model.generate(
                input_ids=input_ids,
                forced_bos_token_id=target_token_id,
                # Allow the output to be up to 50 tokens longer than the input.
                max_length=input_ids.shape[1] + 50,
                num_return_sequences=1,
            )
            translated_sentences.append(
                tokenizer.decode(generated[0], skip_special_tokens=True)
            )

        translated_paragraphs.append(" ".join(translated_sentences))

    return "\n".join(translated_paragraphs)


# Introductory text rendered with gr.Markdown below the page title.
description = """
UNESCO, Meta, and Hugging Face have come together to create an accessible, high-quality translation experience in 200 languages. 

This is made possible through an open approach to AI innovation using Meta’s open-sourced No Language Left Behind (NLLB) AI model, hosted on Hugging Face Spaces. 
"""

# Build the Gradio interface: a title and description, then one row each for
# the language dropdowns, the input box, the button and the output box.
with gr.Blocks() as demo:
    gr.Markdown("# UNESCO Language Translator, powered by Meta and Hugging Face")
    gr.Markdown(description)
    with gr.Row():
        # The source dropdown offers every language; the target dropdown omits
        # the entries listed in REMOVED_TARGET_LANGUAGES.
        src_lang = gr.Dropdown(label="Source Language", choices=flores_codes)
        target_lang = gr.Dropdown(label="Target Language", choices=target_languages)
    with gr.Row():
        input_text = gr.Textbox(label="Input Text", lines=6)
    with gr.Row():
        btn = gr.Button("Translate text")
    with gr.Row():
        output = gr.Textbox(label="Output Text", lines=6)
    # Clicking the button calls translate(text, src_lang, tgt_lang) and writes
    # its return value into the output box.
    btn.click(
        translate,
        inputs=[input_text, src_lang, target_lang],
        outputs=output,
    )
# Start the app.
demo.launch()