Spaces:
Running
on
Zero
Running
on
Zero
import spaces | |
import gradio as gr | |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM | |
from flores import code_mapping | |
import platform | |
import torch | |
device = "cpu" if platform.system() == "Darwin" else "cuda" | |
MODEL_NAME = "facebook/nllb-200-3.3B" | |
code_mapping = dict(sorted(code_mapping.items(), key=lambda item: item[1])) | |
flores_codes = list(code_mapping.keys()) | |
def load_model(): | |
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device) | |
return model | |
model = load_model() | |
def load_tokenizer(src_lang, tgt_lang): | |
tokenizer = AutoTokenizer.from_pretrained( | |
MODEL_NAME, src_lang=code_mapping[src_lang], tgt_lang=code_mapping[tgt_lang] | |
) | |
return tokenizer | |
def translate( | |
text: str, | |
src_lang: str, | |
tgt_lang: str, | |
window_size: int = 800, | |
overlap_size: int = 200, | |
): | |
tokenizer = load_tokenizer(src_lang, tgt_lang) | |
input_tokens = ( | |
tokenizer(text, return_tensors="pt").input_ids[0].cpu().numpy().tolist() | |
) | |
translated_chunks = [] | |
for i in range(0, len(input_tokens), window_size - overlap_size): | |
window = input_tokens[i : i + window_size] | |
translated_chunk = model.generate( | |
input_ids=torch.tensor([window]).to(device), | |
forced_bos_token_id=tokenizer.lang_code_to_id[code_mapping[tgt_lang]], | |
max_length=window_size, | |
num_return_sequences=1, | |
) | |
translated_chunk = tokenizer.decode( | |
translated_chunk[0], skip_special_tokens=True | |
) | |
translated_chunks.append(translated_chunk) | |
return " ".join(translated_chunks) | |
description = """ | |
No Language Left Behind (NLLB) is a series of open-source models aiming to provide high-quality translations between 200 languages. | |
This demo application allows you to use the NLLB model to translate text between a source and target language. | |
## Notes | |
- Whilst the model supports 200 languages, the quality of translations may vary between languages. | |
- "Low Resource" languages (languages which are less present on the internet and have a lower amount of investment) may have lower quality translations. | |
- The demo uses a sliding window approach to handle longer texts. | |
""" | |
instructions = """ | |
1. Select the source and target language from the dropdown menus. | |
2. Enter the text you would like to translate. | |
3. Click the 'Translate text' button. | |
""" | |
with gr.Blocks() as demo: | |
gr.Markdown("# No Language Left Behind (NLLB) Translation Demo") | |
gr.Markdown(description) | |
gr.Markdown("## Instructions") | |
gr.Markdown(instructions) | |
with gr.Row(): | |
src_lang = gr.Dropdown(label="Source Language", choices=flores_codes) | |
target_lang = gr.Dropdown(label="Target Language", choices=flores_codes) | |
with gr.Row(): | |
input_text = gr.Textbox(label="Input Text", lines=6) | |
with gr.Row(): | |
btn = gr.Button("Translate text") | |
with gr.Row(): | |
output = gr.Textbox(label="Output Text", lines=6) | |
btn.click( | |
translate, | |
inputs=[input_text, src_lang, target_lang], | |
outputs=output, | |
) | |
demo.launch() | |