"""Gradio demo that translates text with the NLLB-200 model.

The script loads the model once at import time, exposes a ``translate``
function that processes long inputs with an overlapping sliding window over
the token sequence, and wires that function to a small Gradio Blocks UI.
"""

import spaces  # kept as the very first import, exactly as in the original file

import platform
from functools import lru_cache

import gradio as gr
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

from flores import code_mapping

# Run on CPU on macOS, otherwise on CUDA.
device = "cpu" if platform.system() == "Darwin" else "cuda"
MODEL_NAME = "facebook/nllb-200-3.3B"

# Re-order the mapping by its values; the keys become the dropdown choices.
code_mapping = dict(sorted(code_mapping.items(), key=lambda item: item[1]))
flores_codes = list(code_mapping.keys())


def load_model():
    """Load the seq2seq model named by ``MODEL_NAME`` and move it to ``device``."""
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)
    return model


model = load_model()


@lru_cache(maxsize=8)
def load_tokenizer(src_lang, tgt_lang):
    """Return a tokenizer configured for the given language pair.

    Args:
        src_lang: Key of ``code_mapping`` identifying the source language.
        tgt_lang: Key of ``code_mapping`` identifying the target language.

    Returns:
        The tokenizer for ``MODEL_NAME`` created with the mapped ``src_lang``
        and ``tgt_lang`` codes.

    Raises:
        KeyError: If either language is not a key of ``code_mapping``.

    The result is cached per language pair (bounded cache) so repeated
    requests do not reload the tokenizer files; callers share the returned
    instance and should not mutate it.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_NAME,
        src_lang=code_mapping[src_lang],
        tgt_lang=code_mapping[tgt_lang],
    )
    return tokenizer


@spaces.GPU
def translate(
    text: str,
    src_lang: str,
    tgt_lang: str,
    window_size: int = 800,
    overlap_size: int = 200,
):
    """Translate ``text`` from ``src_lang`` to ``tgt_lang``.

    The input is tokenized once and then translated window by window. Each
    window holds up to ``window_size`` tokens and consecutive windows start
    ``window_size - overlap_size`` tokens apart. The decoded chunks are
    joined with single spaces.

    Args:
        text: Text to translate.
        src_lang: Key of ``code_mapping`` for the source language.
        tgt_lang: Key of ``code_mapping`` for the target language.
        window_size: Maximum number of input tokens per window; also passed
            to ``generate`` as ``max_length``.
        overlap_size: Number of tokens shared by consecutive windows.

    Returns:
        The translated text, or an empty string for blank input.

    Raises:
        ValueError: If ``overlap_size`` is negative or not smaller than
            ``window_size`` (the window could never advance).
        gr.Error: If either language is missing or unknown, e.g. when a
            dropdown was left empty in the UI.
    """
    if overlap_size < 0 or window_size <= overlap_size:
        raise ValueError(
            "window_size must be greater than overlap_size, "
            "and overlap_size must not be negative"
        )
    if src_lang not in code_mapping or tgt_lang not in code_mapping:
        # Surface a readable message in the UI instead of a bare KeyError.
        raise gr.Error("Please select both a source and a target language.")
    if not text or not text.strip():
        return ""

    tokenizer = load_tokenizer(src_lang, tgt_lang)
    input_tokens = tokenizer(text, return_tensors="pt").input_ids[0].tolist()

    # `lang_code_to_id` is deprecated/removed in newer transformers releases;
    # `convert_tokens_to_ids` is the supported lookup. The id is the same for
    # every window, so resolve it once.
    target_token_id = tokenizer.convert_tokens_to_ids(code_mapping[tgt_lang])

    # NOTE(review): special tokens are added once for the whole text, so
    # windows after the first carry no source-language prefix token. This may
    # affect quality on long inputs; left unchanged here, confirm before
    # altering.
    step = window_size - overlap_size
    total_tokens = len(input_tokens)
    translated_chunks = []
    for start in range(0, total_tokens, step):
        window = input_tokens[start : start + window_size]
        generated = model.generate(
            input_ids=torch.tensor([window]).to(device),
            forced_bos_token_id=target_token_id,
            max_length=window_size,
            num_return_sequences=1,
        )
        translated_chunks.append(
            tokenizer.decode(generated[0], skip_special_tokens=True)
        )
        # Once a window reaches the end of the input, any further window
        # would lie entirely inside it and only duplicate the tail.
        if start + window_size >= total_tokens:
            break

    return " ".join(translated_chunks)


description = """
No Language Left Behind (NLLB) is a series of open-source models aiming to provide high-quality translations between 200 languages.
This demo application allows you to use the NLLB model to translate text between a source and target language.

## Notes

- Whilst the model supports 200 languages, the quality of translations may vary between languages.
- "Low Resource" languages (languages which are less present on the internet and have a lower amount of investment) may have lower quality translations.
- The demo uses a sliding window approach to handle longer texts.
"""

instructions = """
1. Select the source and target language from the dropdown menus.
2. Enter the text you would like to translate.
3. Click the 'Translate text' button.
"""

with gr.Blocks() as demo:
    gr.Markdown("# No Language Left Behind (NLLB) Translation Demo")
    gr.Markdown(description)
    gr.Markdown("## Instructions")
    gr.Markdown(instructions)

    with gr.Row():
        src_lang = gr.Dropdown(label="Source Language", choices=flores_codes)
        target_lang = gr.Dropdown(label="Target Language", choices=flores_codes)

    with gr.Row():
        input_text = gr.Textbox(label="Input Text", lines=6)

    with gr.Row():
        btn = gr.Button("Translate text")

    with gr.Row():
        output = gr.Textbox(label="Output Text", lines=6)

    btn.click(
        translate,
        inputs=[input_text, src_lang, target_lang],
        outputs=output,
    )

demo.launch()