"""Gradio demo that translates text between languages with a seq2seq model.

Long inputs are tokenized once and translated in overlapping token windows;
the decoded windows are joined with spaces to form the final output.
"""

# NOTE(review): `spaces` is kept as the first import, matching the original
# ordering — presumably it must be loaded before torch; confirm before moving.
import spaces
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from flores import code_mapping
import platform
import torch

# CPU on macOS; every other platform is assumed to have CUDA available.
device = "cpu" if platform.system() == "Darwin" else "cuda"

# NOTE(review): this is an M2M100 checkpoint, while the UI text below
# describes NLLB and 200 languages — confirm which model is intended.
MODEL_NAME = "facebook/m2m100_1.2B"

# Order the mapping by its values so the dropdowns list languages in that order.
code_mapping = dict(sorted(code_mapping.items(), key=lambda item: item[1]))
flores_codes = list(code_mapping.keys())


def load_model():
    """Load the pretrained model (moved to ``device``) and its tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple for ``MODEL_NAME``.
    """
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    return model, tokenizer


model, tokenizer = load_model()


def _window_starts(n_tokens: int, window_size: int, overlap_size: int) -> list:
    """Return the start offsets of the sliding windows covering ``n_tokens``.

    Consecutive windows start ``window_size - overlap_size`` tokens apart.
    Iteration stops at the first window that reaches the end of the input, so
    the tail is never covered by an extra, fully redundant window.

    Raises:
        ValueError: If ``window_size`` is not positive or ``overlap_size`` is
            not in ``[0, window_size)``.
    """
    if window_size <= 0:
        raise ValueError("window_size must be a positive integer")
    if not 0 <= overlap_size < window_size:
        raise ValueError("overlap_size must satisfy 0 <= overlap_size < window_size")

    step = window_size - overlap_size
    starts = []
    for start in range(0, n_tokens, step):
        starts.append(start)
        if start + window_size >= n_tokens:
            # This window already reaches the last token; any further window
            # would only re-translate part of it.
            break
    return starts


@spaces.GPU
def translate(
    text: str,
    src_lang: str,
    tgt_lang: str,
    window_size: int = 800,
    overlap_size: int = 200,
):
    """Translate ``text`` from ``src_lang`` to ``tgt_lang``.

    The input is tokenized once, split into overlapping windows of token ids,
    and each window is translated independently. Because windows overlap, the
    text around window boundaries can appear twice in the output.

    Args:
        text: Text to translate.
        src_lang: Source language; must be a key of ``code_mapping``.
        tgt_lang: Target language; must be a key of ``code_mapping``.
        window_size: Maximum number of input tokens per window; also used as
            ``max_length`` for generation.
        overlap_size: Number of tokens shared by consecutive windows.

    Returns:
        The decoded translations of all windows joined with single spaces.

    Raises:
        gr.Error: If either language is missing or unknown (for example when
            a dropdown was left empty).
        ValueError: If the window parameters are inconsistent.
    """
    for label, lang in (("source", src_lang), ("target", tgt_lang)):
        if lang not in code_mapping:
            raise gr.Error(f"Please select a valid {label} language.")

    input_tokens = tokenizer(
        text, return_tensors="pt", src_lang=code_mapping[src_lang]
    ).input_ids[0].tolist()

    # The target-language token is the same for every window; look it up once.
    forced_bos_token_id = tokenizer.lang_code_to_id[code_mapping[tgt_lang]]

    translated_chunks = []
    for start in _window_starts(len(input_tokens), window_size, overlap_size):
        window = input_tokens[start : start + window_size]
        generated = model.generate(
            input_ids=torch.tensor([window]).to(device),
            forced_bos_token_id=forced_bos_token_id,
            max_length=window_size,
            num_return_sequences=1,
        )
        translated_chunks.append(
            tokenizer.decode(generated[0], skip_special_tokens=True)
        )

    return " ".join(translated_chunks)


# Markdown shown at the top of the page; headings and list items need their
# own lines to render.
description = """
No Language Left Behind (NLLB) is a series of open-source models aiming to provide high-quality translations between 200 languages.
This demo application allows you to use the NLLB model to translate text between a source and target language.

## Notes

- Whilst the model supports 200 languages, the quality of translations may vary between languages.
- "Low Resource" languages (languages which are less present on the internet and have a lower amount of investment) may have lower quality translations.
- The demo uses a sliding window approach to handle longer texts.
"""

instructions = """
1. Select the source and target language from the dropdown menus.
2. Enter the text you would like to translate.
3. Click the 'Translate text' button.
"""

with gr.Blocks() as demo:
    gr.Markdown("# No Language Left Behind (NLLB) Translation Demo")
    gr.Markdown(description)
    gr.Markdown("## Instructions")
    gr.Markdown(instructions)

    with gr.Row():
        src_lang = gr.Dropdown(label="Source Language", choices=flores_codes)
        target_lang = gr.Dropdown(label="Target Language", choices=flores_codes)

    with gr.Row():
        input_text = gr.Textbox(label="Input Text", lines=6)

    with gr.Row():
        btn = gr.Button("Translate text")

    with gr.Row():
        output = gr.Textbox(label="Output Text", lines=6)

    btn.click(
        translate,
        inputs=[input_text, src_lang, target_lang],
        outputs=output,
    )

demo.launch()