davanstrien HF staff committed on
Commit
71ae380
1 Parent(s): 620b306

sliding window

Browse files
Files changed (1) hide show
  1. app.py +30 -41
app.py CHANGED
@@ -1,16 +1,12 @@
1
  import spaces
2
  import gradio as gr
3
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
4
  from flores import code_mapping
5
  import platform
6
- import re
7
 
8
  device = "cpu" if platform.system() == "Darwin" else "cuda"
9
- MODEL_NAME = (
10
- "facebook/nllb-200-distilled-600M"
11
- if platform.system() == "Darwin"
12
- else "facebook/nllb-200-3.3B"
13
- )
14
 
15
  code_mapping = dict(sorted(code_mapping.items(), key=lambda item: item[1]))
16
  flores_codes = list(code_mapping.keys())
@@ -26,49 +22,42 @@ model, tokenizer = load_model()
26
 
27
 
28
  @spaces.GPU
29
- def _translate(text: str, src_lang: str, tgt_lang: str):
30
- source = code_mapping[src_lang]
31
- target = code_mapping[tgt_lang]
32
- translator = pipeline(
33
- "translation",
34
- model=model,
35
- tokenizer=tokenizer,
36
- src_lang=source,
37
- tgt_lang=target,
38
- device=device,
39
- )
40
- output = translator(text, max_length=400)
41
- return output[0]["translation_text"]
42
-
43
-
44
- def translate(text: str, src_lang: str, tgt_lang: str):
45
- # split the input text into smaller chunks
46
- outputs = ""
47
- paragraph_chunks = text.split("\n")
48
- for chunk in paragraph_chunks:
49
- # check if the chunk is too long
50
- if len(chunk) > 500:
51
- # split on full stops, question marks, and exclamation marks
52
- sentence_chunks = re.split(r"(?<=[.!?])\s+", chunk)
53
- for sentence in sentence_chunks:
54
- if sentence.strip(): # check if the sentence is not empty
55
- outputs += f"{_translate(sentence, src_lang, tgt_lang)} "
56
- outputs += "\n\n"
57
- else:
58
- outputs += _translate(chunk, src_lang, tgt_lang) + "\n\n"
59
-
60
- return outputs.strip()
61
 
62
 
63
  description = """
64
- No Language Left Behind (NLLB) is a series of open-source models aiming to provide high-quality translations between 200 language.
65
  This demo application allows you to use the NLLB model to translate text between a source and target language.
66
 
67
  ## Notes
68
 
69
  - Whilst the model supports 200 languages, the quality of translations may vary between languages.
70
  - "Low Resource" languages (languages which are less present on the internet and have a lower amount of investment) may have lower quality translations.
71
- - The demo is not intended to be used for very long texts.
72
  """
73
 
74
  instructions = """
 
1
  import spaces
2
  import gradio as gr
3
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
4
  from flores import code_mapping
5
  import platform
6
+ import torch
7
 
8
  device = "cpu" if platform.system() == "Darwin" else "cuda"
9
+ MODEL_NAME = "facebook/m2m100_1.2B"
 
 
 
 
10
 
11
  code_mapping = dict(sorted(code_mapping.items(), key=lambda item: item[1]))
12
  flores_codes = list(code_mapping.keys())
 
22
 
23
 
24
@spaces.GPU
def translate(
    text: str,
    src_lang: str,
    tgt_lang: str,
    window_size: int = 800,
    overlap_size: int = 200,
):
    """Translate ``text`` from ``src_lang`` to ``tgt_lang`` with a sliding token window.

    Long inputs are tokenized once, then translated in overlapping windows of
    ``window_size`` tokens (stepping by ``window_size - overlap_size``) so the
    model never sees a sequence longer than it can handle.

    Args:
        text: Source text to translate.
        src_lang: Display name of the source language (a key of ``code_mapping``).
        tgt_lang: Display name of the target language (a key of ``code_mapping``).
        window_size: Maximum number of input tokens per generation call.
        overlap_size: Number of tokens shared between consecutive windows.

    Returns:
        The translated chunks joined with single spaces.

    Raises:
        ValueError: If ``overlap_size`` is not smaller than ``window_size``.
        KeyError: If either language name is not present in ``code_mapping``.
    """
    if overlap_size >= window_size:
        # A non-positive step would make range() raise (or loop incorrectly).
        raise ValueError("overlap_size must be smaller than window_size")

    # UI passes display names; the model/tokenizer need language codes.
    # (Matches how the rest of the file resolves languages via code_mapping.)
    src_code = code_mapping[src_lang]
    tgt_code = code_mapping[tgt_lang]

    # Without this the tokenizer encodes with its default source language and
    # the src_lang argument is silently ignored.
    tokenizer.src_lang = src_code

    input_tokens = tokenizer.encode(text, return_tensors="pt")[0].cpu().numpy().tolist()
    translated_chunks = []
    step = window_size - overlap_size

    # NOTE(review): overlapping windows are each translated in full, so the
    # overlap region is translated twice and duplicated in the output — this
    # preserves the committed behavior; de-duplication would change results.
    for i in range(0, len(input_tokens), step):
        window = input_tokens[i : i + window_size]
        with torch.no_grad():  # inference only — skip autograd bookkeeping
            generated = model.generate(
                input_ids=torch.tensor([window]).to(device),
                forced_bos_token_id=tokenizer.lang_code_to_id[tgt_code],
                max_length=window_size,
                num_return_sequences=1,
            )
        translated_chunks.append(
            tokenizer.decode(generated[0], skip_special_tokens=True)
        )

    return " ".join(translated_chunks)
 
 
 
 
 
 
 
50
 
51
 
52
  description = """
53
+ No Language Left Behind (NLLB) is a series of open-source models aiming to provide high-quality translations between 200 languages.
54
  This demo application allows you to use the NLLB model to translate text between a source and target language.
55
 
56
  ## Notes
57
 
58
  - Whilst the model supports 200 languages, the quality of translations may vary between languages.
59
  - "Low Resource" languages (languages which are less present on the internet and have a lower amount of investment) may have lower quality translations.
60
+ - The demo uses a sliding window approach to handle longer texts.
61
  """
62
 
63
  instructions = """