cointegrated committed on
Commit
3740b63
1 Parent(s): 2a62da0

use sentence splitters from stopes

Browse files

Signed-off-by: David Dale <daviddale@meta.com>

Files changed (2) hide show
  1. app.py +11 -1
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import spaces
2
  import gradio as gr
3
  from sacremoses import MosesPunctNormalizer
 
4
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
5
  from flores import code_mapping
6
  import platform
@@ -35,6 +36,14 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
35
 
36
  punct_normalizer = MosesPunctNormalizer(lang="en")
37
 
 
 
 
 
 
 
 
 
38
  # cache function
39
  @lru_cache(maxsize=100)
40
  def translate(text: str, src_lang: str, tgt_lang: str):
@@ -60,7 +69,8 @@ def _translate(text: str, src_lang: str, tgt_lang: str):
60
  translated_paragraphs = []
61
 
62
  for paragraph in paragraphs:
63
- sentences = nltk.sent_tokenize(paragraph)
 
64
  translated_sentences = []
65
 
66
  for sentence in sentences:
 
1
  import spaces
2
  import gradio as gr
3
  from sacremoses import MosesPunctNormalizer
4
+ from stopes.pipelines.monolingual.utils.sentence_split import get_split_algo
5
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
6
  from flores import code_mapping
7
  import platform
 
36
 
37
  punct_normalizer = MosesPunctNormalizer(lang="en")
38
 
39
+
40
+ @lru_cache(maxsize=202)
41
+ def get_language_specific_sentence_splitter(language_code):
42
+ short_code = language_code[:3]
43
+ splitter = get_split_algo(short_code, "default")
44
+ return splitter
45
+
46
+
47
  # cache function
48
  @lru_cache(maxsize=100)
49
  def translate(text: str, src_lang: str, tgt_lang: str):
 
69
  translated_paragraphs = []
70
 
71
  for paragraph in paragraphs:
72
+ splitter = get_language_specific_sentence_splitter(src_code)
73
+ sentences = list(splitter(paragraph))
74
  translated_sentences = []
75
 
76
  for sentence in sentences:
requirements.txt CHANGED
@@ -5,3 +5,4 @@ gradio==4.32.2
5
  spaces
6
  nltk
7
  sacremoses
 
 
5
  spaces
6
  nltk
7
  sacremoses
8
+ stopes[mono] @ git+https://github.com/facebookresearch/stopes@better-sentence-splitters