Spaces:
Running
on
Zero
Running
on
Zero
cointegrated
committed on
Commit
•
3740b63
1
Parent(s):
2a62da0
use sentence splitters from stopes
Browse files
Signed-off-by: David Dale <daviddale@meta.com>
- app.py +11 -1
- requirements.txt +1 -0
app.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import spaces
|
2 |
import gradio as gr
|
3 |
from sacremoses import MosesPunctNormalizer
|
|
|
4 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
5 |
from flores import code_mapping
|
6 |
import platform
|
@@ -35,6 +36,14 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
|
35 |
|
36 |
punct_normalizer = MosesPunctNormalizer(lang="en")
|
37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
# cache function
|
39 |
@lru_cache(maxsize=100)
|
40 |
def translate(text: str, src_lang: str, tgt_lang: str):
|
@@ -60,7 +69,8 @@ def _translate(text: str, src_lang: str, tgt_lang: str):
|
|
60 |
translated_paragraphs = []
|
61 |
|
62 |
for paragraph in paragraphs:
|
63 |
-
|
|
|
64 |
translated_sentences = []
|
65 |
|
66 |
for sentence in sentences:
|
|
|
1 |
import spaces
|
2 |
import gradio as gr
|
3 |
from sacremoses import MosesPunctNormalizer
|
4 |
+
from stopes.pipelines.monolingual.utils.sentence_split import get_split_algo
|
5 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
6 |
from flores import code_mapping
|
7 |
import platform
|
|
|
36 |
|
37 |
punct_normalizer = MosesPunctNormalizer(lang="en")
|
38 |
|
39 |
+
|
40 |
+
@lru_cache(maxsize=202)
def get_language_specific_sentence_splitter(language_code):
    """Return the sentence splitter selected for ``language_code``.

    Only the first three characters of ``language_code`` are forwarded to
    ``get_split_algo``, together with the ``"default"`` algorithm name.
    Results are memoized by ``lru_cache``, so repeated calls with the same
    code reuse the previously obtained splitter object.

    NOTE(review): callers appear to invoke the returned object on a
    paragraph string and iterate over the result -- confirm against the
    stopes ``get_split_algo`` documentation.
    """
    # Look up the splitter by the three-character prefix of the code.
    return get_split_algo(language_code[:3], "default")
|
45 |
+
|
46 |
+
|
47 |
# cache function
|
48 |
@lru_cache(maxsize=100)
|
49 |
def translate(text: str, src_lang: str, tgt_lang: str):
|
|
|
69 |
translated_paragraphs = []
|
70 |
|
71 |
for paragraph in paragraphs:
|
72 |
+
splitter = get_language_specific_sentence_splitter(src_code)
|
73 |
+
sentences = list(splitter(paragraph))
|
74 |
translated_sentences = []
|
75 |
|
76 |
for sentence in sentences:
|
requirements.txt
CHANGED
@@ -5,3 +5,4 @@ gradio==4.32.2
|
|
5 |
spaces
|
6 |
nltk
|
7 |
sacremoses
|
|
|
|
5 |
spaces
|
6 |
nltk
|
7 |
sacremoses
|
8 |
+
stopes[mono] @ git+https://github.com/facebookresearch/stopes@better-sentence-splitters
|