Spaces:
Running
Running
import ctranslate2 | |
from subword_nmt.apply_bpe import BPE | |
import codecs | |
import re | |
def apply_subwording(sample_text, model_code_path):
    """Apply BPE subword segmentation (subword-nmt) to every line of a text.

    Args:
        sample_text: Text to segment; it is split with ``str.splitlines()``
            and each line is segmented independently.
        model_code_path: Path of the BPE codes file, opened as UTF-8 and
            handed to ``subword_nmt.apply_bpe.BPE``.

    Returns:
        The segmented lines, each terminated by ``"\n"`` (an empty string
        when ``sample_text`` contains no lines).
    """
    # The context manager closes the codes file once BPE has been built; the
    # original left the handle open on every call.
    # NOTE(review): relies on BPE reading the codes eagerly in its
    # constructor, which is how subword-nmt's apply_bpe behaves - confirm if
    # the library version changes.
    with codecs.open(model_code_path, encoding='utf-8') as codes_file:
        bpe = BPE(codes_file)

    # Build the result in one pass instead of repeated string concatenation.
    return "".join(
        bpe.process_line(line) + "\n" for line in sample_text.splitlines()
    )
def remove_subwording_marks(translated_text):
    """Return ``translated_text`` with every ``"@@ "`` sequence removed.

    Removing the two at-signs plus the following space glues each subword
    piece back onto the piece that follows it. A ``"@@"`` that is not
    followed by a space (e.g. directly before a newline) is left untouched.
    """
    continuation_marker = re.compile("@@ ")
    return continuation_marker.sub("", translated_text)
def translate_nos(sample_text, model):
    """Translate a text line by line with a CTranslate2 model.

    The text is BPE-segmented, translated on CPU with beam search, and the
    subword marks are removed from the output.

    Args:
        sample_text: Source text; each line is translated independently.
        model: Indexable pair where ``model[0]`` is the BPE codes path passed
            to ``apply_subwording`` and ``model[1]`` is passed to
            ``ctranslate2.Translator`` (presumably the converted model
            directory - verify against the caller).

    Returns:
        The translated text, one ``"\n"``-terminated line per input line.
    """
    tokenizer_model = model[0]
    translator_model = model[1]

    # Apply subwording
    subwording_text = apply_subwording(sample_text, tokenizer_model)

    # Translate entry
    translator = ctranslate2.Translator(translator_model, device="cpu")
    token_batches = [line.strip().split() for line in subwording_text.splitlines()]

    translated_lines = []
    if token_batches:
        # One batched call instead of one call per line: translate_batch
        # returns results in input order, and max_batch_size bounds how many
        # lines are processed together so long texts do not blow up memory.
        results = translator.translate_batch(
            token_batches,
            replace_unknowns=True,
            beam_size=5,
            batch_type='examples',
            max_batch_size=32,
        )
        translated_lines = [' '.join(result.hypotheses[0]) for result in results]

    # Join once rather than growing the output string line by line.
    output = "".join(translated + "\n" for translated in translated_lines)

    # Remove subwording
    return remove_subwording_marks(output)