File size: 1,307 Bytes
5afd42b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96aee55
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import ctranslate2
import sentencepiece as spm


# Directory handed to ctranslate2.Translator as the model location.
modelDir = "./model"
# SentencePiece model files: the source one is used to tokenize input, the
# target one to detokenize output (presumably Japanese -> English, judging by
# the "ja"/"en" file names — confirm against the model actually shipped).
sp_source_model = "./model/spm.ja.nopretok.model"
sp_target_model = "./model/spm.en.nopretok.model"
# inter_threads: how many independent operations can be executed simultaneously.
# NOTE: the translator is constructed at import time, so merely importing this
# module requires the model directory above to be present.
translator = ctranslate2.Translator(modelDir, device="cpu", intra_threads=4, inter_threads=1)


# Loaded source-side SentencePiece processors, keyed by model path, so the
# model file is read from disk once instead of on every tokenizeBatch() call.
_source_processors = {}


def _get_source_processor():
    """Return the processor for ``sp_source_model``, loading it on first use."""
    processor = _source_processors.get(sp_source_model)
    if processor is None:
        processor = spm.SentencePieceProcessor(sp_source_model)
        _source_processors[sp_source_model] = processor
    return processor


def tokenizeBatch(text):
    """Tokenize input text into SentencePiece pieces, always as a batch.

    Args:
        text: Either a single string or a list of strings.

    Returns:
        A list with one entry per input sentence, as produced by
        ``encode(..., out_type=str)``. A single string is wrapped so that it
        yields a one-element batch.

    Raises:
        TypeError: If ``text`` is neither a ``str`` nor a ``list``. Previously
            such input fell through and returned ``None``, which only failed
            later inside the translator with a less helpful error.
    """
    # Validate before touching the model so bad input fails fast and cheaply.
    if not isinstance(text, (list, str)):
        raise TypeError(
            f"tokenizeBatch expects a str or a list of str, got {type(text).__name__}"
        )

    processor = _get_source_processor()
    if isinstance(text, list):
        return processor.encode(text, out_type=str)
    # A lone sentence is wrapped to keep the return shape batch-like.
    return [processor.encode(text, out_type=str)]


# Loaded target-side SentencePiece processors, keyed by model path, so the
# model file is read from disk once instead of once per decoded sentence.
_target_processors = {}


def _get_target_processor():
    """Return the processor for ``sp_target_model``, loading it on first use."""
    processor = _target_processors.get(sp_target_model)
    if processor is None:
        processor = spm.SentencePieceProcessor(sp_target_model)
        _target_processors[sp_target_model] = processor
    return processor


def detokenizeBatch(text: list):
    """Decode SentencePiece tokens back into text with the target model.

    Args:
        text: The tokens to decode. The caller in this file passes one
            hypothesis from a translation result (``result.hypotheses[0]``),
            which is why the annotation is ``list`` rather than ``str``.

    Returns:
        Whatever ``SentencePieceProcessor.decode`` returns for ``text``
        (presumably a single string for one token list — confirm against the
        SentencePiece documentation).
    """
    return _get_target_processor().decode(text)


def translate(text: str):
    """Translate ``text`` with the module-level CTranslate2 translator.

    The input is tokenized by ``tokenizeBatch``, passed to
    ``translator.translate_batch`` with a fixed set of decoding options, and
    the first hypothesis of every result is detokenized by ``detokenizeBatch``.

    Args:
        text: Forwarded unchanged to ``tokenizeBatch`` (which, despite the
            ``str`` annotation here, also has a branch for a list of strings).

    Returns:
        A list containing one translated string per translation result.
    """
    decoding_options = {
        "num_hypotheses": 1,
        "return_alternatives": False,
        "replace_unknowns": False,
        # The original author tagged this option with the note
        # "repetition_penalty".
        "no_repeat_ngram_size": 3,
        "disable_unk": True,
        "beam_size": 5,
        "sampling_temperature": 0,
    }
    results = translator.translate_batch(
        source=tokenizeBatch(text),
        **decoding_options,
    )

    # Only the first hypothesis of each result is kept.
    best_hypotheses = (result.hypotheses[0] for result in results)
    return [''.join(detokenizeBatch(tokens)) for tokens in best_hypotheses]


if __name__ == "__main__":
    # Manual smoke test: translate one sample title and print the result list.
    sample_title = "ダンガンロンパ 希望の学園と絶望の高校生"
    print(translate(sample_title))