|
import functools

import ctranslate2
import sentencepiece as spm
|
|
|
|
|
# Path handed to ctranslate2.Translator; the tokenizer files below live in
# the same directory.
modelDir = "./model"

# SentencePiece model files: the source one is used for encoding input text,
# the target one for decoding translated pieces.
sp_source_model = "./model/spm.ja.nopretok.model"
sp_target_model = "./model/spm.en.nopretok.model"

# Single translator instance, built once when the module is imported.
translator = ctranslate2.Translator(
    modelDir,
    device="cpu",
    intra_threads=4,
    inter_threads=1,
)
|
|
|
|
|
@functools.lru_cache(maxsize=1)
def _load_source_processor(model_path):
    """Load the SentencePiece model at *model_path* once and reuse it."""
    return spm.SentencePieceProcessor(model_path)


def tokenizeBatch(text):
    """Encode *text* into SentencePiece string pieces, always as a batch.

    Args:
        text: Either a single ``str`` or a ``list`` that is forwarded to
            ``encode`` unchanged.

    Returns:
        For a ``str``: a one-element list wrapping the encoded pieces.
        For a ``list``: whatever ``encode`` returns for that list.

    Raises:
        TypeError: If *text* is neither a ``str`` nor a ``list``. Previously
            this case fell through and returned ``None``.
    """
    # Validate before touching the model so bad input fails fast and cheaply.
    if isinstance(text, str):
        is_single = True
    elif isinstance(text, list):
        is_single = False
    else:
        raise TypeError(
            f"tokenizeBatch expects a str or a list, got {type(text).__name__}"
        )

    # The processor is cached per model path instead of being re-read from
    # disk on every call.
    processor = _load_source_processor(sp_source_model)
    pieces = processor.encode(text, out_type=str)
    return [pieces] if is_single else pieces
|
|
|
|
|
@functools.lru_cache(maxsize=1)
def _load_target_processor(model_path):
    """Load the SentencePiece model at *model_path* once and reuse it."""
    return spm.SentencePieceProcessor(model_path)


def detokenizeBatch(text: str):
    """Decode SentencePiece output back into text with the target model.

    Args:
        text: Value forwarded unchanged to ``SentencePieceProcessor.decode``.
            NOTE(review): the annotation says ``str``, but the caller in this
            file passes ``result.hypotheses[0]`` from ``translate_batch`` --
            confirm the intended type and update the annotation.

    Returns:
        Whatever ``decode`` returns for *text*.
    """
    # Cached per model path: this function is called once per translation
    # result, so reloading the model each time repeats the same disk read.
    processor = _load_target_processor(sp_target_model)
    return processor.decode(text)
|
|
|
|
|
def translate(text: str):
    """Translate *text* with the module-level ``translator``.

    The input is tokenized with ``tokenizeBatch`` (which also accepts a
    list, despite the ``str`` annotation here), run through
    ``translate_batch`` with fixed decoding options, and the first
    hypothesis of every result is detokenized.

    Returns:
        A list containing one string per translation result.
    """
    # Fixed decoding configuration passed to translate_batch.
    decoding_options = {
        "num_hypotheses": 1,
        "return_alternatives": False,
        "replace_unknowns": False,
        "no_repeat_ngram_size": 3,
        "disable_unk": True,
        "beam_size": 5,
        "sampling_temperature": 0,
    }
    results = translator.translate_batch(
        source=tokenizeBatch(text),
        **decoding_options,
    )

    outputs = []
    for result in results:
        best_hypothesis = result.hypotheses[0]
        # Joining is a no-op when the decoded value is already a single str.
        outputs.append(''.join(detokenizeBatch(best_hypothesis)))
    return outputs
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: translate one sample title and print the result list.
    print(translate("ダンガンロンパ 希望の学園と絶望の高校生"))
|
|
|
|
|
|