playmak3r committed on
Commit
cf76446
1 Parent(s): 3311693

refactor: model module

Browse files
Files changed (1) hide show
  1. server/model.py +41 -27
server/model.py CHANGED
@@ -1,44 +1,58 @@
 
1
  import ctranslate2
2
  import sentencepiece as spm
3
 
4
 
5
# Model directory handed to CTranslate2, plus the SentencePiece model files
# used for the source side (spm.ja...) and the target side (spm.en...).
modelDir = "./model"
sp_source_model = "./model/spm.ja.nopretok.model"
sp_target_model = "./model/spm.en.nopretok.model"

# inter_threads: how many independent operations can be executed simultaneously
translator = ctranslate2.Translator(
    modelDir,
    device="cpu",
    intra_threads=4,
    inter_threads=1,
)
10
 
 
 
 
11
 
12
def tokenizeBatch(text):
    """Tokenize input text into a batch of SentencePiece token lists.

    Args:
        text: Either a single string or a list of strings.

    Returns:
        A list with one token list per input sentence. A single string is
        wrapped so the result is always a batch.

    Raises:
        TypeError: If ``text`` is neither a ``str`` nor a ``list``.
    """
    # Reject unsupported input up front. Previously the function fell off the
    # end and returned None, which was then passed on as the batch source.
    if not isinstance(text, (list, str)):
        raise TypeError(
            f"text must be a str or a list of str, got {type(text).__name__}"
        )

    sp = spm.SentencePieceProcessor(sp_source_model)
    if isinstance(text, list):
        return sp.encode(text, out_type=str)
    return [sp.encode(text, out_type=str)]
17
 
 
 
 
 
 
 
 
18
 
19
def detokenizeBatch(text: str):
    """Decode SentencePiece output back into text with the target model.

    ``text`` is forwarded unchanged to ``SentencePieceProcessor.decode``.
    NOTE(review): the parameter is annotated ``str`` but ``translate`` passes
    a hypothesis (a token sequence) -- confirm the intended type.
    """
    return spm.SentencePieceProcessor(sp_target_model).decode(text)
 
23
 
24
 
25
def translate(text: str):
    """Translate ``text`` and return one decoded string per input sentence.

    The input is tokenized with ``tokenizeBatch``, run through the module-level
    ``translator`` and the first hypothesis of each result is detokenized.
    """
    decoding_options = dict(
        num_hypotheses=1,
        return_alternatives=False,
        replace_unknowns=False,
        no_repeat_ngram_size=3,  # repetition_penalty
        disable_unk=True,
        beam_size=5,
        sampling_temperature=0,
    )
    results = translator.translate_batch(
        source=tokenizeBatch(text),
        **decoding_options,
    )

    outputs = []
    for result in results:
        best_hypothesis = result.hypotheses[0]
        outputs.append(''.join(detokenizeBatch(best_hypothesis)))
    return outputs
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
 
40
if __name__ == "__main__":
    # Manual smoke test: translate one sample title and print the result.
    sample = "ダンガンロンパ 希望の学園と絶望の高校生"
    translated = translate(sample)
    print(translated)
43
 
44
 
 
1
+ import sys, os
2
  import ctranslate2
3
  import sentencepiece as spm
4
 
5
 
 
 
 
 
 
6
 
7
def indexOf(list: list, value) -> int:
    """Return the index of the first occurrence of ``value``, or -1.

    Args:
        list: The sequence to search (name kept for caller compatibility,
            even though it shadows the builtin inside this function).
        value: The item to look for.

    Returns:
        The zero-based index of ``value``, or ``-1`` when it is not present.
    """
    try:
        return list.index(value)
    except ValueError:
        # Only "not found" maps to -1; any other error is a real bug and
        # must propagate instead of being silently swallowed.
        return -1
10
 
 
 
 
 
 
11
 
12
+ class SugoiTranslator:
13
+ def __init__(self, modelDir= "./model") -> None:
14
+ self.modelDir = modelDir
15
+ self.sp_source_model = os.path.join(modelDir, "spm.ja.nopretok.model")
16
+ self.sp_target_model = os.path.join(modelDir, "spm.en.nopretok.model")
17
+ # inter_threads: quantas operações independentes podem ser executadas simultaneamente
18
+ self.translator = ctranslate2.Translator(modelDir, device="cpu", intra_threads=4, inter_threads=1)
19
 
20
+ def tokenizeBatch(self, text):
21
+ sp = spm.SentencePieceProcessor(self.sp_source_model)
22
+ if isinstance(text, list): return sp.encode(text, out_type=str)
23
+ elif isinstance(text, str):
24
+ return [sp.encode(text, out_type=str)]
25
 
26
 
27
+ def detokenizeBatch(self, text: str):
28
+ sp = spm.SentencePieceProcessor(self.sp_target_model)
29
+ translation = sp.decode(text)
30
+ return translation
 
 
 
 
 
 
 
31
 
32
+
33
+ def translate(self, text: str):
34
+ translated = self.translator.translate_batch(
35
+ source= self.tokenizeBatch(text),
36
+ num_hypotheses= 1,
37
+ return_alternatives= False,
38
+ replace_unknowns= False,
39
+ no_repeat_ngram_size= 3, # repetition_penalty
40
+ disable_unk= True,
41
+ beam_size= 5,
42
+ sampling_temperature= 0,
43
+ )
44
+
45
+ return [''.join( self.detokenizeBatch(result.hypotheses[0]) ) for result in translated]
46
 
47
 
48
if __name__ == "__main__":
    # Manual smoke test. Usage: python model.py [-modelDir PATH]
    # Default to the same directory SugoiTranslator uses, so running without
    # the flag no longer fails with a NameError on an undefined modelDir.
    modelDir = "./model"

    index = indexOf(sys.argv, "-modelDir")
    if index != -1:
        # The flag must be followed by its value; fail with a clear message
        # instead of an IndexError when it is the last argument.
        if index + 1 >= len(sys.argv):
            sys.exit("error: -modelDir requires a path argument")
        modelDir = sys.argv[index + 1]

    sugoiTranslator = SugoiTranslator(modelDir)
    translated = sugoiTranslator.translate("ダンガンロンパ 希望の学園と絶望の高校生")
    print(translated)
57
 
58