drown0315 committed on
Commit
5e3d623
·
1 Parent(s): ada1a34

feat: 增加双语字幕

Browse files
Files changed (2) hide show
  1. decode.py +7 -3
  2. requirements.txt +3 -0
decode.py CHANGED
@@ -32,6 +32,7 @@ class Segment:
32
  start: float
33
  duration: float
34
  text: str = ""
 
35
 
36
  @property
37
  def end(self):
@@ -44,6 +45,8 @@ class Segment:
44
  s = s.replace(".", ",")
45
  s += "\n"
46
  s += self.text
 
 
47
  return s
48
 
49
 
@@ -124,12 +127,13 @@ def decode(
124
 
125
  for seg, stream in zip(segments, streams):
126
  en_text = stream.result.text.strip()
127
- cn_text = _llm_translator.translate(en_text)
128
- seg.text = en_text +"\n"+cn_text
129
  if len(seg.text) == 0:
130
  logging.info("Skip empty segment")
131
  continue
132
 
 
 
133
  if len(all_text) == 0:
134
  all_text.append(seg.text)
135
  elif len(all_text[-1][0].encode()) == 1 and len(seg.text[0].encode()) == 1:
@@ -171,7 +175,7 @@ class LLMTranslator:
171
  def translate(self, src_text: str) -> str:
172
  translated = self._model.generate(**self._tokenizer(src_text, return_tensors="pt", padding=True))
173
  res = [self._tokenizer.decode(t, skip_special_tokens=True) for t in translated]
174
- return res
175
 
176
 
177
  _llm_translator = LLMTranslator()
 
32
  start: float
33
  duration: float
34
  text: str = ""
35
+ cn_text: str = ""
36
 
37
  @property
38
  def end(self):
 
45
  s = s.replace(".", ",")
46
  s += "\n"
47
  s += self.text
48
+ s += "\n"
49
+ s += self.cn_text
50
  return s
51
 
52
 
 
127
 
128
  for seg, stream in zip(segments, streams):
129
  en_text = stream.result.text.strip()
130
+ seg.text = en_text
 
131
  if len(seg.text) == 0:
132
  logging.info("Skip empty segment")
133
  continue
134
 
135
+ seg.cn_text = _llm_translator.translate(en_text)
136
+
137
  if len(all_text) == 0:
138
  all_text.append(seg.text)
139
  elif len(all_text[-1][0].encode()) == 1 and len(seg.text[0].encode()) == 1:
 
175
  def translate(self, src_text: str) -> str:
176
  translated = self._model.generate(**self._tokenizer(src_text, return_tensors="pt", padding=True))
177
  res = [self._tokenizer.decode(t, skip_special_tokens=True) for t in translated]
178
+ return "".join(str(itemText) for itemText in res)
179
 
180
 
181
  _llm_translator = LLMTranslator()
requirements.txt CHANGED
@@ -1,3 +1,6 @@
1
  #https://huggingface.co/csukuangfj/sherpa-onnx-wheels/resolve/main/cpu/1.10.29/sherpa_onnx-1.10.29-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
2
  sherpa-onnx>=1.10.35
3
  ffmpeg-python
 
 
 
 
1
  #https://huggingface.co/csukuangfj/sherpa-onnx-wheels/resolve/main/cpu/1.10.29/sherpa_onnx-1.10.29-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
2
  sherpa-onnx>=1.10.35
3
  ffmpeg-python
4
+ transformers
5
+ sentencepiece
6
+ torch