drown0315 committed
Commit · 5e3d623
1 Parent(s): ada1a34
feat: add bilingual subtitles
Files changed:
- decode.py +7 -3
- requirements.txt +3 -0
decode.py
CHANGED
@@ -32,6 +32,7 @@ class Segment:
     start: float
     duration: float
     text: str = ""
+    cn_text: str = ""
 
     @property
     def end(self):
@@ -44,6 +45,8 @@ class Segment:
         s = s.replace(".", ",")
         s += "\n"
         s += self.text
+        s += "\n"
+        s += self.cn_text
         return s
 
 
@@ -124,12 +127,13 @@ def decode(
 
     for seg, stream in zip(segments, streams):
         en_text = stream.result.text.strip()
-
-        seg.text = en_text +"\n"+cn_text
+        seg.text = en_text
         if len(seg.text) == 0:
             logging.info("Skip empty segment")
             continue
 
+        seg.cn_text = _llm_translator.translate(en_text)
+
         if len(all_text) == 0:
             all_text.append(seg.text)
         elif len(all_text[-1][0].encode()) == 1 and len(seg.text[0].encode()) == 1:
@@ -171,7 +175,7 @@ class LLMTranslator:
     def translate(self, src_text: str) -> str:
         translated = self._model.generate(**self._tokenizer(src_text, return_tensors="pt", padding=True))
        res = [self._tokenizer.decode(t, skip_special_tokens=True) for t in translated]
-        return res
+        return "".join(str(itemText) for itemText in res)
 
 
 _llm_translator = LLMTranslator()
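Read together, these hunks keep the recognizer's English line in seg.text and store the translation in the new cn_text field, so each subtitle cue carries both languages. Below is a minimal, runnable sketch of the resulting Segment behavior, assuming surrounding code not shown in the hunks; the __str__ method name and the _ts timestamp helper are hypothetical stand-ins:

from dataclasses import dataclass

@dataclass
class Segment:
    start: float
    duration: float
    text: str = ""       # English line from the recognizer
    cn_text: str = ""    # Chinese line from the translator (new in this commit)

    @property
    def end(self):
        return self.start + self.duration

    def _ts(self, t: float) -> str:
        # Hypothetical SRT timestamp helper; the real one lives outside the diff hunks.
        h, rem = divmod(int(t), 3600)
        m, sec = divmod(rem, 60)
        ms = int(round((t - int(t)) * 1000))
        return f"{h:02d}:{m:02d}:{sec:02d}.{ms:03d}"

    def __str__(self):
        s = f"{self._ts(self.start)} --> {self._ts(self.end)}"
        s = s.replace(".", ",")  # SRT uses a comma as the decimal separator
        s += "\n"
        s += self.text
        s += "\n"
        s += self.cn_text
        return s

print(Segment(start=0.0, duration=2.5, text="Hello world", cn_text="你好,世界"))
# 00:00:00,000 --> 00:00:02,500
# Hello world
# 你好,世界

Note that the empty-segment check runs on seg.text before translate() is called, so empty English segments never reach the translator.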
requirements.txt
CHANGED
@@ -1,3 +1,6 @@
 #https://huggingface.co/csukuangfj/sherpa-onnx-wheels/resolve/main/cpu/1.10.29/sherpa_onnx-1.10.29-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
 sherpa-onnx>=1.10.35
 ffmpeg-python
+transformers
+sentencepiece
+torch
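The three new packages back the LLMTranslator in decode.py: transformers supplies the seq2seq model and tokenizer, sentencepiece is the tokenizer backend, and torch matches return_tensors="pt". The checkpoint is not pinned anywhere in this diff, so the model name below is an assumption; a minimal sketch using a MarianMT English-to-Chinese model such as Helsinki-NLP/opus-mt-en-zh:

# Sketch of the translate() path from decode.py. The checkpoint name is an
# assumption (it is not in the diff); any en->zh seq2seq model would slot in.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

class LLMTranslator:
    def __init__(self, model_name: str = "Helsinki-NLP/opus-mt-en-zh"):
        self._tokenizer = AutoTokenizer.from_pretrained(model_name)
        self._model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    def translate(self, src_text: str) -> str:
        # Same shape as the diff: tokenize, generate, decode, then join the
        # decoded pieces into a single string.
        translated = self._model.generate(**self._tokenizer(src_text, return_tensors="pt", padding=True))
        res = [self._tokenizer.decode(t, skip_special_tokens=True) for t in translated]
        return "".join(str(itemText) for itemText in res)

print(LLMTranslator().translate("How are you?"))

Joining the decoded pieces replaces the old return res, which handed back a list rather than the string that decode() expects for seg.cn_text.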