Spaces: artificialguybr/video-dubbing
Build error

artificialguybr committed • 75517c0
Parent(s): 285da88

Update app.py

app.py CHANGED
@@ -5,7 +5,8 @@ import subprocess
 import os, stat
 import uuid
 from googletrans import Translator
-
+import edge_tts
+import asyncio
 import ffmpeg
 import json
 from scipy.signal import wiener
@@ -24,7 +25,6 @@ from huggingface_hub import HfApi
 import moviepy.editor as mp
 
 HF_TOKEN = os.environ.get("HF_TOKEN")
-os.environ["COQUI_TOS_AGREED"] = "1"
 api = HfApi(token=HF_TOKEN)
 repo_id = "artificialguybr/video-dubbing"
 ZipFile("ffmpeg.zip").extractall()
@@ -121,6 +121,10 @@ def transcribe_audio(file_path):
 
     return result
 
+async def text_to_speech(text, voice, output_file):
+    communicate = edge_tts.Communicate(text, voice)
+    await communicate.save(output_file)
+
 @spaces.GPU
 def process_video(radio, video, target_language, has_closeup_face):
     try:
@@ -156,15 +160,34 @@ def process_video(radio, video, target_language, has_closeup_face):
         print(f"Error encountered during transcription: {str(e)}")
         raise
 
-        language_mapping = {
-
+        language_mapping = {
+            'English': ('en', 'en-US-EricNeural'),
+            'Spanish': ('es', 'es-ES-AlvaroNeural'),
+            'French': ('fr', 'fr-FR-HenriNeural'),
+            'German': ('de', 'de-DE-ConradNeural'),
+            'Italian': ('it', 'it-IT-DiegoNeural'),
+            'Portuguese': ('pt', 'pt-PT-DuarteNeural'),
+            'Polish': ('pl', 'pl-PL-MarekNeural'),
+            'Turkish': ('tr', 'tr-TR-AhmetNeural'),
+            'Russian': ('ru', 'ru-RU-DmitryNeural'),
+            'Dutch': ('nl', 'nl-NL-MaartenNeural'),
+            'Czech': ('cs', 'cs-CZ-AntoninNeural'),
+            'Arabic': ('ar', 'ar-SA-HamedNeural'),
+            'Chinese (Simplified)': ('zh-CN', 'zh-CN-YunxiNeural'),
+            'Japanese': ('ja', 'ja-JP-KeitaNeural'),
+            'Korean': ('ko', 'ko-KR-InJoonNeural'),
+            'Hindi': ('hi', 'hi-IN-MadhurNeural'),
+            'Swedish': ('sv', 'sv-SE-MattiasNeural'),
+            'Danish': ('da', 'da-DK-JeppeNeural'),
+            'Finnish': ('fi', 'fi-FI-HarriNeural'),
+            'Greek': ('el', 'el-GR-NestorasNeural')
+        }
+        target_language_code, voice = language_mapping[target_language]
         translator = Translator()
         translated_text = translator.translate(whisper_text, dest=target_language_code).text
         print(translated_text)
 
-
-        tts.to('cuda')
-        tts.tts_to_file(translated_text, speaker_wav=f"{run_uuid}_output_audio_final.wav", file_path=f"{run_uuid}_output_synth.wav", language=target_language_code)
+        asyncio.run(text_to_speech(translated_text, voice, f"{run_uuid}_output_synth.wav"))
 
         pad_top = 0
         pad_bottom = 15
@@ -228,7 +251,7 @@ iface = gr.Interface(
     inputs=[
         radio,
         video,
-        gr.Dropdown(choices=["English", "Spanish", "French", "German", "Italian", "Portuguese", "Polish", "Turkish", "Russian", "Dutch", "Czech", "Arabic", "Chinese (Simplified)"], label="Target Language for Dubbing", value="Spanish"),
+        gr.Dropdown(choices=["English", "Spanish", "French", "German", "Italian", "Portuguese", "Polish", "Turkish", "Russian", "Dutch", "Czech", "Arabic", "Chinese (Simplified)", "Japanese", "Korean", "Hindi", "Swedish", "Danish", "Finnish", "Greek"], label="Target Language for Dubbing", value="Spanish"),
         gr.Checkbox(
             label="Video has a close-up face. Use Wav2lip.",
             value=False,
@@ -246,10 +269,9 @@ with gr.Blocks() as demo:
     radio.change(swap, inputs=[radio], outputs=video)
     gr.Markdown("""
     **Note:**
-    - Video limit is 1 minute. It will
+    - Video limit is 1 minute. It will dubbing all people using just one voice.
     - Generation may take up to 5 minutes.
-    -
-    - The tool uses open-source models for all models. It's a alpha version.
+    - The tool uses open-source models for all models. It's an alpha version.
     - Quality can be improved but would require more processing time per video. For scalability and hardware limitations, speed was chosen, not just quality.
     - If you need more than 1 minute, duplicate the Space and change the limit on app.py.
     - If you incorrectly mark the 'Video has a close-up face' checkbox, the dubbing may not work as expected.