Roger Condori committed on
Commit e6b8403 • 1 Parent(s): e67b186

add base app

.github/workflows/main.yml ADDED
@@ -0,0 +1,23 @@
+ name: Sync to Hugging Face hub
+ on:
+   push:
+     branches: [main]
+
+   # to run this workflow manually from the Actions tab
+   workflow_dispatch:
+
+ jobs:
+   sync-to-hub:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v2
+         with:
+           fetch-depth: 0
+       - name: Add remote
+         env:
+           HF: ${{ secrets.HF }}
+         run: git remote add space https://r3gm:$HF@huggingface.co/spaces/r3gm/SoniTranslate_translate_audio_of_a_video_content
+       - name: Push to hub
+         env:
+           HF: ${{ secrets.HF }}
+         run: git push --force https://r3gm:$HF@huggingface.co/spaces/r3gm/SoniTranslate_translate_audio_of_a_video_content main
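For reference, a rough local equivalent of the workflow's two `run:` steps, written as a Python sketch (not part of the commit). It assumes a Hugging Face write token is exported in an `HF` environment variable, mirroring the `secrets.HF` used above.

```python
# Rough local equivalent of the workflow's push step (sketch, not part of the commit).
# Assumes a Hugging Face write token is available in the HF environment variable.
import os
import subprocess

token = os.environ["HF"]
space_url = (
    f"https://r3gm:{token}@huggingface.co/spaces/"
    "r3gm/SoniTranslate_translate_audio_of_a_video_content"
)

# Mirror of the two run: commands: add the Space as a remote, then force-push main.
subprocess.run(["git", "remote", "add", "space", space_url], check=False)
subprocess.run(["git", "push", "--force", space_url, "main"], check=True)
```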
README.md CHANGED
@@ -1,2 +1,12 @@
- # sonitranslate_app_hf
- hugging face app
+ ---
+ title: SoniTranslate_translate_audio_of_a_video_content
+ emoji: 🦀
+ colorFrom: indigo
+ colorTo: green
+ sdk: gradio
+ sdk_version: 3.35.2
+ app_file: app.py
+ pinned: true
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,470 @@
+
+ #os.system("git clone https://github.com/R3gm/SoniTranslate")
+ # pip install -r requirements.txt
+ import numpy as np
+ import gradio as gr
+ import whisperx
+ import torch
+ from gtts import gTTS
+ import librosa
+ import edge_tts
+ import asyncio
+ import gc
+ from pydub import AudioSegment
+ from tqdm import tqdm
+ from deep_translator import GoogleTranslator
+ import os
+ from soni_translate.audio_segments import create_translated_audio
+ from soni_translate.text_to_speech import make_voice_gradio
+ from soni_translate.translate_segments import translate_text
+ #from soni_translate import test
+
+ title = "<center><strong><font size='7'>📽️ SoniTranslate 🈷️</font></strong></center>"
+
+ news = """ ## 📖 News
+ 🔥 2023/07/01: Support (Thanks for [text](https://github.com)).
+ """
+
+ description = """ ## Translate the audio of video content from one language to another while preserving synchronization.
+
+
+ This is a demo of the GitHub project 📽️ [SoniTranslate](https://github.com/R3gm/SoniTranslate).
+
+ 📼 You can upload a video or provide a video link. Generation is **limited to 10 seconds** to prevent errors with the queue on CPU. If you use a GPU, you won't have any of these limitations.
+
+ 🚀 To **translate a video of any duration** and get faster results, you can use the Colab notebook with a GPU.
+
+ [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://github.com/R3gm/SoniTranslate/blob/main/SoniTranslate_Colab.ipynb)
+
+ """
+
+ tutorial = """ # 🔰 Instructions for use.
+
+ 1. Upload a video on the first tab or use a video link on the second tab.
+
+ 2. Choose the language into which you want to translate the video.
+
+ 3. Specify the number of people speaking in the video and assign each one a text-to-speech voice suitable for the translation language.
+
+ 4. Press the 'Translate' button to obtain the results.
+
+ """
+
+
+ if not os.path.exists('audio'):
+     os.makedirs('audio')
+
+ if not os.path.exists('audio2/audio'):
+     os.makedirs('audio2/audio')
+
+ # Check GPU
+ if torch.cuda.is_available():
+     device = "cuda"
+     list_compute_type = ['float16', 'float32']
+     compute_type_default = 'float16'
+     whisper_model_default = 'large-v1'
+ else:
+     device = "cpu"
+     list_compute_type = ['float32']
+     compute_type_default = 'float32'
+     whisper_model_default = 'base'
+ print('Working in: ', device)
+
+
+ # Download an audio
+ #url = "https://www.youtube.com/watch?v=Rdi-SNhe2v4"
+
+ ### INIT
+ list_tts = ['af-ZA-AdriNeural-Female', 'af-ZA-WillemNeural-Male', 'am-ET-AmehaNeural-Male', 'am-ET-MekdesNeural-Female', 'ar-AE-FatimaNeural-Female', 'ar-AE-HamdanNeural-Male', 'ar-BH-AliNeural-Male', 'ar-BH-LailaNeural-Female', 'ar-DZ-AminaNeural-Female', 'ar-DZ-IsmaelNeural-Male', 'ar-EG-SalmaNeural-Female', 'ar-EG-ShakirNeural-Male', 'ar-IQ-BasselNeural-Male', 'ar-IQ-RanaNeural-Female', 'ar-JO-SanaNeural-Female', 'ar-JO-TaimNeural-Male', 'ar-KW-FahedNeural-Male', 'ar-KW-NouraNeural-Female', 'ar-LB-LaylaNeural-Female', 'ar-LB-RamiNeural-Male', 'ar-LY-ImanNeural-Female', 'ar-LY-OmarNeural-Male', 'ar-MA-JamalNeural-Male', 'ar-MA-MounaNeural-Female', 'ar-OM-AbdullahNeural-Male', 'ar-OM-AyshaNeural-Female', 'ar-QA-AmalNeural-Female', 'ar-QA-MoazNeural-Male', 'ar-SA-HamedNeural-Male', 'ar-SA-ZariyahNeural-Female', 'ar-SY-AmanyNeural-Female', 'ar-SY-LaithNeural-Male', 'ar-TN-HediNeural-Male', 'ar-TN-ReemNeural-Female', 'ar-YE-MaryamNeural-Female', 'ar-YE-SalehNeural-Male', 'az-AZ-BabekNeural-Male', 'az-AZ-BanuNeural-Female', 'bg-BG-BorislavNeural-Male', 'bg-BG-KalinaNeural-Female', 'bn-BD-NabanitaNeural-Female', 'bn-BD-PradeepNeural-Male', 'bn-IN-BashkarNeural-Male', 'bn-IN-TanishaaNeural-Female', 'bs-BA-GoranNeural-Male', 'bs-BA-VesnaNeural-Female', 'ca-ES-EnricNeural-Male', 'ca-ES-JoanaNeural-Female', 'cs-CZ-AntoninNeural-Male', 'cs-CZ-VlastaNeural-Female', 'cy-GB-AledNeural-Male', 'cy-GB-NiaNeural-Female', 'da-DK-ChristelNeural-Female', 'da-DK-JeppeNeural-Male', 'de-AT-IngridNeural-Female', 'de-AT-JonasNeural-Male', 'de-CH-JanNeural-Male', 'de-CH-LeniNeural-Female', 'de-DE-AmalaNeural-Female', 'de-DE-ConradNeural-Male', 'de-DE-KatjaNeural-Female', 'de-DE-KillianNeural-Male', 'el-GR-AthinaNeural-Female', 'el-GR-NestorasNeural-Male', 'en-AU-NatashaNeural-Female', 'en-AU-WilliamNeural-Male', 'en-CA-ClaraNeural-Female', 'en-CA-LiamNeural-Male', 'en-GB-LibbyNeural-Female', 'en-GB-MaisieNeural-Female', 'en-GB-RyanNeural-Male', 'en-GB-SoniaNeural-Female', 'en-GB-ThomasNeural-Male', 'en-HK-SamNeural-Male', 'en-HK-YanNeural-Female', 'en-IE-ConnorNeural-Male', 'en-IE-EmilyNeural-Female', 'en-IN-NeerjaExpressiveNeural-Female', 'en-IN-NeerjaNeural-Female', 'en-IN-PrabhatNeural-Male', 'en-KE-AsiliaNeural-Female', 'en-KE-ChilembaNeural-Male', 'en-NG-AbeoNeural-Male', 'en-NG-EzinneNeural-Female', 'en-NZ-MitchellNeural-Male', 'en-NZ-MollyNeural-Female', 'en-PH-JamesNeural-Male', 'en-PH-RosaNeural-Female', 'en-SG-LunaNeural-Female', 'en-SG-WayneNeural-Male', 'en-TZ-ElimuNeural-Male', 'en-TZ-ImaniNeural-Female', 'en-US-AnaNeural-Female', 'en-US-AriaNeural-Female', 'en-US-ChristopherNeural-Male', 'en-US-EricNeural-Male', 'en-US-GuyNeural-Male', 'en-US-JennyNeural-Female', 'en-US-MichelleNeural-Female', 'en-US-RogerNeural-Male', 'en-US-SteffanNeural-Male', 'en-ZA-LeahNeural-Female', 'en-ZA-LukeNeural-Male', 'es-AR-ElenaNeural-Female', 'es-AR-TomasNeural-Male', 'es-BO-MarceloNeural-Male', 'es-BO-SofiaNeural-Female', 'es-CL-CatalinaNeural-Female', 'es-CL-LorenzoNeural-Male', 'es-CO-GonzaloNeural-Male', 'es-CO-SalomeNeural-Female', 'es-CR-JuanNeural-Male', 'es-CR-MariaNeural-Female', 'es-CU-BelkysNeural-Female', 'es-CU-ManuelNeural-Male', 'es-DO-EmilioNeural-Male', 'es-DO-RamonaNeural-Female', 'es-EC-AndreaNeural-Female', 'es-EC-LuisNeural-Male', 'es-ES-AlvaroNeural-Male', 'es-ES-ElviraNeural-Female', 'es-GQ-JavierNeural-Male', 'es-GQ-TeresaNeural-Female', 'es-GT-AndresNeural-Male', 'es-GT-MartaNeural-Female', 'es-HN-CarlosNeural-Male', 'es-HN-KarlaNeural-Female', 'es-MX-DaliaNeural-Female', 
'es-MX-JorgeNeural-Male', 'es-NI-FedericoNeural-Male', 'es-NI-YolandaNeural-Female', 'es-PA-MargaritaNeural-Female', 'es-PA-RobertoNeural-Male', 'es-PE-AlexNeural-Male', 'es-PE-CamilaNeural-Female', 'es-PR-KarinaNeural-Female', 'es-PR-VictorNeural-Male', 'es-PY-MarioNeural-Male', 'es-PY-TaniaNeural-Female', 'es-SV-LorenaNeural-Female', 'es-SV-RodrigoNeural-Male', 'es-US-AlonsoNeural-Male', 'es-US-PalomaNeural-Female', 'es-UY-MateoNeural-Male', 'es-UY-ValentinaNeural-Female', 'es-VE-PaolaNeural-Female', 'es-VE-SebastianNeural-Male', 'et-EE-AnuNeural-Female', 'et-EE-KertNeural-Male', 'fa-IR-DilaraNeural-Female', 'fa-IR-FaridNeural-Male', 'fi-FI-HarriNeural-Male', 'fi-FI-NooraNeural-Female', 'fil-PH-AngeloNeural-Male', 'fil-PH-BlessicaNeural-Female', 'fr-BE-CharlineNeural-Female', 'fr-BE-GerardNeural-Male', 'fr-CA-AntoineNeural-Male', 'fr-CA-JeanNeural-Male', 'fr-CA-SylvieNeural-Female', 'fr-CH-ArianeNeural-Female', 'fr-CH-FabriceNeural-Male', 'fr-FR-DeniseNeural-Female', 'fr-FR-EloiseNeural-Female', 'fr-FR-HenriNeural-Male', 'ga-IE-ColmNeural-Male', 'ga-IE-OrlaNeural-Female', 'gl-ES-RoiNeural-Male', 'gl-ES-SabelaNeural-Female', 'gu-IN-DhwaniNeural-Female', 'gu-IN-NiranjanNeural-Male', 'he-IL-AvriNeural-Male', 'he-IL-HilaNeural-Female', 'hi-IN-MadhurNeural-Male', 'hi-IN-SwaraNeural-Female', 'hr-HR-GabrijelaNeural-Female', 'hr-HR-SreckoNeural-Male', 'hu-HU-NoemiNeural-Female', 'hu-HU-TamasNeural-Male', 'id-ID-ArdiNeural-Male', 'id-ID-GadisNeural-Female', 'is-IS-GudrunNeural-Female', 'is-IS-GunnarNeural-Male', 'it-IT-DiegoNeural-Male', 'it-IT-ElsaNeural-Female', 'it-IT-IsabellaNeural-Female', 'ja-JP-KeitaNeural-Male', 'ja-JP-NanamiNeural-Female', 'jv-ID-DimasNeural-Male', 'jv-ID-SitiNeural-Female', 'ka-GE-EkaNeural-Female', 'ka-GE-GiorgiNeural-Male', 'kk-KZ-AigulNeural-Female', 'kk-KZ-DauletNeural-Male', 'km-KH-PisethNeural-Male', 'km-KH-SreymomNeural-Female', 'kn-IN-GaganNeural-Male', 'kn-IN-SapnaNeural-Female', 'ko-KR-InJoonNeural-Male', 'ko-KR-SunHiNeural-Female', 'lo-LA-ChanthavongNeural-Male', 'lo-LA-KeomanyNeural-Female', 'lt-LT-LeonasNeural-Male', 'lt-LT-OnaNeural-Female', 'lv-LV-EveritaNeural-Female', 'lv-LV-NilsNeural-Male', 'mk-MK-AleksandarNeural-Male', 'mk-MK-MarijaNeural-Female', 'ml-IN-MidhunNeural-Male', 'ml-IN-SobhanaNeural-Female', 'mn-MN-BataaNeural-Male', 'mn-MN-YesuiNeural-Female', 'mr-IN-AarohiNeural-Female', 'mr-IN-ManoharNeural-Male', 'ms-MY-OsmanNeural-Male', 'ms-MY-YasminNeural-Female', 'mt-MT-GraceNeural-Female', 'mt-MT-JosephNeural-Male', 'my-MM-NilarNeural-Female', 'my-MM-ThihaNeural-Male', 'nb-NO-FinnNeural-Male', 'nb-NO-PernilleNeural-Female', 'ne-NP-HemkalaNeural-Female', 'ne-NP-SagarNeural-Male', 'nl-BE-ArnaudNeural-Male', 'nl-BE-DenaNeural-Female', 'nl-NL-ColetteNeural-Female', 'nl-NL-FennaNeural-Female', 'nl-NL-MaartenNeural-Male', 'pl-PL-MarekNeural-Male', 'pl-PL-ZofiaNeural-Female', 'ps-AF-GulNawazNeural-Male', 'ps-AF-LatifaNeural-Female', 'pt-BR-AntonioNeural-Male', 'pt-BR-FranciscaNeural-Female', 'pt-PT-DuarteNeural-Male', 'pt-PT-RaquelNeural-Female', 'ro-RO-AlinaNeural-Female', 'ro-RO-EmilNeural-Male', 'ru-RU-DmitryNeural-Male', 'ru-RU-SvetlanaNeural-Female', 'si-LK-SameeraNeural-Male', 'si-LK-ThiliniNeural-Female', 'sk-SK-LukasNeural-Male', 'sk-SK-ViktoriaNeural-Female', 'sl-SI-PetraNeural-Female', 'sl-SI-RokNeural-Male', 'so-SO-MuuseNeural-Male', 'so-SO-UbaxNeural-Female', 'sq-AL-AnilaNeural-Female', 'sq-AL-IlirNeural-Male', 'sr-RS-NicholasNeural-Male', 'sr-RS-SophieNeural-Female', 'su-ID-JajangNeural-Male', 'su-ID-TutiNeural-Female', 
'sv-SE-MattiasNeural-Male', 'sv-SE-SofieNeural-Female', 'sw-KE-RafikiNeural-Male', 'sw-KE-ZuriNeural-Female', 'sw-TZ-DaudiNeural-Male', 'sw-TZ-RehemaNeural-Female', 'ta-IN-PallaviNeural-Female', 'ta-IN-ValluvarNeural-Male', 'ta-LK-KumarNeural-Male', 'ta-LK-SaranyaNeural-Female', 'ta-MY-KaniNeural-Female', 'ta-MY-SuryaNeural-Male', 'ta-SG-AnbuNeural-Male', 'ta-SG-VenbaNeural-Female', 'te-IN-MohanNeural-Male', 'te-IN-ShrutiNeural-Female', 'th-TH-NiwatNeural-Male', 'th-TH-PremwadeeNeural-Female', 'tr-TR-AhmetNeural-Male', 'tr-TR-EmelNeural-Female', 'uk-UA-OstapNeural-Male', 'uk-UA-PolinaNeural-Female', 'ur-IN-GulNeural-Female', 'ur-IN-SalmanNeural-Male', 'ur-PK-AsadNeural-Male', 'ur-PK-UzmaNeural-Female', 'uz-UZ-MadinaNeural-Female', 'uz-UZ-SardorNeural-Male', 'vi-VN-HoaiMyNeural-Female', 'vi-VN-NamMinhNeural-Male', 'zh-CN-XiaoxiaoNeural-Female', 'zh-CN-XiaoyiNeural-Female', 'zh-CN-YunjianNeural-Male', 'zh-CN-YunxiNeural-Male', 'zh-CN-YunxiaNeural-Male', 'zh-CN-YunyangNeural-Male', 'zh-CN-liaoning-XiaobeiNeural-Female', 'zh-CN-shaanxi-XiaoniNeural-Female']
+
+
+ def translate_from_video(video, WHISPER_MODEL_SIZE, batch_size, compute_type,
+                          TRANSLATE_AUDIO_TO, min_speakers, max_speakers,
+                          tts_voice00, tts_voice01, tts_voice02, tts_voice03, tts_voice04, tts_voice05):
+
+     YOUR_HF_TOKEN = os.getenv("My_hf_token")
+
+     OutputFile = 'Video.mp4'
+     audio_wav = "audio.wav"
+     Output_name_file = "audio_dub_solo.wav"
+     mix_audio = "audio_mix.mp3"
+     video_output = "diar_output.mp4"
+
+     os.system(f"rm {Output_name_file}")
+     os.system("rm Video.mp4")
+     #os.system("rm diar_output.mp4")
+     os.system("rm audio.wav")
+
+
+     if os.path.exists(video):
+         if device == 'cpu':
+             # limit to 10 seconds on CPU
+             print('10 s. Limited for CPU ')
+             os.system(f"ffmpeg -y -i {video} -ss 00:00:20 -t 00:00:10 -c:v libx264 -c:a aac -strict experimental Video.mp4")
+         else:
+             os.system(f"ffmpeg -y -i {video} -c:v libx264 -c:a aac -strict experimental Video.mp4")
+
+         os.system("ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav")
+     else:
+         if device == 'cpu':
+             # limit to 10 seconds on CPU
+             print('10 s. Limited for CPU ')
+             #https://github.com/yt-dlp/yt-dlp/issues/2220
+             mp4_ = f'yt-dlp -f "mp4" --downloader ffmpeg --downloader-args "ffmpeg_i: -ss 00:00:20 -t 00:00:10" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
+             wav_ = "ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 1 audio.wav"
+         else:
+             mp4_ = f'yt-dlp -f "mp4" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
+             wav_ = f'python -m yt_dlp --output {audio_wav} --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --extract-audio --audio-format wav {video}'
+
+         os.system(mp4_)
+         os.system(wav_)
+
+     print("Set file complete.")
+
+     # 1. Transcribe with original whisper (batched)
+     model = whisperx.load_model(
+         WHISPER_MODEL_SIZE,
+         device,
+         compute_type=compute_type
+         )
+     audio = whisperx.load_audio(audio_wav)
+     result = model.transcribe(audio, batch_size=batch_size)
+     gc.collect(); torch.cuda.empty_cache(); del model
+     print("Transcript complete")
+
+     # 2. Align whisper output
+     model_a, metadata = whisperx.load_align_model(
+         language_code=result["language"],
+         device=device
+         )
+     result = whisperx.align(
+         result["segments"],
+         model_a,
+         metadata,
+         audio,
+         device,
+         return_char_alignments=True,
+         )
+     gc.collect(); torch.cuda.empty_cache(); del model_a
+     print("Align complete")
+
+     # 3. Assign speaker labels
+     diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)
+     diarize_segments = diarize_model(
+         audio_wav,
+         min_speakers=min_speakers,
+         max_speakers=max_speakers)
+     result_diarize = whisperx.assign_word_speakers(diarize_segments, result)
+     gc.collect(); torch.cuda.empty_cache(); del diarize_model
+     print("Diarize complete")
+
+     result_diarize['segments'] = translate_text(result_diarize['segments'], TRANSLATE_AUDIO_TO)
+     print("Translation complete")
+
+     audio_files = []
+
+     # Mapping speakers to voice variables
+     speaker_to_voice = {
+         'SPEAKER_00': tts_voice00,
+         'SPEAKER_01': tts_voice01,
+         'SPEAKER_02': tts_voice02,
+         'SPEAKER_03': tts_voice03,
+         'SPEAKER_04': tts_voice04,
+         'SPEAKER_05': tts_voice05
+         }
+
+     for segment in result_diarize['segments']:
+
+         text = segment['text']
+         start = segment['start']
+         end = segment['end']
+
+         try:
+             speaker = segment['speaker']
+         except KeyError:
+             segment['speaker'] = "SPEAKER_99"
+             speaker = segment['speaker']
+             print("NO SPEAKER DETECT IN SEGMENT")
+
+         # make the tts audio
+         filename = f"audio/{start}.ogg"
+
+         if speaker in speaker_to_voice and speaker_to_voice[speaker] != 'None':
+             make_voice_gradio(text, speaker_to_voice[speaker], filename)
+         elif speaker == "SPEAKER_99":
+             try:
+                 tts = gTTS(text, lang=TRANSLATE_AUDIO_TO)
+                 tts.save(filename)
+                 print('Using GTTS')
+             except:
+                 tts = gTTS('a', lang=TRANSLATE_AUDIO_TO)
+                 tts.save(filename)
+                 print('ERROR AUDIO GTTS')
+
+         # duration
+         duration_true = end - start
+         duration_tts = librosa.get_duration(filename=filename)
+
+         # percentage: ratio of TTS duration to segment duration
+         porcentaje = duration_tts / duration_true
+
+         if porcentaje > 2.1:
+             porcentaje = 2.1
+         elif porcentaje <= 1.2 and porcentaje >= 0.8:
+             porcentaje = 1.0
+         elif porcentaje <= 0.79:
+             porcentaje = 0.8
+
+         # Smooth and round
+         porcentaje = round(porcentaje+0.0, 1)
+
+         # apply acceleration or slowdown to the audio file in the audio2 folder
+         os.system(f"ffmpeg -y -loglevel panic -i {filename} -filter:a atempo={porcentaje} audio2/{filename}")
+
+         duration_create = librosa.get_duration(filename=f"audio2/{filename}")
+         audio_files.append(filename)
+
+     # replace the original files with the tempo-adjusted ones
+     os.system("mv -f audio2/audio/*.ogg audio/")
+
+     os.system(f"rm {Output_name_file}")
+
+     create_translated_audio(result_diarize, audio_files, Output_name_file)
+
+     os.system("rm audio_dub_stereo.wav")
+     os.system("ffmpeg -i audio_dub_solo.wav -ac 1 audio_dub_stereo.wav")
+
+     #os.system(f"ffmpeg -i Video.mp4 -i {Output_name_file} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output}")
+
+     os.system(f"rm {mix_audio}")
+     #os.system(f'''ffmpeg -i {audio_wav} -i audio_dub_stereo.wav -filter_complex "[1:a]asplit=2[sc][mix];[0:a][sc]sidechaincompress=threshold=0.003:ratio=20[bg]; [bg][mix]amerge[final]" -map [final] {mix_audio}''')
+     #os.system(f'ffmpeg -y -i {audio_wav} -i audio_dub_stereo.wav -filter_complex "[0:0][1:0] amix=inputs=2:duration=longest" -c:a libmp3lame {mix_audio}')
+     os.system(f'ffmpeg -y -i audio.wav -i audio_dub_stereo.wav -filter_complex "[0:0]volume=0.25[a];[1:0]volume=1.85[b];[a][b]amix=inputs=2:duration=longest" -c:a libmp3lame {mix_audio}')
+
+     os.system(f"rm {video_output}")
+     os.system(f"ffmpeg -i {OutputFile} -i {mix_audio} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output}")
+
+     return video_output
+
+
+
+ import sys
+
+ class Logger:
+     def __init__(self, filename):
+         self.terminal = sys.stdout
+         self.log = open(filename, "w")
+
+     def write(self, message):
+         self.terminal.write(message)
+         self.log.write(message)
+
+     def flush(self):
+         self.terminal.flush()
+         self.log.flush()
+
+     def isatty(self):
+         return False
+
+ sys.stdout = Logger("output.log")
+
+ def read_logs():
+     sys.stdout.flush()
+     with open("output.log", "r") as f:
+         return f.read()
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown(title)
+     gr.Markdown(description)
+     gr.Markdown(tutorial)
+
+     with gr.Tab("Translate audio from video"):
+         with gr.Row():
+             with gr.Column():
+                 video_input = gr.Video() # height=300,width=300
+
+                 gr.Markdown("Select the target language, and make sure the voices you assign to the speakers correspond to the target language to avoid errors in the process.")
+                 TRANSLATE_AUDIO_TO = gr.inputs.Dropdown(['en', 'fr', 'de', 'es', 'it', 'ja', 'zh', 'nl', 'uk', 'pt'], default='en', label='Translate audio to')
+
+                 gr.Markdown("Select how many people are speaking in the video.")
+                 min_speakers = gr.inputs.Slider(1, 6, default=1, label="min_speakers", step=1)
+                 max_speakers = gr.inputs.Slider(1, 6, default=2, label="max_speakers", step=1)
+
+                 gr.Markdown("Select the voice you want for each speaker.")
+                 tts_voice00 = gr.inputs.Dropdown(list_tts, default='en-AU-WilliamNeural-Male', label='TTS Speaker 1')
+                 tts_voice01 = gr.inputs.Dropdown(list_tts, default='en-CA-ClaraNeural-Female', label='TTS Speaker 2')
+                 tts_voice02 = gr.inputs.Dropdown(list_tts, default='en-GB-ThomasNeural-Male', label='TTS Speaker 3')
+                 tts_voice03 = gr.inputs.Dropdown(list_tts, default='en-GB-SoniaNeural-Female', label='TTS Speaker 4')
+                 tts_voice04 = gr.inputs.Dropdown(list_tts, default='en-NZ-MitchellNeural-Male', label='TTS Speaker 5')
+                 tts_voice05 = gr.inputs.Dropdown(list_tts, default='en-GB-MaisieNeural-Female', label='TTS Speaker 6')
+
+                 gr.Markdown("Default configuration of Whisper.")
+                 WHISPER_MODEL_SIZE = gr.inputs.Dropdown(['tiny', 'base', 'small', 'medium', 'large-v1', 'large-v2'], default=whisper_model_default, label="Whisper model")
+                 batch_size = gr.inputs.Slider(1, 32, default=16, label="Batch size", step=1)
+                 compute_type = gr.inputs.Dropdown(list_compute_type, default=compute_type_default, label="Compute type")
+
+             with gr.Column(variant='compact'):
+                 with gr.Row():
+                     video_button = gr.Button("Translate audio of video")
+                 with gr.Row():
+                     video_output = gr.Video()
+
+
+         gr.Examples(
+             examples=[
+                 [
+                     "./assets/Video_subtitled.mp4",
+                     "base",
+                     16,
+                     "float32",
+                     "en",
+                     1,
+                     2,
+                     'en-AU-WilliamNeural-Male',
+                     'en-CA-ClaraNeural-Female',
+                     'en-GB-ThomasNeural-Male',
+                     'en-GB-SoniaNeural-Female',
+                     'en-NZ-MitchellNeural-Male',
+                     'en-GB-MaisieNeural-Female',
+                 ],
+             ],
+             fn=translate_from_video,
+             inputs=[
+                 video_input,
+                 WHISPER_MODEL_SIZE,
+                 batch_size,
+                 compute_type,
+                 TRANSLATE_AUDIO_TO,
+                 min_speakers,
+                 max_speakers,
+                 tts_voice00,
+                 tts_voice01,
+                 tts_voice02,
+                 tts_voice03,
+                 tts_voice04,
+                 tts_voice05,
+             ],
+             outputs=[video_output],
+             #cache_examples=True,
+         )
+
+
+     with gr.Tab("Translate audio from video link"):
+         with gr.Row():
+             with gr.Column():
+
+                 link_input = gr.Textbox(label="Media link. Example: www.youtube.com/watch?v=g_9rPvbENUw", placeholder="URL goes here...")
+                 #filename = gr.Textbox(label="File name", placeholder="best-vid")
+
+                 gr.Markdown("Select the target language, and make sure the voices you assign to the speakers correspond to the target language to avoid errors in the process.")
+                 bTRANSLATE_AUDIO_TO = gr.inputs.Dropdown(['en', 'fr', 'de', 'es', 'it', 'ja', 'zh', 'nl', 'uk', 'pt'], default='en', label='Translate audio to')
+
+                 gr.Markdown("Select how many people are speaking in the video.")
+                 bmin_speakers = gr.inputs.Slider(1, 6, default=1, label="min_speakers", step=1)
+                 bmax_speakers = gr.inputs.Slider(1, 6, default=2, label="max_speakers", step=1)
+
+                 gr.Markdown("Select the voice you want for each speaker.")
+                 btts_voice00 = gr.inputs.Dropdown(list_tts, default='en-AU-WilliamNeural-Male', label='TTS Speaker 1')
+                 btts_voice01 = gr.inputs.Dropdown(list_tts, default='en-CA-ClaraNeural-Female', label='TTS Speaker 2')
+                 btts_voice02 = gr.inputs.Dropdown(list_tts, default='en-GB-ThomasNeural-Male', label='TTS Speaker 3')
+                 btts_voice03 = gr.inputs.Dropdown(list_tts, default='en-GB-SoniaNeural-Female', label='TTS Speaker 4')
+                 btts_voice04 = gr.inputs.Dropdown(list_tts, default='en-NZ-MitchellNeural-Male', label='TTS Speaker 5')
+                 btts_voice05 = gr.inputs.Dropdown(list_tts, default='en-GB-MaisieNeural-Female', label='TTS Speaker 6')
+
+                 gr.Markdown("Default configuration of Whisper.")
+                 bWHISPER_MODEL_SIZE = gr.inputs.Dropdown(['tiny', 'base', 'small', 'medium', 'large-v1', 'large-v2'], default=whisper_model_default, label="Whisper model")
+                 bbatch_size = gr.inputs.Slider(1, 32, default=16, label="Batch size", step=1)
+                 bcompute_type = gr.inputs.Dropdown(list_compute_type, default=compute_type_default, label="Compute type")
+
+                 # text_button = gr.Button("Translate audio of video")
+                 # link_output = gr.Video() #gr.outputs.File(label="Download!")
+
+
+
+             with gr.Column(variant='compact'):
+                 with gr.Row():
+                     text_button = gr.Button("Translate audio of video")
+                 with gr.Row():
+                     link_output = gr.Video() #gr.outputs.File(label="Download!") # gr.Video()
+
+         gr.Examples(
+             examples=[
+                 [
+                     "https://www.youtube.com/watch?v=5ZeHtRKHl7Y",
+                     "base",
+                     16,
+                     "float32",
+                     "en",
+                     1,
+                     2,
+                     'en-CA-ClaraNeural-Female',
+                     'en-AU-WilliamNeural-Male',
+                     'en-GB-ThomasNeural-Male',
+                     'en-GB-SoniaNeural-Female',
+                     'en-NZ-MitchellNeural-Male',
+                     'en-GB-MaisieNeural-Female',
+                 ],
+             ],
+             fn=translate_from_video,
+             inputs=[
+                 link_input,
+                 bWHISPER_MODEL_SIZE,
+                 bbatch_size,
+                 bcompute_type,
+                 bTRANSLATE_AUDIO_TO,
+                 bmin_speakers,
+                 bmax_speakers,
+                 btts_voice00,
+                 btts_voice01,
+                 btts_voice02,
+                 btts_voice03,
+                 btts_voice04,
+                 btts_voice05,
+             ],
+             outputs=[link_output],
+             #cache_examples=True,
+         )
+
+
+
+     with gr.Accordion("Logs"):
+         logs = gr.Textbox()
+         demo.load(read_logs, None, logs, every=1)
+
+     # run
+     video_button.click(translate_from_video, inputs=[
+         video_input,
+         WHISPER_MODEL_SIZE,
+         batch_size,
+         compute_type,
+         TRANSLATE_AUDIO_TO,
+         min_speakers,
+         max_speakers,
+         tts_voice00,
+         tts_voice01,
+         tts_voice02,
+         tts_voice03,
+         tts_voice04,
+         tts_voice05,], outputs=video_output)
+     text_button.click(translate_from_video, inputs=[
+         link_input,
+         bWHISPER_MODEL_SIZE,
+         bbatch_size,
+         bcompute_type,
+         bTRANSLATE_AUDIO_TO,
+         bmin_speakers,
+         bmax_speakers,
+         btts_voice00,
+         btts_voice01,
+         btts_voice02,
+         btts_voice03,
+         btts_voice04,
+         btts_voice05,], outputs=link_output)
+
+
+ demo.launch(enable_queue=True)
+
+
+
+
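For reference, a hedged sketch of driving the pipeline without the UI, using the same argument order as the `video_button.click` wiring above. It assumes `translate_from_video` is available in the session (note that importing `app.py` also starts the Gradio app, so this is illustrative only), a clip at `./assets/Video_subtitled.mp4` as in the bundled example, and a diarization token in the `My_hf_token` environment variable.

```python
# Sketch only (not part of the commit): run the pipeline without the UI,
# mirroring the argument order used by video_button.click above.
from app import translate_from_video  # importing app.py also launches the demo

out_path = translate_from_video(
    "./assets/Video_subtitled.mp4",   # video file (path used by the bundled example)
    "base",                           # WHISPER_MODEL_SIZE
    16,                               # batch_size
    "float32",                        # compute_type
    "en",                             # TRANSLATE_AUDIO_TO
    1,                                # min_speakers
    2,                                # max_speakers
    "en-AU-WilliamNeural-Male",       # tts_voice00
    "en-CA-ClaraNeural-Female",       # tts_voice01
    "en-GB-ThomasNeural-Male",        # tts_voice02
    "en-GB-SoniaNeural-Female",       # tts_voice03
    "en-NZ-MitchellNeural-Male",      # tts_voice04
    "en-GB-MaisieNeural-Female",      # tts_voice05
)
print(out_path)  # "diar_output.mp4" on success
```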
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ torch
+ torchvision
+ git+https://github.com/m-bain/whisperx.git
+ yt-dlp
+ gTTS
+ pydub
+ edge_tts
+ deep_translator
+ torchaudio==2.0.0
+ gradio
+ nest_asyncio
soni_translate/audio_segments.py ADDED
@@ -0,0 +1,27 @@
+ from pydub import AudioSegment
+ from tqdm import tqdm
+ import os
+
+ def create_translated_audio(result_diarize, audio_files, Output_name_file):
+     total_duration = result_diarize['segments'][-1]['end'] # in seconds
+
+     # silent audio with total_duration
+     combined_audio = AudioSegment.silent(duration=int(total_duration * 1000))
+     print(round((total_duration / 60), 2), 'minutes of video')
+
+     for line, audio_file in tqdm(zip(result_diarize['segments'], audio_files)):
+         start = float(line['start'])
+
+         # Overlay each audio at the corresponding time
+         try:
+             audio = AudioSegment.from_file(audio_file)
+             ###audio_a = audio.speedup(playback_speed=1.5)
+             start_time = start * 1000 # to ms
+             combined_audio = combined_audio.overlay(audio, position=start_time)
+         except:
+             print(f'ERROR AUDIO FILE {audio_file}')
+
+     os.system("rm -rf audio/*")
+
+     # combined audio as a file
+     combined_audio.export(Output_name_file, format="wav")
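A minimal sketch (not part of the commit) of exercising create_translated_audio on its own. It assumes ffmpeg is installed, two hypothetical TTS clips already exist under audio/, and only the 'start'/'end' keys of each segment are read; note the function clears audio/ when it finishes.

```python
# Minimal sketch: overlay two hypothetical clips onto a silent track.
from soni_translate.audio_segments import create_translated_audio

# Fake diarization result: only 'start' and 'end' (seconds) matter here.
result_diarize = {
    'segments': [
        {'start': 0.0, 'end': 2.5, 'text': 'Hello', 'speaker': 'SPEAKER_00'},
        {'start': 3.0, 'end': 5.0, 'text': 'World', 'speaker': 'SPEAKER_01'},
    ]
}
# Paths to already-generated TTS clips (hypothetical file names).
audio_files = ['audio/0.0.ogg', 'audio/3.0.ogg']

# Writes the combined dub track; also removes everything under audio/.
create_translated_audio(result_diarize, audio_files, 'audio_dub_solo.wav')
```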
soni_translate/text_to_speech.py ADDED
@@ -0,0 +1,30 @@
+ from gtts import gTTS
+ import edge_tts
+ import asyncio
+ import nest_asyncio
+
+ def make_voice(tts_text, tts_voice, filename):
+     # language code for the gTTS fallback; assumption: use the voice name prefix (e.g. 'en-GB-...' -> 'en')
+     lang = tts_voice.split('-')[0]
+     try:
+         nest_asyncio.apply()
+         asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save(filename))
+     except Exception:
+         try:
+             tts = gTTS(tts_text, lang=lang)
+             tts.save(filename)
+             print('USE GTTS')
+         except Exception:
+             tts = gTTS('a', lang=lang)
+             tts.save(filename)
+             print('REPLACE AUDIO GTTS')
+
+ def make_voice_gradio(tts_text, tts_voice, filename):
+     print(tts_text, filename)
+     lang = tts_voice.split('-')[0]
+     try:
+         asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save(filename))
+     except Exception:
+         try:
+             tts = gTTS(tts_text, lang=lang)
+             tts.save(filename)
+             print('USE GTTS')
+         except Exception:
+             tts = gTTS('a', lang=lang)
+             tts.save(filename)
+             print('REPLACE AUDIO GTTS')
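As a quick standalone check (not part of the commit), the edge-tts path can be exercised directly. It assumes the audio/ folder exists and network access is available; the voice name is one of the entries in list_tts from app.py.

```python
# Quick check: synthesize one line with make_voice_gradio.
from soni_translate.text_to_speech import make_voice_gradio

make_voice_gradio(
    "This is a short synthesis test.",
    "en-GB-SoniaNeural-Female",   # taken from list_tts in app.py
    "audio/test.ogg",             # writes the clip; falls back to gTTS on failure
)
```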
soni_translate/translate_segments.py ADDED
@@ -0,0 +1,10 @@
+ from tqdm import tqdm
+ from deep_translator import GoogleTranslator
+
+ def translate_text(segments, TRANSLATE_AUDIO_TO):
+     for line in tqdm(range(len(segments))):
+         text = segments[line]['text']
+         translator = GoogleTranslator(source='auto', target=TRANSLATE_AUDIO_TO)
+         translated_line = translator.translate(text.strip())
+         segments[line]['text'] = translated_line
+     return segments
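A short standalone sketch (not part of the commit) of translate_text. It assumes network access, since deep_translator calls Google Translate; only the 'text' field of each segment is read and rewritten in place, and 'es' is one of the language codes offered in app.py.

```python
# Sketch: translate the text of two segments to Spanish.
from soni_translate.translate_segments import translate_text

segments = [
    {'start': 0.0, 'end': 2.5, 'text': 'Hello, how are you?'},
    {'start': 3.0, 'end': 5.0, 'text': 'This is a test.'},
]
segments = translate_text(segments, 'es')  # rewrites each segment's 'text'
print([s['text'] for s in segments])
```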