import gc
import os
import subprocess
import time

import edge_tts
import gradio as gr
import librosa
import numpy as np
import torch
import whisperx
from deep_translator import GoogleTranslator
from gtts import gTTS
from pydub import AudioSegment
from tqdm import tqdm

from soni_translate.audio_segments import create_translated_audio
from soni_translate.text_to_speech import make_voice
from soni_translate.translate_segments import translate_text


def _run_command(args):
    """Run an external program without a shell and return its exit status.

    The argument list is handed to the program verbatim, so file names and
    URLs containing spaces or shell metacharacters are never interpreted as
    shell syntax. Failures are reported through the return value and never
    raised, which keeps every pipeline step best-effort.
    """
    try:
        return subprocess.run(args, check=False).returncode
    except OSError as error:
        # The executable is missing or not runnable; report it like a shell
        # would ("command not found") instead of aborting the pipeline.
        print(f"Could not run {args[0]}: {error}")
        return 127


def _remove_if_exists(path):
    """Delete *path*; a file that is already absent is not an error."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass
    except OSError as error:
        print(f"Could not remove {path}: {error}")


def _tempo_ratio(duration_tts, duration_true):
    """Return the ``atempo`` factor that fits a TTS clip into its time slot.

    The raw ratio ``duration_tts / duration_true`` is capped at 2.1, snapped
    to 1.0 when it lies within [0.8, 1.2], raised to 0.8 when it is 0.79 or
    lower, and finally rounded to one decimal place.
    """
    if duration_true == 0:
        # An empty slot would divide by zero; the limiting case of an
        # ever-shorter slot is the maximum speed-up.
        return 2.1
    ratio = duration_tts / duration_true
    if ratio > 2.1:
        ratio = 2.1
    elif 0.8 <= ratio <= 1.2:
        ratio = 1.0
    elif ratio <= 0.79:
        ratio = 0.8
    return round(ratio, 1)


def _prepare_media(video, preview, video_path, audio_path):
    """Create the working video and WAV files from a local file or a link.

    When *video* exists on disk it is transcoded with ffmpeg; otherwise it is
    handed to yt-dlp as a link. With *preview* set, only the 10 seconds
    starting at 00:00:20 are kept.

    Returns False only when the full-length audio download did not produce
    *audio_path* within the 120-second polling window; True otherwise. The
    exit status of the individual commands is not checked.
    """
    extract_wav = [
        "ffmpeg", "-y", "-i", video_path,
        "-vn", "-acodec", "pcm_s16le", "-ar", "44100", "-ac", "2",
        audio_path,
    ]

    if os.path.exists(video):
        transcode = ["ffmpeg", "-y", "-i", video]
        if preview:
            print('Creating preview video, 10 seconds')
            transcode += ["-ss", "00:00:20", "-t", "00:00:10"]
        transcode += [
            "-c:v", "libx264", "-c:a", "aac", "-strict", "experimental",
            video_path,
        ]
        _run_command(transcode)
        _run_command(extract_wav)
        return True

    common_flags = [
        "--force-overwrites", "--max-downloads", "1", "--no-warnings",
        "--no-abort-on-error", "--ignore-no-formats-error",
    ]
    download_video = ["yt-dlp", "-f", "mp4"]
    if preview:
        print('Creating preview from link, 10 seconds')
        download_video += [
            "--downloader", "ffmpeg",
            "--downloader-args", "ffmpeg_i: -ss 00:00:20 -t 00:00:10",
        ]
    download_video += common_flags + ["--restrict-filenames", "-o", video_path, video]

    if preview:
        _run_command(download_video)
        _run_command(extract_wav)
        return True

    download_audio = (
        ["python", "-m", "yt_dlp", "--output", audio_path]
        + common_flags
        + ["--extract-audio", "--audio-format", "wav", video]
    )
    _run_command(download_audio)

    # Wait until the WAV exists and the intermediate audio.webm is gone
    # before fetching the video stream.
    for _ in range(120):
        time.sleep(1)
        print('process audio')
        if os.path.exists(audio_path) and not os.path.exists('audio.webm'):
            time.sleep(1)
            _run_command(download_video)
            return True

    print('Error donwloading the audio')
    return False


def translate_from_video(
    video,
    preview=False,
    WHISPER_MODEL_SIZE="large-v1",
    batch_size=16,
    compute_type="float16",
    SOURCE_LANGUAGE="Automatic detection",
    min_speakers=1,
    max_speakers=2,
    tts_voice00="en-AU-WilliamNeural-Male",
    tts_voice01="en-CA-ClaraNeural-Female",
    tts_voice02="en-GB-ThomasNeural-Male",
    tts_voice03="en-GB-SoniaNeural-Female",
    tts_voice04="en-NZ-MitchellNeural-Male",
    tts_voice05="en-GB-MaisieNeural-Female",
    video_output="video_dub.mp4",
    YOUR_HF_TOKEN=None,
    TRANSLATE_AUDIO_TO="en",
):
    """Dub a video into another language and return the output file path.

    Pipeline: obtain the video and its audio track, transcribe and align it
    with WhisperX, diarize the speakers, translate every segment, synthesize
    one speech clip per segment, fit each clip to its original time slot,
    mix the dubbed track over the original audio and mux it with the video.

    Working files (``Video.mp4``, ``audio.wav``, ``audio_dub_solo.ogg``,
    ``audio_mix.mp3``) and the ``audio/`` and ``audio2/audio/`` directories
    are created in the current working directory.

    Args:
        video: Path of a local video file; if no such file exists the value
            is passed to yt-dlp as a link.
        preview: When true, only the 10 seconds starting at 00:00:20 are
            processed.
        WHISPER_MODEL_SIZE: Model name given to ``whisperx.load_model``.
        batch_size: Batch size forwarded to ``model.transcribe``.
        compute_type: Compute type for the WhisperX model; replaced by
            ``"float32"`` when no CUDA device is available.
        SOURCE_LANGUAGE: Language given to ``whisperx.load_model``;
            ``"Automatic detection"`` is translated to ``None``.
        min_speakers: Forwarded to the diarization pipeline.
        max_speakers: Forwarded to the diarization pipeline.
        tts_voice00 .. tts_voice05: Voice passed to ``make_voice`` for
            ``SPEAKER_00`` .. ``SPEAKER_05``. The string ``'None'`` marks a
            speaker without a configured voice.
        video_output: Path of the dubbed video to write.
        YOUR_HF_TOKEN: Token passed as ``use_auth_token`` to the diarization
            pipeline. When empty or ``None`` the ``YOUR_HF_TOKEN``
            environment variable is used instead.
        TRANSLATE_AUDIO_TO: Target language, passed to ``translate_text``,
            ``make_voice`` and as ``lang`` to gTTS.
            NOTE(review): the ``"en"`` default was chosen to match the
            English default voices - confirm against the caller.

    Returns:
        ``video_output``, or ``None`` when the audio download timed out or
        the alignment step produced no segments.
    """
    if not YOUR_HF_TOKEN:
        YOUR_HF_TOKEN = os.getenv("YOUR_HF_TOKEN")

    os.makedirs('audio', exist_ok=True)
    os.makedirs('audio2/audio', exist_ok=True)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device == "cpu":
        compute_type = "float32"

    working_video = 'Video.mp4'
    audio_wav = "audio.wav"
    dubbed_track = "audio_dub_solo.ogg"
    mix_audio = "audio_mix.mp3"

    # Clear leftovers of a previous run so the download polling below cannot
    # be satisfied by stale files.
    for leftover in (working_video, "audio.webm", audio_wav):
        _remove_if_exists(leftover)

    if not _prepare_media(video, preview, working_video, audio_wav):
        return None
    print("Set file complete.")

    SOURCE_LANGUAGE = None if SOURCE_LANGUAGE == 'Automatic detection' else SOURCE_LANGUAGE

    # 1. Transcribe. The model name comes first, then the device.
    model = whisperx.load_model(
        WHISPER_MODEL_SIZE,
        device,
        compute_type=compute_type,
        language=SOURCE_LANGUAGE,
    )
    audio = whisperx.load_audio(audio_wav)
    result = model.transcribe(audio, batch_size=batch_size)
    # Drop the reference first; collecting while the name is still bound
    # cannot release the model.
    del model
    gc.collect()
    torch.cuda.empty_cache()
    print("Transcript complete")

    # 2. Align the transcript with the audio.
    model_a, metadata = whisperx.load_align_model(
        language_code=result["language"],
        device=device,
    )
    result = whisperx.align(
        result["segments"],
        model_a,
        metadata,
        audio,
        device,
        return_char_alignments=True,
    )
    del model_a
    gc.collect()
    torch.cuda.empty_cache()
    print("Align complete")

    if not result['segments']:
        print('No active speech found in audio')
        return None

    # 3. Assign speaker labels.
    diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)
    diarize_segments = diarize_model(
        audio_wav,
        min_speakers=min_speakers,
        max_speakers=max_speakers,
    )
    result_diarize = whisperx.assign_word_speakers(diarize_segments, result)
    del diarize_model
    gc.collect()
    torch.cuda.empty_cache()
    print("Diarize complete")

    # 4. Translate.
    result_diarize['segments'] = translate_text(result_diarize['segments'], TRANSLATE_AUDIO_TO)
    print("Translation complete")

    # 5. Synthesize one clip per segment and fit it to the segment length.
    speaker_to_voice = {
        'SPEAKER_00': tts_voice00,
        'SPEAKER_01': tts_voice01,
        'SPEAKER_02': tts_voice02,
        'SPEAKER_03': tts_voice03,
        'SPEAKER_04': tts_voice04,
        'SPEAKER_05': tts_voice05,
    }
    audio_files = []

    for segment in tqdm(result_diarize['segments']):
        text = segment['text']
        start = segment['start']
        end = segment['end']
        # Segments without a diarization label get a placeholder speaker,
        # which is also stored back on the segment.
        speaker = segment.setdefault('speaker', "SPEAKER_99")

        filename = f"audio/{start}.ogg"
        voice = speaker_to_voice.get(speaker, 'None')
        if voice != 'None':
            make_voice(text, voice, filename, TRANSLATE_AUDIO_TO)
        else:
            # Any speaker without a configured voice falls back to gTTS so
            # that a clip always exists for the steps below.
            try:
                gTTS(text, lang=TRANSLATE_AUDIO_TO).save(filename)
                print('Using GTTS')
            except Exception:
                gTTS('a', lang=TRANSLATE_AUDIO_TO).save(filename)
                print('Error: Audio will be replaced.')

        duration_tts = librosa.get_duration(filename=filename)
        tempo = _tempo_ratio(duration_tts, end - start)

        # The tempo-adjusted copy is written under audio2/ and swapped in
        # after the loop.
        status = _run_command([
            "ffmpeg", "-y", "-loglevel", "panic",
            "-i", filename,
            "-filter:a", f"atempo={tempo}",
            f"audio2/{filename}",
        ])
        if status != 0:
            print(f"Warning: tempo adjustment failed for {filename}; keeping the original clip")
        audio_files.append(filename)

    # Replace the raw clips with their tempo-adjusted versions.
    adjusted_dir = 'audio2/audio'
    for name in os.listdir(adjusted_dir):
        if name.endswith('.ogg') and not name.startswith('.'):
            os.replace(os.path.join(adjusted_dir, name), os.path.join('audio', name))

    # 6. Assemble the dubbed track.
    _remove_if_exists(dubbed_track)
    create_translated_audio(result_diarize, audio_files, dubbed_track)

    # 7. Mix the dub over the original audio, using the dub as the sidechain
    # input of the compressor applied to the original track.
    _remove_if_exists(mix_audio)
    _run_command([
        "ffmpeg", "-i", audio_wav, "-i", dubbed_track,
        "-filter_complex",
        "[1:a]asplit=2[sc][mix];[0:a][sc]sidechaincompress=threshold=0.003:ratio=20[bg]; [bg][mix]amerge[final]",
        "-map", "[final]",
        mix_audio,
    ])

    # 8. Mux the mixed audio with the untouched video stream.
    _remove_if_exists(video_output)
    _run_command([
        "ffmpeg", "-i", working_video, "-i", mix_audio,
        "-c:v", "copy", "-c:a", "copy",
        "-map", "0:v", "-map", "1:a",
        "-shortest", video_output,
    ])

    return video_output