# SoniTranslate / soni_translate/video_dubbing.py
import numpy as np
import gradio as gr
import whisperx
import torch
from gtts import gTTS
import librosa
import edge_tts
import gc
from pydub import AudioSegment
from tqdm import tqdm
from deep_translator import GoogleTranslator
import os
from soni_translate.audio_segments import create_translated_audio
from soni_translate.text_to_speech import make_voice
from soni_translate.translate_segments import translate_text
import time
def translate_from_video(
video,
YOUR_HF_TOKEN,
preview=False,
WHISPER_MODEL_SIZE="large-v1",
batch_size=16,
compute_type="float16",
    SOURCE_LANGUAGE="Automatic detection",
TRANSLATE_AUDIO_TO="en",
min_speakers=1,
max_speakers=2,
tts_voice00="en-AU-WilliamNeural-Male",
tts_voice01="en-CA-ClaraNeural-Female",
tts_voice02="en-GB-ThomasNeural-Male",
tts_voice03="en-GB-SoniaNeural-Female",
tts_voice04="en-NZ-MitchellNeural-Male",
tts_voice05="en-GB-MaisieNeural-Female",
video_output="video_dub.mp4"
):
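    """
    Dub a video into another language.

    Pipeline: fetch or re-encode the input video, extract its audio,
    transcribe it with whisperx, align the transcript, diarize speakers,
    translate each segment, synthesize per-speaker TTS audio, time-stretch
    the clips to fit the original timing, and finally mix and mux the
    dubbed track back into the video. Returns the path of the dubbed video.
    """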
    if YOUR_HF_TOKEN == "" or YOUR_HF_TOKEN is None:
YOUR_HF_TOKEN = os.getenv("YOUR_HF_TOKEN")
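    # Working directories: audio/ holds the raw TTS clips, audio2/audio/ their time-stretched copies.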
if not os.path.exists('audio'):
os.makedirs('audio')
if not os.path.exists('audio2/audio'):
os.makedirs('audio2/audio')
# Check GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
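    # float16 is not supported for CPU inference, so fall back to float32 there.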
compute_type = "float32" if device == "cpu" else compute_type
OutputFile = 'Video.mp4'
audio_wav = "audio.wav"
Output_name_file = "audio_dub_solo.ogg"
mix_audio = "audio_mix.mp3"
os.system("rm Video.mp4")
os.system("rm audio.webm")
os.system("rm audio.wav")
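    # A local path is re-encoded with ffmpeg; anything else is treated as a URL
    # and fetched with yt-dlp. Either way the result is Video.mp4 plus audio.wav.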
if os.path.exists(video):
if preview:
print('Creating preview video, 10 seconds')
os.system(f'ffmpeg -y -i "{video}" -ss 00:00:20 -t 00:00:10 -c:v libx264 -c:a aac -strict experimental Video.mp4')
else:
os.system(f'ffmpeg -y -i "{video}" -c:v libx264 -c:a aac -strict experimental Video.mp4')
os.system("ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav")
else:
if preview:
print('Creating preview from link, 10 seconds')
#https://github.com/yt-dlp/yt-dlp/issues/2220
mp4_ = f'yt-dlp -f "mp4" --downloader ffmpeg --downloader-args "ffmpeg_i: -ss 00:00:20 -t 00:00:10" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
wav_ = "ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav"
os.system(mp4_)
os.system(wav_)
else:
mp4_ = f'yt-dlp -f "mp4" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
wav_ = f'python -m yt_dlp --output {audio_wav} --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --extract-audio --audio-format wav {video}'
os.system(wav_)
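            # yt-dlp extracts the audio in the background (audio.webm -> audio.wav);
            # poll for up to 120 seconds until the wav exists, then download the video.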
            for i in range(120):
time.sleep(1)
print('process audio')
if os.path.exists(audio_wav) and not os.path.exists('audio.webm'):
time.sleep(1)
os.system(mp4_)
break
if i == 119:
                    print('Error downloading the audio')
return
    print("File preparation complete.")
SOURCE_LANGUAGE = None if SOURCE_LANGUAGE == 'Automatic detection' else SOURCE_LANGUAGE
# 1. Transcribe with original whisper (batched)
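    # language=None lets whisperx auto-detect the source language.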
model = whisperx.load_model(
WHISPER_MODEL_SIZE,
device,
compute_type=compute_type,
        language=SOURCE_LANGUAGE,
)
audio = whisperx.load_audio(audio_wav)
result = model.transcribe(audio, batch_size=batch_size)
    del model; gc.collect(); torch.cuda.empty_cache()  # release the ASR model before clearing the CUDA cache
print("Transcript complete")
# 2. Align whisper output
model_a, metadata = whisperx.load_align_model(
language_code=result["language"],
device=device
)
result = whisperx.align(
result["segments"],
model_a,
metadata,
audio,
device,
return_char_alignments=True,
)
    del model_a; gc.collect(); torch.cuda.empty_cache()  # release the alignment model before clearing the CUDA cache
print("Align complete")
if result['segments'] == []:
print('No active speech found in audio')
return
# 3. Assign speaker labels
diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)
diarize_segments = diarize_model(
audio_wav,
min_speakers=min_speakers,
max_speakers=max_speakers)
result_diarize = whisperx.assign_word_speakers(diarize_segments, result)
    del diarize_model; gc.collect(); torch.cuda.empty_cache()  # release the diarization pipeline before clearing the CUDA cache
print("Diarize complete")
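    # 4. Translate the transcribed segments into the target language.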
result_diarize['segments'] = translate_text(result_diarize['segments'], TRANSLATE_AUDIO_TO)
print("Translation complete")
audio_files = []
# Mapping speakers to voice variables
speaker_to_voice = {
'SPEAKER_00': tts_voice00,
'SPEAKER_01': tts_voice01,
'SPEAKER_02': tts_voice02,
'SPEAKER_03': tts_voice03,
'SPEAKER_04': tts_voice04,
'SPEAKER_05': tts_voice05
}
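    # 5. Per-segment TTS: each diarized speaker keeps its assigned voice; segments
    # without a speaker label ("SPEAKER_99") fall back to gTTS, and a failed gTTS
    # call is replaced by a short placeholder clip so the pipeline can continue.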
for segment in tqdm(result_diarize['segments']):
text = segment['text']
start = segment['start']
end = segment['end']
try:
speaker = segment['speaker']
except KeyError:
segment['speaker'] = "SPEAKER_99"
speaker = segment['speaker']
            print("NO SPEAKER DETECTED IN SEGMENT")
# make the tts audio
filename = f"audio/{start}.ogg"
if speaker in speaker_to_voice and speaker_to_voice[speaker] != 'None':
make_voice(text, speaker_to_voice[speaker], filename, TRANSLATE_AUDIO_TO)
elif speaker == "SPEAKER_99":
try:
tts = gTTS(text, lang=TRANSLATE_AUDIO_TO)
tts.save(filename)
print('Using GTTS')
            except Exception:
tts = gTTS('a', lang=TRANSLATE_AUDIO_TO)
tts.save(filename)
print('Error: Audio will be replaced.')
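        # Time-stretch the TTS clip so it roughly fits the original segment: the
        # ratio of TTS duration to segment duration drives ffmpeg's atempo filter,
        # clamped to a sane range (values near 1 snap to 1.0) to avoid extreme speed changes.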
# duration
duration_true = end - start
duration_tts = librosa.get_duration(filename=filename)
        # speed ratio: TTS duration relative to the original segment duration
porcentaje = duration_tts / duration_true
if porcentaje > 2.1:
porcentaje = 2.1
elif porcentaje <= 1.2 and porcentaje >= 0.8:
porcentaje = 1.0
elif porcentaje <= 0.79:
porcentaje = 0.8
        # Smooth and round
        porcentaje = round(porcentaje, 1)
        # apply acceleration (or slow-down) to the audio file in the audio2 folder
os.system(f"ffmpeg -y -loglevel panic -i {filename} -filter:a atempo={porcentaje} audio2/{filename}")
duration_create = librosa.get_duration(filename=f"audio2/{filename}")
audio_files.append(filename)
    # replace the original TTS files with the accelerated versions
os.system("mv -f audio2/audio/*.ogg audio/")
os.system(f"rm {Output_name_file}")
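    # 6. Assemble the dubbed track with create_translated_audio, which builds a
    # single audio file (Output_name_file) from the per-segment clips and their timestamps.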
create_translated_audio(result_diarize, audio_files, Output_name_file)
os.system(f"rm {mix_audio}")
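    # 7. Duck the original audio under the dub: the dubbed track keys a sidechain
    # compressor applied to the original audio, then both are merged.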
os.system(f'ffmpeg -i {audio_wav} -i {Output_name_file} -filter_complex "[1:a]asplit=2[sc][mix];[0:a][sc]sidechaincompress=threshold=0.003:ratio=20[bg]; [bg][mix]amerge[final]" -map [final] {mix_audio}')
os.system(f"rm {video_output}")
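    # 8. Mux the mixed audio with the original video stream (no re-encoding).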
os.system(f"ffmpeg -i {OutputFile} -i {mix_audio} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output}")
return video_output
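

# Minimal usage sketch (illustrative; not part of the original module). The input
# path and settings below are hypothetical. When YOUR_HF_TOKEN is empty, the
# function falls back to the YOUR_HF_TOKEN environment variable, so exporting it
# beforehand is assumed here.
if __name__ == "__main__":
    dubbed_path = translate_from_video(
        "my_video.mp4",              # local file, or a URL handled via yt-dlp
        YOUR_HF_TOKEN="",            # empty -> read from the environment
        preview=True,                # dub only a 10-second excerpt
        TRANSLATE_AUDIO_TO="en",
        min_speakers=1,
        max_speakers=2,
    )
    print("Dubbed video written to:", dubbed_path)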