import numpy as np
import gradio as gr
import whisperx
import torch
from gtts import gTTS
import librosa
import edge_tts
import gc
from pydub import AudioSegment
from tqdm import tqdm
from deep_translator import GoogleTranslator
import os
from soni_translate.audio_segments import create_translated_audio
from soni_translate.text_to_speech import make_voice
from soni_translate.translate_segments import translate_text
import time


def translate_from_video(
    video,
    YOUR_HF_TOKEN,
    preview=False,
    WHISPER_MODEL_SIZE="large-v1",
    batch_size=16,
    compute_type="float16",
    SOURCE_LANGUAGE="Automatic detection",
    TRANSLATE_AUDIO_TO="en",
    min_speakers=1,
    max_speakers=2,
    tts_voice00="en-AU-WilliamNeural-Male",
    tts_voice01="en-CA-ClaraNeural-Female",
    tts_voice02="en-GB-ThomasNeural-Male",
    tts_voice03="en-GB-SoniaNeural-Female",
    tts_voice04="en-NZ-MitchellNeural-Male",
    tts_voice05="en-GB-MaisieNeural-Female",
    video_output="video_dub.mp4"
):
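    """
    Dub a video into the target language: get the source video (local file or
    URL), extract its audio, transcribe and align it with WhisperX, diarize
    speakers, translate the segments, synthesize per-speaker TTS, fit each
    clip to its original timing, and mux the dubbed track back into the video.
    Returns the path of the dubbed video.
    """
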
    if YOUR_HF_TOKEN == "" or YOUR_HF_TOKEN is None:
        YOUR_HF_TOKEN = os.getenv("YOUR_HF_TOKEN")

    if not os.path.exists('audio'):
        os.makedirs('audio')

    if not os.path.exists('audio2/audio'):
        os.makedirs('audio2/audio')

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # float16 is only supported on GPU; fall back to float32 on CPU.
    compute_type = "float32" if device == "cpu" else compute_type
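
    # Working file names; remove leftovers from a previous run.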
    OutputFile = 'Video.mp4'
    audio_wav = "audio.wav"
    Output_name_file = "audio_dub_solo.ogg"
    mix_audio = "audio_mix.mp3"

    os.system("rm Video.mp4")
    os.system("rm audio.webm")
    os.system("rm audio.wav")
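
    # Get the source video: re-encode a local file with ffmpeg, or download it
    # from the URL with yt-dlp, then extract a 44.1 kHz stereo WAV to process.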
    if os.path.exists(video):
        if preview:
            print('Creating preview video, 10 seconds')
            os.system(f'ffmpeg -y -i "{video}" -ss 00:00:20 -t 00:00:10 -c:v libx264 -c:a aac -strict experimental Video.mp4')
        else:
            os.system(f'ffmpeg -y -i "{video}" -c:v libx264 -c:a aac -strict experimental Video.mp4')

        os.system("ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav")
    else:
        if preview:
            print('Creating preview from link, 10 seconds')

            mp4_ = f'yt-dlp -f "mp4" --downloader ffmpeg --downloader-args "ffmpeg_i: -ss 00:00:20 -t 00:00:10" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
            wav_ = "ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav"
            os.system(mp4_)
            os.system(wav_)
        else:
            mp4_ = f'yt-dlp -f "mp4" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
            wav_ = f'python -m yt_dlp --output {audio_wav} --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --extract-audio --audio-format wav {video}'

            os.system(wav_)

            # Wait up to 120 seconds for the audio extraction, then fetch the video.
            for i in range(120):
                time.sleep(1)
                print('Processing audio')
                if os.path.exists(audio_wav) and not os.path.exists('audio.webm'):
                    time.sleep(1)
                    os.system(mp4_)
                    break
                if i == 119:
                    print('Error downloading the audio')
                    return

    print("Set file complete.")

    # The UI sentinel 'Automatic detection' maps to None so WhisperX detects the language.
    SOURCE_LANGUAGE = None if SOURCE_LANGUAGE == 'Automatic detection' else SOURCE_LANGUAGE
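
    # 1. Transcription: batched WhisperX inference with the chosen Whisper model.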
    model = whisperx.load_model(
        WHISPER_MODEL_SIZE,
        device,
        compute_type=compute_type,
        language=SOURCE_LANGUAGE,
    )
    audio = whisperx.load_audio(audio_wav)
    result = model.transcribe(audio, batch_size=batch_size)
    # Release the transcription model before loading the next one.
    del model
    gc.collect(); torch.cuda.empty_cache()
    print("Transcript complete")
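
    # 2. Alignment: refine the transcript with word-level timestamps.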
    model_a, metadata = whisperx.load_align_model(
        language_code=result["language"],
        device=device
    )
    result = whisperx.align(
        result["segments"],
        model_a,
        metadata,
        audio,
        device,
        return_char_alignments=True,
    )
    del model_a
    gc.collect(); torch.cuda.empty_cache()
    print("Align complete")

    if not result['segments']:
        print('No active speech found in audio')
        return
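
    # 3. Speaker diarization (pyannote via WhisperX); requires a Hugging Face token.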
    diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)
    diarize_segments = diarize_model(
        audio_wav,
        min_speakers=min_speakers,
        max_speakers=max_speakers)
    result_diarize = whisperx.assign_word_speakers(diarize_segments, result)
    del diarize_model
    gc.collect(); torch.cuda.empty_cache()
    print("Diarize complete")
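
    # 4. Translate the segment texts into the target language.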
    result_diarize['segments'] = translate_text(result_diarize['segments'], TRANSLATE_AUDIO_TO)
    print("Translation complete")

    audio_files = []

    # Up to six diarized speakers, each mapped to one of the selectable TTS voices.
    speaker_to_voice = {
        'SPEAKER_00': tts_voice00,
        'SPEAKER_01': tts_voice01,
        'SPEAKER_02': tts_voice02,
        'SPEAKER_03': tts_voice03,
        'SPEAKER_04': tts_voice04,
        'SPEAKER_05': tts_voice05,
    }
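
    # 5. Text-to-speech per segment: synthesize with the voice assigned to the
    # segment's speaker, then time-stretch the clip so it roughly matches the
    # original segment duration.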
    for segment in tqdm(result_diarize['segments']):

        text = segment['text']
        start = segment['start']
        end = segment['end']

        try:
            speaker = segment['speaker']
        except KeyError:
            segment['speaker'] = "SPEAKER_99"
            speaker = segment['speaker']
            print("No speaker detected in this segment")

        filename = f"audio/{start}.ogg"

        if speaker in speaker_to_voice and speaker_to_voice[speaker] != 'None':
            make_voice(text, speaker_to_voice[speaker], filename, TRANSLATE_AUDIO_TO)
        elif speaker == "SPEAKER_99":
            # No diarized speaker: fall back to gTTS; if that also fails, write a
            # placeholder clip so the timeline stays intact.
            try:
                tts = gTTS(text, lang=TRANSLATE_AUDIO_TO)
                tts.save(filename)
                print('Using gTTS')
            except Exception:
                tts = gTTS('a', lang=TRANSLATE_AUDIO_TO)
                tts.save(filename)
                print('Error: Audio will be replaced.')

        # Tempo factor: synthesized duration relative to the original segment.
        duration_true = end - start
        duration_tts = librosa.get_duration(filename=filename)
        porcentaje = duration_tts / duration_true

        # Clamp: speed up at most 2.1x, leave near-matching clips untouched,
        # and never slow down below 0.8x.
        if porcentaje > 2.1:
            porcentaje = 2.1
        elif 0.8 <= porcentaje <= 1.2:
            porcentaje = 1.0
        elif porcentaje <= 0.79:
            porcentaje = 0.8
        porcentaje = round(porcentaje, 1)

        # Time-stretch with ffmpeg atempo (changes speed without changing pitch).
        os.system(f"ffmpeg -y -loglevel panic -i {filename} -filter:a atempo={porcentaje} audio2/{filename}")

        duration_create = librosa.get_duration(filename=f"audio2/{filename}")
        audio_files.append(filename)

    # Replace the raw TTS clips with the time-stretched versions.
    os.system("mv -f audio2/audio/*.ogg audio/")
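
    # 6. Build the continuous dubbed track, duck the original audio under it
    # with sidechain compression, and mux the mixed audio into the video.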
    os.system(f"rm {Output_name_file}")
    create_translated_audio(result_diarize, audio_files, Output_name_file)

    os.system(f"rm {mix_audio}")
    os.system(f'ffmpeg -i {audio_wav} -i {Output_name_file} -filter_complex "[1:a]asplit=2[sc][mix];[0:a][sc]sidechaincompress=threshold=0.003:ratio=20[bg]; [bg][mix]amerge[final]" -map [final] {mix_audio}')

    os.system(f"rm {video_output}")
    os.system(f"ffmpeg -i {OutputFile} -i {mix_audio} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output}")

    return video_output