# soni_translate/video_dubbing.py
import gc
import os
import time

import librosa
import torch
import whisperx
from gtts import gTTS
from tqdm import tqdm

from soni_translate.audio_segments import create_translated_audio
from soni_translate.text_to_speech import make_voice
from soni_translate.translate_segments import translate_text

def translate_from_video(
    video,
    YOUR_HF_TOKEN,
    preview=False,
    WHISPER_MODEL_SIZE="large-v1",
    batch_size=16,
    compute_type="float16",
    SOURCE_LANGUAGE="Automatic detection",
    TRANSLATE_AUDIO_TO="en",
    min_speakers=1,
    max_speakers=2,
    tts_voice00="en-AU-WilliamNeural-Male",
    tts_voice01="en-CA-ClaraNeural-Female",
    tts_voice02="en-GB-ThomasNeural-Male",
    tts_voice03="en-GB-SoniaNeural-Female",
    tts_voice04="en-NZ-MitchellNeural-Male",
    tts_voice05="en-GB-MaisieNeural-Female",
    video_output="video_dub.mp4"
):
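    """Dub a video into the target language.

    Transcribes the source audio with whisperx, aligns it, diarizes the
    speakers, translates the segments, synthesizes a per-speaker TTS clip
    for each segment, time-stretches the clips to fit their slots, mixes
    the dub over the ducked original audio, and remuxes it into the video.
    `video` can be a local file path or a URL (downloaded via yt-dlp).
    Returns the path of the dubbed video, or None on failure.
    """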
    if not YOUR_HF_TOKEN:
        YOUR_HF_TOKEN = os.getenv("YOUR_HF_TOKEN")

    os.makedirs('audio', exist_ok=True)
    os.makedirs('audio2/audio', exist_ok=True)

    # Check GPU
    device = "cuda" if torch.cuda.is_available() else "cpu"
    compute_type = "float32" if device == "cpu" else compute_type

    OutputFile = 'Video.mp4'
    audio_wav = "audio.wav"
    Output_name_file = "audio_dub_solo.ogg"
    mix_audio = "audio_mix.mp3"

    # Remove leftovers from previous runs; -f suppresses errors for missing files
    os.system("rm -f Video.mp4 audio.webm audio.wav")
    if os.path.exists(video):
        if preview:
            print('Creating preview video, 10 seconds')
            os.system(f'ffmpeg -y -i "{video}" -ss 00:00:20 -t 00:00:10 -c:v libx264 -c:a aac -strict experimental Video.mp4')
        else:
            os.system(f'ffmpeg -y -i "{video}" -c:v libx264 -c:a aac -strict experimental Video.mp4')
        os.system("ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav")
    else:
        if preview:
            print('Creating preview from link, 10 seconds')
            # https://github.com/yt-dlp/yt-dlp/issues/2220
            mp4_ = f'yt-dlp -f "mp4" --downloader ffmpeg --downloader-args "ffmpeg_i: -ss 00:00:20 -t 00:00:10" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
            wav_ = "ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav"
            os.system(mp4_)
            os.system(wav_)
        else:
            mp4_ = f'yt-dlp -f "mp4" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
            wav_ = f'python -m yt_dlp --output {audio_wav} --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --extract-audio --audio-format wav {video}'
            os.system(wav_)
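            # Poll up to 120 s for yt-dlp to finish writing the audio file,
            # then download the video itself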
            for i in range(120):
                time.sleep(1)
                print('process audio')
                if os.path.exists(audio_wav) and not os.path.exists('audio.webm'):
                    time.sleep(1)
                    os.system(mp4_)
                    break
                if i == 119:
                    print('Error downloading the audio')
                    return

    print("Set file complete.")

    SOURCE_LANGUAGE = None if SOURCE_LANGUAGE == 'Automatic detection' else SOURCE_LANGUAGE
    # 1. Transcribe with original whisper (batched)
    model = whisperx.load_model(
        WHISPER_MODEL_SIZE,
        device,
        compute_type=compute_type,
        language=SOURCE_LANGUAGE,
    )
    audio = whisperx.load_audio(audio_wav)
    result = model.transcribe(audio, batch_size=batch_size)
    # Drop the model reference before collecting, or its GPU memory is not released
    del model
    gc.collect()
    torch.cuda.empty_cache()
    print("Transcript complete")
    # 2. Align whisper output
    model_a, metadata = whisperx.load_align_model(
        language_code=result["language"],
        device=device
    )
    result = whisperx.align(
        result["segments"],
        model_a,
        metadata,
        audio,
        device,
        return_char_alignments=True,
    )
    del model_a
    gc.collect()
    torch.cuda.empty_cache()
    print("Align complete")

    if not result['segments']:
        print('No active speech found in audio')
        return
    # 3. Assign speaker labels (requires a Hugging Face token with access to the pyannote models)
    diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)
    diarize_segments = diarize_model(
        audio_wav,
        min_speakers=min_speakers,
        max_speakers=max_speakers)
    result_diarize = whisperx.assign_word_speakers(diarize_segments, result)
    del diarize_model
    gc.collect()
    torch.cuda.empty_cache()
    print("Diarize complete")

    # 4. Translate the segment texts to the target language
    result_diarize['segments'] = translate_text(result_diarize['segments'], TRANSLATE_AUDIO_TO)
    print("Translation complete")
    audio_files = []
    # Mapping speakers to voice variables
    speaker_to_voice = {
        'SPEAKER_00': tts_voice00,
        'SPEAKER_01': tts_voice01,
        'SPEAKER_02': tts_voice02,
        'SPEAKER_03': tts_voice03,
        'SPEAKER_04': tts_voice04,
        'SPEAKER_05': tts_voice05
    }
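    # Synthesize a TTS clip per segment; segments without a diarized speaker fall back to gTTS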
    for segment in tqdm(result_diarize['segments']):
        text = segment['text']
        start = segment['start']
        end = segment['end']
        try:
            speaker = segment['speaker']
        except KeyError:
            segment['speaker'] = "SPEAKER_99"
            speaker = segment['speaker']
            print("No speaker detected in this segment")

        # make the tts audio
        filename = f"audio/{start}.ogg"
        if speaker in speaker_to_voice and speaker_to_voice[speaker] != 'None':
            make_voice(text, speaker_to_voice[speaker], filename, TRANSLATE_AUDIO_TO)
        elif speaker == "SPEAKER_99":
            try:
                tts = gTTS(text, lang=TRANSLATE_AUDIO_TO)
                tts.save(filename)
                print('Using GTTS')
            except Exception:
                tts = gTTS('a', lang=TRANSLATE_AUDIO_TO)
                tts.save(filename)
                print('Error: Audio will be replaced.')
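        # Time-stretch the TTS clip so it fits its segment: the tempo factor is
        # clamped to [0.8, 2.1] and snapped to 1.0 when it is already close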
        duration_true = end - start
        duration_tts = librosa.get_duration(filename=filename)
        # percentage: TTS duration relative to the segment duration
        porcentaje = duration_tts / duration_true
        if porcentaje > 2.1:
            porcentaje = 2.1
        elif 0.8 <= porcentaje <= 1.2:
            porcentaje = 1.0
        elif porcentaje <= 0.79:
            porcentaje = 0.8
        # smooth and round
        porcentaje = round(porcentaje, 1)

        # apply the acceleration (or the opposite) to the audio file in the audio2 folder
        os.system(f"ffmpeg -y -loglevel panic -i {filename} -filter:a atempo={porcentaje} audio2/{filename}")
        duration_create = librosa.get_duration(filename=f"audio2/{filename}")
        audio_files.append(filename)

    # Replace the original clips with the tempo-adjusted ones
    os.system("mv -f audio2/audio/*.ogg audio/")

    # Assemble the per-segment clips into a single dub track at their segment offsets
    os.system(f"rm -f {Output_name_file}")
    create_translated_audio(result_diarize, audio_files, Output_name_file)
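    # Mix: duck the original audio under the dub via sidechain compression, then merge both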
os.system(f"rm {mix_audio}")
os.system(f'ffmpeg -i {audio_wav} -i {Output_name_file} -filter_complex "[1:a]asplit=2[sc][mix];[0:a][sc]sidechaincompress=threshold=0.003:ratio=20[bg]; [bg][mix]amerge[final]" -map [final] {mix_audio}')
os.system(f"rm {video_output}")
os.system(f"ffmpeg -i {OutputFile} -i {mix_audio} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output}")
return video_output
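

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: the video path
    # below is a placeholder, and the token is read from the environment
    dubbed = translate_from_video(
        "my_video.mp4",
        YOUR_HF_TOKEN=os.getenv("YOUR_HF_TOKEN", ""),
        TRANSLATE_AUDIO_TO="en",
    )
    print(dubbed)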