import gradio as gr
import torch
from TTS.api import TTS
import os
import librosa
import soundfile as sf  # librosa.output.write_wav was removed in librosa 0.8; soundfile handles wav output
import requests
from datetime import datetime

# Import locally stored TTS models
import import_local_tts_models

# Get device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize the voice conversion model
tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False).to(device)


def convert_audio_to_wav(file_path):
    """Convert any supported format (mp3, etc.) to wav using librosa/soundfile."""
    output_path = "temp_input.wav"
    audio, sr = librosa.load(file_path, sr=None)  # Load file (wav, mp3, etc.)
    sf.write(output_path, audio, sr)  # Write out as wav
    return output_path


def upload_to_file_io(file_path):
    """Upload a file to file.io and return the temporary link."""
    url = "https://file.io"
    with open(file_path, "rb") as f:
        response = requests.post(url, files={"file": f})
    if response.status_code == 200:
        return response.json().get("link")
    return None


def voice_conversion(input_audio, target_voice, uploaded_target_voice):
    output_path = "output.wav"

    # Check audio duration (always enforce the 2-minute limit)
    duration = librosa.get_duration(path=input_audio)  # path= replaces the older filename= keyword in newer librosa
    if duration > 120:
        print("Error: Input audio file exceeds 2 minutes.")
        raise gr.Error("Error: Input audio file exceeds 2 minutes.")
    elif duration > 30:
        gr.Info("Your input file is over 30 seconds, so be patient with the loading time.")

    # Use the uploaded target voice if provided, otherwise fall back to the selected example
    if uploaded_target_voice is not None:
        target_voice_path = uploaded_target_voice
        if not uploaded_target_voice.endswith(".wav"):
            target_voice_path = convert_audio_to_wav(uploaded_target_voice)
    else:
        target_voice_path = os.path.join("Examples", target_voice)
        if not os.path.exists(target_voice_path):
            return None, "Error: Target voice file not found."

    # Convert input audio to wav if necessary
    if not input_audio.endswith(".wav"):
        input_audio = convert_audio_to_wav(input_audio)

    # Perform voice conversion
    tts.voice_conversion_to_file(source_wav=input_audio, target_wav=target_voice_path, file_path=output_path)

    # Upload input audio to file.io and log the link (internal testing only; remove once public)
    input_file_link = upload_to_file_io(input_audio)
    if input_file_link:
        print(f"Input file uploaded to: {input_file_link}")  # Log the link to the terminal
    else:
        print("Error uploading the input file to file.io")

    return output_path, None


# Collect example target voices from the Examples folder
examples_folder = "Examples/"
example_files = [f for f in os.listdir(examples_folder) if f.endswith(".wav")]

# Define the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Voice Conversion using Coqui TTS")

    with gr.Row():
        input_audio = gr.Audio(label="Record or Upload Your Voice (max length: 2 minutes)", type="filepath")
        target_voice = gr.Dropdown(
            choices=example_files,
            label="Select Target Voice from Examples",
            value=example_files[0] if example_files else None,  # Guard against an empty Examples folder
            info="Located in the Examples/ folder"
        )
        uploaded_target_voice = gr.Audio(
            label="Or Upload Your Own Target Voice",
            type="filepath"
        )

    with gr.Row():
        play_button = gr.Button("Preview Selected Target Voice")
        preview_audio = gr.Audio(label="Preview Target Voice", type="filepath")

    convert_button = gr.Button("Convert Voice")
    output_audio = gr.Audio(label="Converted Voice", type="filepath")
    error_message = gr.Textbox(label="Error Message", visible=False)  # Textbox for displaying errors

    # Preview button for listening to the selected example target voice
    def preview_target_voice(selected_target_voice):
        return os.path.join(examples_folder, selected_target_voice)

    play_button.click(preview_target_voice, inputs=[target_voice], outputs=preview_audio)

    # Conversion process with both audio and error outputs
    convert_button.click(
        voice_conversion,
        inputs=[input_audio, target_voice, uploaded_target_voice],
        outputs=[output_audio, error_message]  # Outputs include audio and error
    )

# Launch the app; use share=True for a public URL
# demo.launch(share=True)
demo.queue().launch()