"""Gradio web UI for Hex RVC: voice conversion, model download and audio separation."""

import os
import random
import re
import shutil
import subprocess
import tempfile
import time
import urllib.request
import zipfile
from argparse import ArgumentParser
from pathlib import Path

import anyio
import edge_tts
import gdown
import gradio as gr
import numpy as np
import yt_dlp
from audio_separator.separator import Separator
from pydub import AudioSegment
from scipy.io.wavfile import read
from scipy.io.wavfile import write

from download_model import download_online_model
from lib.infer import infer_audio
from lib.language_tts import language_dict

main_dir = Path().resolve()
print(main_dir)
os.chdir(main_dir)

models_dir = main_dir / "rvc_models"
audio_separat_dir = main_dir / "audio_input"
AUDIO_DIR = main_dir / 'audio_input'

# File extensions offered in the "Select an audio file" dropdown.
AUDIO_EXTENSIONS = ('.mp3', '.wav', '.flac', '.ogg', '.aac')


def get_folders():
    """Return the sorted names of the sub-directories of ``models_dir``.

    Returns an empty list when ``models_dir`` does not exist or is not a
    directory.
    """
    if models_dir.is_dir():
        return sorted(folder.name for folder in models_dir.iterdir() if folder.is_dir())
    return []


def refresh_folders():
    """Return a Gradio update carrying the current list of model folders."""
    # gr.update is the same helper refresh_audio_list() uses.
    return gr.update(choices=get_folders())


def get_audio_files():
    """Return the sorted audio file names found in ``AUDIO_DIR``.

    The directory is created when missing. Only names ending in one of
    ``AUDIO_EXTENSIONS`` (case-insensitive) are returned.
    """
    os.makedirs(AUDIO_DIR, exist_ok=True)
    return sorted(f for f in os.listdir(AUDIO_DIR) if f.lower().endswith(AUDIO_EXTENSIONS))


def load_audio_files():
    """Return the full path of every audio file in ``AUDIO_DIR``."""
    return [os.path.join(AUDIO_DIR, f) for f in get_audio_files()]


def refresh_audio_list():
    """Return a Gradio update carrying the current list of audio file paths."""
    return gr.update(choices=load_audio_files())


def play_audio(file_path):
    """Return ``file_path`` unchanged."""
    return file_path


def download_audio(url):
    """Download the audio behind ``url`` with yt-dlp and load it as WAV.

    The FFmpegExtractAudio post-processor converts the download to WAV
    under the ``ytdl/`` directory.

    Args:
        url: Link to the media to download.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``samples`` is an int16
        numpy array.

    Raises:
        gr.Error: If ``url`` is empty.
    """
    if not url or not url.strip():
        raise gr.Error("Please provide a link.")

    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': 'ytdl/%(title)s.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(url, download=True)
        # prepare_filename() gives the pre-conversion name; swap in the
        # extension the post-processor writes.
        file_path = str(Path(ydl.prepare_filename(info_dict)).with_suffix('.wav'))

    sample_rate, audio_data = read(file_path)
    audio_array = np.asarray(audio_data, dtype=np.int16)
    return sample_rate, audio_array


def _separate_stage(separator, output_dir, model_filename, source, first_target, second_target):
    """Run one separation model on ``source`` and rename its two outputs."""
    separator.load_model(model_filename=model_filename)
    produced = separator.separate(source)
    # os.replace overwrites an existing target on every platform, so a
    # second run does not fail where os.rename would on Windows.
    os.replace(os.path.join(output_dir, produced[0]), first_target)
    os.replace(os.path.join(output_dir, produced[1]), second_target)


def separate_audio(input_audio, model_voc_inst, model_deecho, model_back_voc):
    """Split ``input_audio`` into six stems using three separation models.

    Args:
        input_audio: Path of the audio file to separate.
        model_voc_inst: Model file name for the vocal/instrumental split.
        model_deecho: Model file name for the de-echo/de-reverb pass.
        model_back_voc: Model file name for the lead/backing vocal split.

    Returns:
        A list of six paths inside ``audio_separat_dir``: instrumental,
        vocals, vocals without reverb, vocals with reverb, lead vocals and
        backing vocals.
    """
    output_dir = audio_separat_dir
    os.makedirs(output_dir, exist_ok=True)
    separator = Separator(output_dir=output_dir)

    # Final output file names.
    vocals = os.path.join(output_dir, 'Vocals.wav')
    instrumental = os.path.join(output_dir, 'Instrumental.wav')
    vocals_reverb = os.path.join(output_dir, 'Vocals (Reverb).wav')
    vocals_no_reverb = os.path.join(output_dir, 'Vocals (No Reverb).wav')
    lead_vocals = os.path.join(output_dir, 'Lead Vocals.wav')
    backing_vocals = os.path.join(output_dir, 'Backing Vocals.wav')

    # NOTE(review): the order of the two files each model returns is taken
    # from the original code - confirm against the models in use.
    # 1. Split the track into instrumental and vocals.
    _separate_stage(separator, output_dir, model_voc_inst, input_audio, instrumental, vocals)
    # 2. Apply DeEcho-DeReverb to the vocals.
    _separate_stage(separator, output_dir, model_deecho, vocals, vocals_no_reverb, vocals_reverb)
    # 3. Separate backing vocals from the lead vocals.
    _separate_stage(separator, output_dir, model_back_voc, vocals_no_reverb, backing_vocals, lead_vocals)

    return [
        instrumental,
        vocals,
        vocals_no_reverb,
        vocals_reverb,
        lead_vocals,
        backing_vocals,
    ]


def gradio_interface(input_audio, model_voc_inst, model_deecho, model_back_voc):
    """Gradio callback: separate ``input_audio`` and return the stem paths.

    The list of paths is returned as-is because a multi-file ``gr.File``
    output takes file paths, not component instances.
    """
    return separate_audio(input_audio, model_voc_inst, model_deecho, model_back_voc)


def _resolve_uploaded_audio(upload_audio):
    """Return a filesystem path for ``upload_audio``.

    A path (what ``gr.Audio(type='filepath')`` delivers) is returned
    directly. A file-like object exposing ``name`` and ``read()`` is copied
    into the ``uploaded_audio`` directory first.
    """
    if isinstance(upload_audio, (str, os.PathLike)):
        return os.fspath(upload_audio)

    upload_dir = Path("uploaded_audio")
    upload_dir.mkdir(parents=True, exist_ok=True)
    # Keep only the base name so the copy always lands inside upload_dir.
    target = upload_dir / Path(upload_audio.name).name
    with open(target, "wb") as f:
        f.write(upload_audio.read())
    return str(target)


def _ensure_executable(path):
    """Best-effort: set the execute bits on ``path`` if it is a file."""
    target = Path(path)
    if not target.is_file():
        return
    try:
        target.chmod(target.stat().st_mode | 0o111)
    except OSError as err:
        # The previous shell "chmod +x" was best-effort too; do not abort
        # the conversion over a permission problem.
        print(f"Could not mark {target} as executable: {err}")


def process_audio(MODEL_NAME, SOUND_PATH, F0_CHANGE, F0_METHOD, MIN_PITCH, MAX_PITCH, CREPE_HOP_LENGTH, INDEX_RATE,
                  FILTER_RADIUS, RMS_MIX_RATE, PROTECT, SPLIT_INFER, MIN_SILENCE, SILENCE_THRESHOLD, SEEK_STEP,
                  KEEP_SILENCE, FORMANT_SHIFT, QUEFRENCY, TIMBRE, F0_AUTOTUNE, OUTPUT_FORMAT, upload_audio=None):
    """Gradio callback: run RVC inference and return its result.

    When ``SOUND_PATH`` is empty, ``upload_audio`` is used as the input
    instead. All other arguments are forwarded unchanged to
    ``infer_audio``.

    Returns:
        Whatever ``infer_audio`` returns.

    Raises:
        gr.Error: If no model name or no input audio is available. The
            message is shown in the UI instead of being handed to the
            audio output as a bogus file path.
    """
    if not MODEL_NAME:
        raise gr.Error("Please provide a model name.")

    # If no sound path is given, fall back to the uploaded/generated file.
    if not SOUND_PATH and upload_audio is not None:
        SOUND_PATH = _resolve_uploaded_audio(upload_audio)
    if not SOUND_PATH:
        raise gr.Error("Please select or upload an audio file.")

    _ensure_executable("stftpitchshift")
    return infer_audio(
        MODEL_NAME,
        SOUND_PATH,
        F0_CHANGE,
        F0_METHOD,
        MIN_PITCH,
        MAX_PITCH,
        CREPE_HOP_LENGTH,
        INDEX_RATE,
        FILTER_RADIUS,
        RMS_MIX_RATE,
        PROTECT,
        SPLIT_INFER,
        MIN_SILENCE,
        SILENCE_THRESHOLD,
        SEEK_STEP,
        KEEP_SILENCE,
        FORMANT_SHIFT,
        QUEFRENCY,
        TIMBRE,
        F0_AUTOTUNE,
        OUTPUT_FORMAT,
    )


async def text_to_speech_edge(text, language_code):
    """Synthesize ``text`` with edge-tts and return the audio file path.

    Args:
        text: Text to speak.
        language_code: Key into ``language_dict`` selecting the voice.

    Returns:
        Path of a temporary file holding the synthesized audio. The file is
        not deleted automatically.

    Raises:
        gr.Error: If ``text`` is empty.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to convert.")

    # NOTE(review): "default_voice" is the original fallback; it does not
    # look like a real edge-tts voice name - confirm the intended default.
    voice = language_dict.get(language_code, "default_voice")
    communicate = edge_tts.Communicate(text, voice)

    # edge-tts produces MP3 data, so label the file accordingly. Close the
    # handle before saving so the path can be reopened on every platform.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    return tmp_path


def _parse_args():
    """Parse the command-line options controlling how the UI is served."""
    parser = ArgumentParser(description='Generate a AI song in the song_output/id directory.', add_help=True)
    parser.add_argument("--share", action="store_true", dest="share_enabled", default=False, help="Enable sharing")
    parser.add_argument("--listen", action="store_true", default=False,
                        help="Make the UI reachable from your local network.")
    parser.add_argument('--listen-host', type=str, help='The hostname that the server will use.')
    parser.add_argument('--listen-port', type=int, help='The listening port that the server will use.')
    return parser.parse_args()


def build_app():
    """Build and return the Gradio Blocks application (three tabs)."""
    # NOTE(review): the layout nesting below was reconstructed from the
    # order of the `with` statements - confirm it matches the intended UI.
    # The upper-case names mirror the parameters of process_audio().
    with gr.Blocks(title="Hex RVC", theme=gr.themes.Base(primary_hue="red", secondary_hue="pink")) as app:
        gr.Markdown("# Hex RVC")
        gr.Markdown(" join [AIHub](https://discord.gg/aihub) to get the rvc model!")

        with gr.Tab("Inference"):
            with gr.Row():
                MODEL_NAME = gr.Dropdown(
                    label="Select a Model",
                    choices=get_folders(),
                    interactive=True,
                    elem_id="model_folder",
                )
                SOUND_PATH = gr.Dropdown(
                    choices=load_audio_files(),
                    label="Select an audio file",
                    interactive=True,
                    value=None,
                )

            # Hidden input; it receives the audio generated by the TTS panel.
            with gr.Row():
                upload_audio = gr.Audio(label="Upload Audio", type='filepath', visible=False)

            with gr.Accordion("Conversion Settings"):
                with gr.Row():
                    F0_CHANGE = gr.Number(label="Pitch Change (semitones)", value=0)
                    F0_METHOD = gr.Dropdown(
                        choices=["crepe", "harvest", "mangio-crepe", "rmvpe", "rmvpe_legacy", "fcpe", "fcpe_legacy",
                                 "hybrid[rmvpe+fcpe]"],
                        label="F0 Method",
                        value="fcpe",
                    )
                with gr.Row():
                    MIN_PITCH = gr.Textbox(label="Min Pitch", value="50")
                    MAX_PITCH = gr.Textbox(label="Max Pitch", value="1100")
                    CREPE_HOP_LENGTH = gr.Number(label="Crepe Hop Length", value=120)
                    INDEX_RATE = gr.Slider(label="Index Rate", minimum=0, maximum=1, value=0.75)
                    FILTER_RADIUS = gr.Number(label="Filter Radius", value=3)
                    RMS_MIX_RATE = gr.Slider(label="RMS Mix Rate", minimum=0, maximum=1, value=0.25)
                    PROTECT = gr.Slider(label="Protect", minimum=0, maximum=1, value=0.33)

            with gr.Accordion("Hex TTS", open=False):
                input_text = gr.Textbox(lines=5, label="Input Text")
                language = gr.Dropdown(choices=list(language_dict.keys()), label="Choose the Voice Model")
                tts_convert = gr.Button("Convert")
                tts_convert.click(fn=text_to_speech_edge, inputs=[input_text, language], outputs=[upload_audio])

            with gr.Accordion("Advanced Settings", open=False):
                SPLIT_INFER = gr.Checkbox(label="Enable Split Inference", value=False)
                MIN_SILENCE = gr.Number(label="Min Silence (ms)", value=500)
                SILENCE_THRESHOLD = gr.Number(label="Silence Threshold (dBFS)", value=-50)
                SEEK_STEP = gr.Slider(label="Seek Step (ms)", minimum=1, maximum=10, value=1)
                KEEP_SILENCE = gr.Number(label="Keep Silence (ms)", value=200)
                FORMANT_SHIFT = gr.Checkbox(label="Enable Formant Shift", value=False)
                QUEFRENCY = gr.Number(label="Quefrency", value=0)
                TIMBRE = gr.Number(label="Timbre", value=1)
                F0_AUTOTUNE = gr.Checkbox(label="Enable F0 Autotune", value=False)
                OUTPUT_FORMAT = gr.Dropdown(choices=["wav", "flac", "mp3"], label="Output Format", value="wav")

            output_audio = gr.Audio(label="Generated Audio", type='filepath')

            with gr.Row():
                refresh_btn = gr.Button("Refresh")
                run_button = gr.Button("Convert")

            refresh_btn.click(
                lambda: (refresh_audio_list(), refresh_folders()),
                outputs=[SOUND_PATH, MODEL_NAME],
            )
            run_button.click(
                process_audio,
                inputs=[MODEL_NAME, SOUND_PATH, F0_CHANGE, F0_METHOD, MIN_PITCH, MAX_PITCH, CREPE_HOP_LENGTH,
                        INDEX_RATE, FILTER_RADIUS, RMS_MIX_RATE, PROTECT, SPLIT_INFER, MIN_SILENCE,
                        SILENCE_THRESHOLD, SEEK_STEP, KEEP_SILENCE, FORMANT_SHIFT, QUEFRENCY, TIMBRE,
                        F0_AUTOTUNE, OUTPUT_FORMAT, upload_audio],
                outputs=output_audio,
            )

        with gr.Tab("Download RVC Model"):
            with gr.Row():
                url = gr.Textbox(label="Your model URL")
                dirname = gr.Textbox(label="Your Model name")
            output_path = gr.Textbox(label="output download", interactive=False)
            button_model = gr.Button("Download model")
            button_model.click(fn=download_online_model, inputs=[url, dirname], outputs=[output_path])

        with gr.Tab("Audio Separation"):
            with gr.Row():
                input_audio = gr.Audio(type="filepath", label="Upload Audio File")

            with gr.Row():
                with gr.Accordion("Separation by Link", open=False):
                    with gr.Row():
                        roformer_link = gr.Textbox(
                            label="Link",
                            placeholder="Paste the link here",
                            interactive=True,
                        )
                    with gr.Row():
                        gr.Markdown("You can paste the link to the video/audio from many sites, check the complete list [here](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md)")
                    with gr.Row():
                        roformer_download_button = gr.Button(
                            "Download!",
                            variant="primary",
                        )
                    roformer_download_button.click(download_audio, [roformer_link], [input_audio])

            # Hidden textboxes carrying the fixed model file names.
            with gr.Row():
                model_voc_inst = gr.Textbox(value='model_bs_roformer_ep_317_sdr_12.9755.ckpt',
                                            label="Vocal & Instrumental Model", visible=False)
                model_deecho = gr.Textbox(value='UVR-DeEcho-DeReverb.pth',
                                          label="DeEcho-DeReverb Model", visible=False)
                model_back_voc = gr.Textbox(value='mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt',
                                            label="Backing Vocals Model", visible=False)

            submit_button = gr.Button("Separate Audio")

            with gr.Row():
                # file_count="multiple" is the File parameter that allows
                # several files in one component.
                output_files = gr.File(label="Download Separated Files", file_count="multiple")

            submit_button.click(
                fn=gradio_interface,
                inputs=[input_audio, model_voc_inst, model_deecho, model_back_voc],
                outputs=output_files,
            )

    return app


def main():
    """Parse the command line, build the UI and launch the Gradio server."""
    args = _parse_args()
    app = build_app()
    # Launch the Gradio app
    app.launch(
        share=args.share_enabled,
        server_name=None if not args.listen else (args.listen_host or '0.0.0.0'),
        server_port=args.listen_port,
    )


if __name__ == '__main__':
    main()