import os import sys import json import argparse import subprocess from functools import lru_cache from distutils.util import strtobool now_dir = os.getcwd() sys.path.append(now_dir) current_script_directory = os.path.dirname(os.path.realpath(__file__)) logs_path = os.path.join(current_script_directory, "logs") from rvc.lib.tools.prerequisites_download import prequisites_download_pipeline from rvc.lib.tools.analyzer import analyze_audio from rvc.lib.tools.launch_tensorboard import launch_tensorboard_pipeline from rvc.lib.tools.model_download import model_download_pipeline python = sys.executable # Get TTS Voices -> https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=6A5AA1D4EAFF4E9FB37E23D68491D6F4 @lru_cache(maxsize=1) # Cache only one result since the file is static def load_voices_data(): with open( os.path.join("rvc", "lib", "tools", "tts_voices.json"), "r", encoding="utf-8" ) as file: return json.load(file) voices_data = load_voices_data() locales = list({voice["ShortName"] for voice in voices_data}) @lru_cache(maxsize=None) def import_voice_converter(): from rvc.infer.infer import VoiceConverter return VoiceConverter() @lru_cache(maxsize=1) def get_config(): from rvc.configs.config import Config return Config() # Infer def run_infer_script( pitch: int, filter_radius: int, index_rate: float, volume_envelope: int, protect: float, hop_length: int, f0_method: str, input_path: str, output_path: str, pth_path: str, index_path: str, split_audio: bool, f0_autotune: bool, f0_autotune_strength: float, clean_audio: bool, clean_strength: float, export_format: str, f0_file: str, embedder_model: str, embedder_model_custom: str = None, formant_shifting: bool = False, formant_qfrency: float = 1.0, formant_timbre: float = 1.0, post_process: bool = False, reverb: bool = False, pitch_shift: bool = False, limiter: bool = False, gain: bool = False, distortion: bool = False, chorus: bool = False, bitcrush: bool = False, clipping: bool = False, compressor: bool = False, delay: bool = False, reverb_room_size: float = 0.5, reverb_damping: float = 0.5, reverb_wet_gain: float = 0.5, reverb_dry_gain: float = 0.5, reverb_width: float = 0.5, reverb_freeze_mode: float = 0.5, pitch_shift_semitones: float = 0.0, limiter_threshold: float = -6, limiter_release_time: float = 0.01, gain_db: float = 0.0, distortion_gain: float = 25, chorus_rate: float = 1.0, chorus_depth: float = 0.25, chorus_center_delay: float = 7, chorus_feedback: float = 0.0, chorus_mix: float = 0.5, bitcrush_bit_depth: int = 8, clipping_threshold: float = -6, compressor_threshold: float = 0, compressor_ratio: float = 1, compressor_attack: float = 1.0, compressor_release: float = 100, delay_seconds: float = 0.5, delay_feedback: float = 0.0, delay_mix: float = 0.5, sid: int = 0, ): kwargs = { "audio_input_path": input_path, "audio_output_path": output_path, "model_path": pth_path, "index_path": index_path, "pitch": pitch, "filter_radius": filter_radius, "index_rate": index_rate, "volume_envelope": volume_envelope, "protect": protect, "hop_length": hop_length, "f0_method": f0_method, "pth_path": pth_path, "index_path": index_path, "split_audio": split_audio, "f0_autotune": f0_autotune, "f0_autotune_strength": f0_autotune_strength, "clean_audio": clean_audio, "clean_strength": clean_strength, "export_format": export_format, "f0_file": f0_file, "embedder_model": embedder_model, "embedder_model_custom": embedder_model_custom, "post_process": post_process, "formant_shifting": formant_shifting, "formant_qfrency": formant_qfrency, "formant_timbre": formant_timbre, "reverb": reverb, "pitch_shift": pitch_shift, "limiter": limiter, "gain": gain, "distortion": distortion, "chorus": chorus, "bitcrush": bitcrush, "clipping": clipping, "compressor": compressor, "delay": delay, "reverb_room_size": reverb_room_size, "reverb_damping": reverb_damping, "reverb_wet_level": reverb_wet_gain, "reverb_dry_level": reverb_dry_gain, "reverb_width": reverb_width, "reverb_freeze_mode": reverb_freeze_mode, "pitch_shift_semitones": pitch_shift_semitones, "limiter_threshold": limiter_threshold, "limiter_release": limiter_release_time, "gain_db": gain_db, "distortion_gain": distortion_gain, "chorus_rate": chorus_rate, "chorus_depth": chorus_depth, "chorus_delay": chorus_center_delay, "chorus_feedback": chorus_feedback, "chorus_mix": chorus_mix, "bitcrush_bit_depth": bitcrush_bit_depth, "clipping_threshold": clipping_threshold, "compressor_threshold": compressor_threshold, "compressor_ratio": compressor_ratio, "compressor_attack": compressor_attack, "compressor_release": compressor_release, "delay_seconds": delay_seconds, "delay_feedback": delay_feedback, "delay_mix": delay_mix, "sid": sid, } infer_pipeline = import_voice_converter() infer_pipeline.convert_audio( **kwargs, ) return f"File {input_path} inferred successfully.", output_path.replace( ".wav", f".{export_format.lower()}" ) # Batch infer def run_batch_infer_script( pitch: int, filter_radius: int, index_rate: float, volume_envelope: int, protect: float, hop_length: int, f0_method: str, input_folder: str, output_folder: str, pth_path: str, index_path: str, split_audio: bool, f0_autotune: bool, f0_autotune_strength: float, clean_audio: bool, clean_strength: float, export_format: str, f0_file: str, embedder_model: str, embedder_model_custom: str = None, formant_shifting: bool = False, formant_qfrency: float = 1.0, formant_timbre: float = 1.0, post_process: bool = False, reverb: bool = False, pitch_shift: bool = False, limiter: bool = False, gain: bool = False, distortion: bool = False, chorus: bool = False, bitcrush: bool = False, clipping: bool = False, compressor: bool = False, delay: bool = False, reverb_room_size: float = 0.5, reverb_damping: float = 0.5, reverb_wet_gain: float = 0.5, reverb_dry_gain: float = 0.5, reverb_width: float = 0.5, reverb_freeze_mode: float = 0.5, pitch_shift_semitones: float = 0.0, limiter_threshold: float = -6, limiter_release_time: float = 0.01, gain_db: float = 0.0, distortion_gain: float = 25, chorus_rate: float = 1.0, chorus_depth: float = 0.25, chorus_center_delay: float = 7, chorus_feedback: float = 0.0, chorus_mix: float = 0.5, bitcrush_bit_depth: int = 8, clipping_threshold: float = -6, compressor_threshold: float = 0, compressor_ratio: float = 1, compressor_attack: float = 1.0, compressor_release: float = 100, delay_seconds: float = 0.5, delay_feedback: float = 0.0, delay_mix: float = 0.5, sid: int = 0, ): kwargs = { "audio_input_paths": input_folder, "audio_output_path": output_folder, "model_path": pth_path, "index_path": index_path, "pitch": pitch, "filter_radius": filter_radius, "index_rate": index_rate, "volume_envelope": volume_envelope, "protect": protect, "hop_length": hop_length, "f0_method": f0_method, "pth_path": pth_path, "index_path": index_path, "split_audio": split_audio, "f0_autotune": f0_autotune, "f0_autotune_strength": f0_autotune_strength, "clean_audio": clean_audio, "clean_strength": clean_strength, "export_format": export_format, "f0_file": f0_file, "embedder_model": embedder_model, "embedder_model_custom": embedder_model_custom, "post_process": post_process, "formant_shifting": formant_shifting, "formant_qfrency": formant_qfrency, "formant_timbre": formant_timbre, "reverb": reverb, "pitch_shift": pitch_shift, "limiter": limiter, "gain": gain, "distortion": distortion, "chorus": chorus, "bitcrush": bitcrush, "clipping": clipping, "compressor": compressor, "delay": delay, "reverb_room_size": reverb_room_size, "reverb_damping": reverb_damping, "reverb_wet_level": reverb_wet_gain, "reverb_dry_level": reverb_dry_gain, "reverb_width": reverb_width, "reverb_freeze_mode": reverb_freeze_mode, "pitch_shift_semitones": pitch_shift_semitones, "limiter_threshold": limiter_threshold, "limiter_release": limiter_release_time, "gain_db": gain_db, "distortion_gain": distortion_gain, "chorus_rate": chorus_rate, "chorus_depth": chorus_depth, "chorus_delay": chorus_center_delay, "chorus_feedback": chorus_feedback, "chorus_mix": chorus_mix, "bitcrush_bit_depth": bitcrush_bit_depth, "clipping_threshold": clipping_threshold, "compressor_threshold": compressor_threshold, "compressor_ratio": compressor_ratio, "compressor_attack": compressor_attack, "compressor_release": compressor_release, "delay_seconds": delay_seconds, "delay_feedback": delay_feedback, "delay_mix": delay_mix, "sid": sid, } infer_pipeline = import_voice_converter() infer_pipeline.convert_audio_batch( **kwargs, ) return f"Files from {input_folder} inferred successfully." # TTS def run_tts_script( tts_file: str, tts_text: str, tts_voice: str, tts_rate: int, pitch: int, filter_radius: int, index_rate: float, volume_envelope: int, protect: float, hop_length: int, f0_method: str, output_tts_path: str, output_rvc_path: str, pth_path: str, index_path: str, split_audio: bool, f0_autotune: bool, f0_autotune_strength: float, clean_audio: bool, clean_strength: float, export_format: str, f0_file: str, embedder_model: str, embedder_model_custom: str = None, sid: int = 0, ): tts_script_path = os.path.join("rvc", "lib", "tools", "tts.py") if os.path.exists(output_tts_path): os.remove(output_tts_path) command_tts = [ *map( str, [ python, tts_script_path, tts_file, tts_text, tts_voice, tts_rate, output_tts_path, ], ), ] subprocess.run(command_tts) infer_pipeline = import_voice_converter() infer_pipeline.convert_audio( pitch=pitch, filter_radius=filter_radius, index_rate=index_rate, volume_envelope=volume_envelope, protect=protect, hop_length=hop_length, f0_method=f0_method, audio_input_path=output_tts_path, audio_output_path=output_rvc_path, model_path=pth_path, index_path=index_path, split_audio=split_audio, f0_autotune=f0_autotune, f0_autotune_strength=f0_autotune_strength, clean_audio=clean_audio, clean_strength=clean_strength, export_format=export_format, f0_file=f0_file, embedder_model=embedder_model, embedder_model_custom=embedder_model_custom, sid=sid, formant_shifting=None, formant_qfrency=None, formant_timbre=None, post_process=None, reverb=None, pitch_shift=None, limiter=None, gain=None, distortion=None, chorus=None, bitcrush=None, clipping=None, compressor=None, delay=None, sliders=None, ) return f"Text {tts_text} synthesized successfully.", output_rvc_path.replace( ".wav", f".{export_format.lower()}" ) # Tensorboard def run_tensorboard_script(): launch_tensorboard_pipeline() # Download def run_download_script(model_link: str): model_download_pipeline(model_link) return f"Model downloaded successfully." # Prerequisites def run_prerequisites_script( pretraineds_v1_f0: bool, pretraineds_v1_nof0: bool, pretraineds_v2_f0: bool, pretraineds_v2_nof0: bool, models: bool, exe: bool, ): prequisites_download_pipeline( pretraineds_v1_f0, pretraineds_v1_nof0, pretraineds_v2_f0, pretraineds_v2_nof0, models, exe, ) return "Prerequisites installed successfully." # Audio analyzer def run_audio_analyzer_script( input_path: str, save_plot_path: str = "logs/audio_analysis.png" ): audio_info, plot_path = analyze_audio(input_path, save_plot_path) print( f"Audio info of {input_path}: {audio_info}", f"Audio file {input_path} analyzed successfully. Plot saved at: {plot_path}", ) return audio_info, plot_path # Parse arguments def parse_arguments(): parser = argparse.ArgumentParser( description="Run the main.py script with specific parameters." ) subparsers = parser.add_subparsers( title="subcommands", dest="mode", help="Choose a mode" ) # Parser for 'infer' mode infer_parser = subparsers.add_parser("infer", help="Run inference") pitch_description = ( "Set the pitch of the audio. Higher values result in a higher pitch." ) infer_parser.add_argument( "--pitch", type=int, help=pitch_description, choices=range(-24, 25), default=0, ) filter_radius_description = "Apply median filtering to the extracted pitch values if this value is greater than or equal to three. This can help reduce breathiness in the output audio." infer_parser.add_argument( "--filter_radius", type=int, help=filter_radius_description, choices=range(11), default=3, ) index_rate_description = "Control the influence of the index file on the output. Higher values mean stronger influence. Lower values can help reduce artifacts but may result in less accurate voice cloning." infer_parser.add_argument( "--index_rate", type=float, help=index_rate_description, choices=[i / 100.0 for i in range(0, 101)], default=0.3, ) volume_envelope_description = "Control the blending of the output's volume envelope. A value of 1 means the output envelope is fully used." infer_parser.add_argument( "--volume_envelope", type=float, help=volume_envelope_description, choices=[i / 100.0 for i in range(0, 101)], default=1, ) protect_description = "Protect consonants and breathing sounds from artifacts. A value of 0.5 offers the strongest protection, while lower values may reduce the protection level but potentially mitigate the indexing effect." infer_parser.add_argument( "--protect", type=float, help=protect_description, choices=[i / 1000.0 for i in range(0, 501)], default=0.33, ) hop_length_description = "Only applicable for the Crepe pitch extraction method. Determines the time it takes for the system to react to a significant pitch change. Smaller values require more processing time but can lead to better pitch accuracy." infer_parser.add_argument( "--hop_length", type=int, help=hop_length_description, choices=range(1, 513), default=128, ) f0_method_description = "Choose the pitch extraction algorithm for the conversion. 'rmvpe' is the default and generally recommended." infer_parser.add_argument( "--f0_method", type=str, help=f0_method_description, choices=[ "crepe", "crepe-tiny", "rmvpe", "fcpe", "hybrid[crepe+rmvpe]", "hybrid[crepe+fcpe]", "hybrid[rmvpe+fcpe]", "hybrid[crepe+rmvpe+fcpe]", ], default="rmvpe", ) infer_parser.add_argument( "--input_path", type=str, help="Full path to the input audio file.", required=True, ) infer_parser.add_argument( "--output_path", type=str, help="Full path to the output audio file.", required=True, ) pth_path_description = "Full path to the RVC model file (.pth)." infer_parser.add_argument( "--pth_path", type=str, help=pth_path_description, required=True ) index_path_description = "Full path to the index file (.index)." infer_parser.add_argument( "--index_path", type=str, help=index_path_description, required=True ) split_audio_description = "Split the audio into smaller segments before inference. This can improve the quality of the output for longer audio files." infer_parser.add_argument( "--split_audio", type=lambda x: bool(strtobool(x)), choices=[True, False], help=split_audio_description, default=False, ) f0_autotune_description = "Apply a light autotune to the inferred audio. Particularly useful for singing voice conversions." infer_parser.add_argument( "--f0_autotune", type=lambda x: bool(strtobool(x)), choices=[True, False], help=f0_autotune_description, default=False, ) f0_autotune_strength_description = "Set the autotune strength - the more you increase it the more it will snap to the chromatic grid." infer_parser.add_argument( "--f0_autotune_strength", type=float, help=f0_autotune_strength_description, choices=[(i / 10) for i in range(11)], default=1.0, ) clean_audio_description = "Clean the output audio using noise reduction algorithms. Recommended for speech conversions." infer_parser.add_argument( "--clean_audio", type=lambda x: bool(strtobool(x)), choices=[True, False], help=clean_audio_description, default=False, ) clean_strength_description = "Adjust the intensity of the audio cleaning process. Higher values result in stronger cleaning, but may lead to a more compressed sound." infer_parser.add_argument( "--clean_strength", type=float, help=clean_strength_description, choices=[(i / 10) for i in range(11)], default=0.7, ) export_format_description = "Select the desired output audio format." infer_parser.add_argument( "--export_format", type=str, help=export_format_description, choices=["WAV", "MP3", "FLAC", "OGG", "M4A"], default="WAV", ) embedder_model_description = ( "Choose the model used for generating speaker embeddings." ) infer_parser.add_argument( "--embedder_model", type=str, help=embedder_model_description, choices=[ "contentvec", "chinese-hubert-base", "japanese-hubert-base", "korean-hubert-base", "custom", ], default="contentvec", ) embedder_model_custom_description = "Specify the path to a custom model for speaker embedding. Only applicable if 'embedder_model' is set to 'custom'." infer_parser.add_argument( "--embedder_model_custom", type=str, help=embedder_model_custom_description, default=None, ) f0_file_description = "Full path to an external F0 file (.f0). This allows you to use pre-computed pitch values for the input audio." infer_parser.add_argument( "--f0_file", type=str, help=f0_file_description, default=None, ) formant_shifting_description = "Apply formant shifting to the input audio. This can help adjust the timbre of the voice." infer_parser.add_argument( "--formant_shifting", type=lambda x: bool(strtobool(x)), choices=[True, False], help=formant_shifting_description, default=False, required=False, ) formant_qfrency_description = "Control the frequency of the formant shifting effect. Higher values result in a more pronounced effect." infer_parser.add_argument( "--formant_qfrency", type=float, help=formant_qfrency_description, default=1.0, required=False, ) formant_timbre_description = "Control the timbre of the formant shifting effect. Higher values result in a more pronounced effect." infer_parser.add_argument( "--formant_timbre", type=float, help=formant_timbre_description, default=1.0, required=False, ) sid_description = "Speaker ID for multi-speaker models." infer_parser.add_argument( "--sid", type=int, help=sid_description, default=0, required=False, ) post_process_description = "Apply post-processing effects to the output audio." infer_parser.add_argument( "--post_process", type=lambda x: bool(strtobool(x)), choices=[True, False], help=post_process_description, default=False, required=False, ) reverb_description = "Apply reverb effect to the output audio." infer_parser.add_argument( "--reverb", type=lambda x: bool(strtobool(x)), choices=[True, False], help=reverb_description, default=False, required=False, ) pitch_shift_description = "Apply pitch shifting effect to the output audio." infer_parser.add_argument( "--pitch_shift", type=lambda x: bool(strtobool(x)), choices=[True, False], help=pitch_shift_description, default=False, required=False, ) limiter_description = "Apply limiter effect to the output audio." infer_parser.add_argument( "--limiter", type=lambda x: bool(strtobool(x)), choices=[True, False], help=limiter_description, default=False, required=False, ) gain_description = "Apply gain effect to the output audio." infer_parser.add_argument( "--gain", type=lambda x: bool(strtobool(x)), choices=[True, False], help=gain_description, default=False, required=False, ) distortion_description = "Apply distortion effect to the output audio." infer_parser.add_argument( "--distortion", type=lambda x: bool(strtobool(x)), choices=[True, False], help=distortion_description, default=False, required=False, ) chorus_description = "Apply chorus effect to the output audio." infer_parser.add_argument( "--chorus", type=lambda x: bool(strtobool(x)), choices=[True, False], help=chorus_description, default=False, required=False, ) bitcrush_description = "Apply bitcrush effect to the output audio." infer_parser.add_argument( "--bitcrush", type=lambda x: bool(strtobool(x)), choices=[True, False], help=bitcrush_description, default=False, required=False, ) clipping_description = "Apply clipping effect to the output audio." infer_parser.add_argument( "--clipping", type=lambda x: bool(strtobool(x)), choices=[True, False], help=clipping_description, default=False, required=False, ) compressor_description = "Apply compressor effect to the output audio." infer_parser.add_argument( "--compressor", type=lambda x: bool(strtobool(x)), choices=[True, False], help=compressor_description, default=False, required=False, ) delay_description = "Apply delay effect to the output audio." infer_parser.add_argument( "--delay", type=lambda x: bool(strtobool(x)), choices=[True, False], help=delay_description, default=False, required=False, ) reverb_room_size_description = "Control the room size of the reverb effect. Higher values result in a larger room size." infer_parser.add_argument( "--reverb_room_size", type=float, help=reverb_room_size_description, default=0.5, required=False, ) reverb_damping_description = "Control the damping of the reverb effect. Higher values result in a more damped sound." infer_parser.add_argument( "--reverb_damping", type=float, help=reverb_damping_description, default=0.5, required=False, ) reverb_wet_gain_description = "Control the wet gain of the reverb effect. Higher values result in a stronger reverb effect." infer_parser.add_argument( "--reverb_wet_gain", type=float, help=reverb_wet_gain_description, default=0.5, required=False, ) reverb_dry_gain_description = "Control the dry gain of the reverb effect. Higher values result in a stronger dry signal." infer_parser.add_argument( "--reverb_dry_gain", type=float, help=reverb_dry_gain_description, default=0.5, required=False, ) reverb_width_description = "Control the stereo width of the reverb effect. Higher values result in a wider stereo image." infer_parser.add_argument( "--reverb_width", type=float, help=reverb_width_description, default=0.5, required=False, ) reverb_freeze_mode_description = "Control the freeze mode of the reverb effect. Higher values result in a stronger freeze effect." infer_parser.add_argument( "--reverb_freeze_mode", type=float, help=reverb_freeze_mode_description, default=0.5, required=False, ) pitch_shift_semitones_description = "Control the pitch shift in semitones. Positive values increase the pitch, while negative values decrease it." infer_parser.add_argument( "--pitch_shift_semitones", type=float, help=pitch_shift_semitones_description, default=0.0, required=False, ) limiter_threshold_description = "Control the threshold of the limiter effect. Higher values result in a stronger limiting effect." infer_parser.add_argument( "--limiter_threshold", type=float, help=limiter_threshold_description, default=-6, required=False, ) limiter_release_time_description = "Control the release time of the limiter effect. Higher values result in a longer release time." infer_parser.add_argument( "--limiter_release_time", type=float, help=limiter_release_time_description, default=0.01, required=False, ) gain_db_description = "Control the gain in decibels. Positive values increase the gain, while negative values decrease it." infer_parser.add_argument( "--gain_db", type=float, help=gain_db_description, default=0.0, required=False, ) distortion_gain_description = "Control the gain of the distortion effect. Higher values result in a stronger distortion effect." infer_parser.add_argument( "--distortion_gain", type=float, help=distortion_gain_description, default=25, required=False, ) chorus_rate_description = "Control the rate of the chorus effect. Higher values result in a faster chorus effect." infer_parser.add_argument( "--chorus_rate", type=float, help=chorus_rate_description, default=1.0, required=False, ) chorus_depth_description = "Control the depth of the chorus effect. Higher values result in a stronger chorus effect." infer_parser.add_argument( "--chorus_depth", type=float, help=chorus_depth_description, default=0.25, required=False, ) chorus_center_delay_description = "Control the center delay of the chorus effect. Higher values result in a longer center delay." infer_parser.add_argument( "--chorus_center_delay", type=float, help=chorus_center_delay_description, default=7, required=False, ) chorus_feedback_description = "Control the feedback of the chorus effect. Higher values result in a stronger feedback effect." infer_parser.add_argument( "--chorus_feedback", type=float, help=chorus_feedback_description, default=0.0, required=False, ) chorus_mix_description = "Control the mix of the chorus effect. Higher values result in a stronger chorus effect." infer_parser.add_argument( "--chorus_mix", type=float, help=chorus_mix_description, default=0.5, required=False, ) bitcrush_bit_depth_description = "Control the bit depth of the bitcrush effect. Higher values result in a stronger bitcrush effect." infer_parser.add_argument( "--bitcrush_bit_depth", type=int, help=bitcrush_bit_depth_description, default=8, required=False, ) clipping_threshold_description = "Control the threshold of the clipping effect. Higher values result in a stronger clipping effect." infer_parser.add_argument( "--clipping_threshold", type=float, help=clipping_threshold_description, default=-6, required=False, ) compressor_threshold_description = "Control the threshold of the compressor effect. Higher values result in a stronger compressor effect." infer_parser.add_argument( "--compressor_threshold", type=float, help=compressor_threshold_description, default=0, required=False, ) compressor_ratio_description = "Control the ratio of the compressor effect. Higher values result in a stronger compressor effect." infer_parser.add_argument( "--compressor_ratio", type=float, help=compressor_ratio_description, default=1, required=False, ) compressor_attack_description = "Control the attack of the compressor effect. Higher values result in a stronger compressor effect." infer_parser.add_argument( "--compressor_attack", type=float, help=compressor_attack_description, default=1.0, required=False, ) compressor_release_description = "Control the release of the compressor effect. Higher values result in a stronger compressor effect." infer_parser.add_argument( "--compressor_release", type=float, help=compressor_release_description, default=100, required=False, ) delay_seconds_description = "Control the delay time in seconds. Higher values result in a longer delay time." infer_parser.add_argument( "--delay_seconds", type=float, help=delay_seconds_description, default=0.5, required=False, ) delay_feedback_description = "Control the feedback of the delay effect. Higher values result in a stronger feedback effect." infer_parser.add_argument( "--delay_feedback", type=float, help=delay_feedback_description, default=0.0, required=False, ) delay_mix_description = "Control the mix of the delay effect. Higher values result in a stronger delay effect." infer_parser.add_argument( "--delay_mix", type=float, help=delay_mix_description, default=0.5, required=False, ) # Parser for 'batch_infer' mode batch_infer_parser = subparsers.add_parser( "batch_infer", help="Run batch inference", ) batch_infer_parser.add_argument( "--pitch", type=int, help=pitch_description, choices=range(-24, 25), default=0, ) batch_infer_parser.add_argument( "--filter_radius", type=int, help=filter_radius_description, choices=range(11), default=3, ) batch_infer_parser.add_argument( "--index_rate", type=float, help=index_rate_description, choices=[i / 100.0 for i in range(0, 101)], default=0.3, ) batch_infer_parser.add_argument( "--volume_envelope", type=float, help=volume_envelope_description, choices=[i / 100.0 for i in range(0, 101)], default=1, ) batch_infer_parser.add_argument( "--protect", type=float, help=protect_description, choices=[i / 1000.0 for i in range(0, 501)], default=0.33, ) batch_infer_parser.add_argument( "--hop_length", type=int, help=hop_length_description, choices=range(1, 513), default=128, ) batch_infer_parser.add_argument( "--f0_method", type=str, help=f0_method_description, choices=[ "crepe", "crepe-tiny", "rmvpe", "fcpe", "hybrid[crepe+rmvpe]", "hybrid[crepe+fcpe]", "hybrid[rmvpe+fcpe]", "hybrid[crepe+rmvpe+fcpe]", ], default="rmvpe", ) batch_infer_parser.add_argument( "--input_folder", type=str, help="Path to the folder containing input audio files.", required=True, ) batch_infer_parser.add_argument( "--output_folder", type=str, help="Path to the folder for saving output audio files.", required=True, ) batch_infer_parser.add_argument( "--pth_path", type=str, help=pth_path_description, required=True ) batch_infer_parser.add_argument( "--index_path", type=str, help=index_path_description, required=True ) batch_infer_parser.add_argument( "--split_audio", type=lambda x: bool(strtobool(x)), choices=[True, False], help=split_audio_description, default=False, ) batch_infer_parser.add_argument( "--f0_autotune", type=lambda x: bool(strtobool(x)), choices=[True, False], help=f0_autotune_description, default=False, ) batch_infer_parser.add_argument( "--f0_autotune_strength", type=float, help=clean_strength_description, choices=[(i / 10) for i in range(11)], default=1.0, ) batch_infer_parser.add_argument( "--clean_audio", type=lambda x: bool(strtobool(x)), choices=[True, False], help=clean_audio_description, default=False, ) batch_infer_parser.add_argument( "--clean_strength", type=float, help=clean_strength_description, choices=[(i / 10) for i in range(11)], default=0.7, ) batch_infer_parser.add_argument( "--export_format", type=str, help=export_format_description, choices=["WAV", "MP3", "FLAC", "OGG", "M4A"], default="WAV", ) batch_infer_parser.add_argument( "--embedder_model", type=str, help=embedder_model_description, choices=[ "contentvec", "chinese-hubert-base", "japanese-hubert-base", "korean-hubert-base", "custom", ], default="contentvec", ) batch_infer_parser.add_argument( "--embedder_model_custom", type=str, help=embedder_model_custom_description, default=None, ) batch_infer_parser.add_argument( "--f0_file", type=str, help=f0_file_description, default=None, ) batch_infer_parser.add_argument( "--formant_shifting", type=lambda x: bool(strtobool(x)), choices=[True, False], help=formant_shifting_description, default=False, required=False, ) batch_infer_parser.add_argument( "--formant_qfrency", type=float, help=formant_qfrency_description, default=1.0, required=False, ) batch_infer_parser.add_argument( "--formant_timbre", type=float, help=formant_timbre_description, default=1.0, required=False, ) batch_infer_parser.add_argument( "--sid", type=int, help=sid_description, default=0, required=False, ) batch_infer_parser.add_argument( "--post_process", type=lambda x: bool(strtobool(x)), choices=[True, False], help=post_process_description, default=False, required=False, ) batch_infer_parser.add_argument( "--reverb", type=lambda x: bool(strtobool(x)), choices=[True, False], help=reverb_description, default=False, required=False, ) batch_infer_parser.add_argument( "--pitch_shift", type=lambda x: bool(strtobool(x)), choices=[True, False], help=pitch_shift_description, default=False, required=False, ) batch_infer_parser.add_argument( "--limiter", type=lambda x: bool(strtobool(x)), choices=[True, False], help=limiter_description, default=False, required=False, ) batch_infer_parser.add_argument( "--gain", type=lambda x: bool(strtobool(x)), choices=[True, False], help=gain_description, default=False, required=False, ) batch_infer_parser.add_argument( "--distortion", type=lambda x: bool(strtobool(x)), choices=[True, False], help=distortion_description, default=False, required=False, ) batch_infer_parser.add_argument( "--chorus", type=lambda x: bool(strtobool(x)), choices=[True, False], help=chorus_description, default=False, required=False, ) batch_infer_parser.add_argument( "--bitcrush", type=lambda x: bool(strtobool(x)), choices=[True, False], help=bitcrush_description, default=False, required=False, ) batch_infer_parser.add_argument( "--clipping", type=lambda x: bool(strtobool(x)), choices=[True, False], help=clipping_description, default=False, required=False, ) batch_infer_parser.add_argument( "--compressor", type=lambda x: bool(strtobool(x)), choices=[True, False], help=compressor_description, default=False, required=False, ) batch_infer_parser.add_argument( "--delay", type=lambda x: bool(strtobool(x)), choices=[True, False], help=delay_description, default=False, required=False, ) batch_infer_parser.add_argument( "--reverb_room_size", type=float, help=reverb_room_size_description, default=0.5, required=False, ) batch_infer_parser.add_argument( "--reverb_damping", type=float, help=reverb_damping_description, default=0.5, required=False, ) batch_infer_parser.add_argument( "--reverb_wet_gain", type=float, help=reverb_wet_gain_description, default=0.5, required=False, ) batch_infer_parser.add_argument( "--reverb_dry_gain", type=float, help=reverb_dry_gain_description, default=0.5, required=False, ) batch_infer_parser.add_argument( "--reverb_width", type=float, help=reverb_width_description, default=0.5, required=False, ) batch_infer_parser.add_argument( "--reverb_freeze_mode", type=float, help=reverb_freeze_mode_description, default=0.5, required=False, ) batch_infer_parser.add_argument( "--pitch_shift_semitones", type=float, help=pitch_shift_semitones_description, default=0.0, required=False, ) batch_infer_parser.add_argument( "--limiter_threshold", type=float, help=limiter_threshold_description, default=-6, required=False, ) batch_infer_parser.add_argument( "--limiter_release_time", type=float, help=limiter_release_time_description, default=0.01, required=False, ) batch_infer_parser.add_argument( "--gain_db", type=float, help=gain_db_description, default=0.0, required=False, ) batch_infer_parser.add_argument( "--distortion_gain", type=float, help=distortion_gain_description, default=25, required=False, ) batch_infer_parser.add_argument( "--chorus_rate", type=float, help=chorus_rate_description, default=1.0, required=False, ) batch_infer_parser.add_argument( "--chorus_depth", type=float, help=chorus_depth_description, default=0.25, required=False, ) batch_infer_parser.add_argument( "--chorus_center_delay", type=float, help=chorus_center_delay_description, default=7, required=False, ) batch_infer_parser.add_argument( "--chorus_feedback", type=float, help=chorus_feedback_description, default=0.0, required=False, ) batch_infer_parser.add_argument( "--chorus_mix", type=float, help=chorus_mix_description, default=0.5, required=False, ) batch_infer_parser.add_argument( "--bitcrush_bit_depth", type=int, help=bitcrush_bit_depth_description, default=8, required=False, ) batch_infer_parser.add_argument( "--clipping_threshold", type=float, help=clipping_threshold_description, default=-6, required=False, ) batch_infer_parser.add_argument( "--compressor_threshold", type=float, help=compressor_threshold_description, default=0, required=False, ) batch_infer_parser.add_argument( "--compressor_ratio", type=float, help=compressor_ratio_description, default=1, required=False, ) batch_infer_parser.add_argument( "--compressor_attack", type=float, help=compressor_attack_description, default=1.0, required=False, ) batch_infer_parser.add_argument( "--compressor_release", type=float, help=compressor_release_description, default=100, required=False, ) batch_infer_parser.add_argument( "--delay_seconds", type=float, help=delay_seconds_description, default=0.5, required=False, ) batch_infer_parser.add_argument( "--delay_feedback", type=float, help=delay_feedback_description, default=0.0, required=False, ) batch_infer_parser.add_argument( "--delay_mix", type=float, help=delay_mix_description, default=0.5, required=False, ) # Parser for 'tts' mode tts_parser = subparsers.add_parser("tts", help="Run TTS inference") tts_parser.add_argument( "--tts_file", type=str, help="File with a text to be synthesized", required=True ) tts_parser.add_argument( "--tts_text", type=str, help="Text to be synthesized", required=True ) tts_parser.add_argument( "--tts_voice", type=str, help="Voice to be used for TTS synthesis.", choices=locales, required=True, ) tts_parser.add_argument( "--tts_rate", type=int, help="Control the speaking rate of the TTS. Values range from -100 (slower) to 100 (faster).", choices=range(-100, 101), default=0, ) tts_parser.add_argument( "--pitch", type=int, help=pitch_description, choices=range(-24, 25), default=0, ) tts_parser.add_argument( "--filter_radius", type=int, help=filter_radius_description, choices=range(11), default=3, ) tts_parser.add_argument( "--index_rate", type=float, help=index_rate_description, choices=[(i / 10) for i in range(11)], default=0.3, ) tts_parser.add_argument( "--volume_envelope", type=float, help=volume_envelope_description, choices=[(i / 10) for i in range(11)], default=1, ) tts_parser.add_argument( "--protect", type=float, help=protect_description, choices=[(i / 10) for i in range(6)], default=0.33, ) tts_parser.add_argument( "--hop_length", type=int, help=hop_length_description, choices=range(1, 513), default=128, ) tts_parser.add_argument( "--f0_method", type=str, help=f0_method_description, choices=[ "crepe", "crepe-tiny", "rmvpe", "fcpe", "hybrid[crepe+rmvpe]", "hybrid[crepe+fcpe]", "hybrid[rmvpe+fcpe]", "hybrid[crepe+rmvpe+fcpe]", ], default="rmvpe", ) tts_parser.add_argument( "--output_tts_path", type=str, help="Full path to save the synthesized TTS audio.", required=True, ) tts_parser.add_argument( "--output_rvc_path", type=str, help="Full path to save the voice-converted audio using the synthesized TTS.", required=True, ) tts_parser.add_argument( "--pth_path", type=str, help=pth_path_description, required=True ) tts_parser.add_argument( "--index_path", type=str, help=index_path_description, required=True ) tts_parser.add_argument( "--split_audio", type=lambda x: bool(strtobool(x)), choices=[True, False], help=split_audio_description, default=False, ) tts_parser.add_argument( "--f0_autotune", type=lambda x: bool(strtobool(x)), choices=[True, False], help=f0_autotune_description, default=False, ) tts_parser.add_argument( "--f0_autotune_strength", type=float, help=clean_strength_description, choices=[(i / 10) for i in range(11)], default=1.0, ) tts_parser.add_argument( "--clean_audio", type=lambda x: bool(strtobool(x)), choices=[True, False], help=clean_audio_description, default=False, ) tts_parser.add_argument( "--clean_strength", type=float, help=clean_strength_description, choices=[(i / 10) for i in range(11)], default=0.7, ) tts_parser.add_argument( "--export_format", type=str, help=export_format_description, choices=["WAV", "MP3", "FLAC", "OGG", "M4A"], default="WAV", ) tts_parser.add_argument( "--embedder_model", type=str, help=embedder_model_description, choices=[ "contentvec", "chinese-hubert-base", "japanese-hubert-base", "korean-hubert-base", "custom", ], default="contentvec", ) tts_parser.add_argument( "--embedder_model_custom", type=str, help=embedder_model_custom_description, default=None, ) tts_parser.add_argument( "--f0_file", type=str, help=f0_file_description, default=None, ) # Parser for 'tensorboard' mode subparsers.add_parser( "tensorboard", help="Launch TensorBoard for monitoring training progress." ) # Parser for 'download' mode download_parser = subparsers.add_parser( "download", help="Download a model from a provided link." ) download_parser.add_argument( "--model_link", type=str, help="Direct link to the model file.", required=True ) # Parser for 'prerequisites' mode prerequisites_parser = subparsers.add_parser( "prerequisites", help="Install prerequisites for RVC." ) prerequisites_parser.add_argument( "--pretraineds_v1_f0", type=lambda x: bool(strtobool(x)), choices=[True, False], default=False, help="Download pretrained models for RVC v1.", ) prerequisites_parser.add_argument( "--pretraineds_v2_f0", type=lambda x: bool(strtobool(x)), choices=[True, False], default=True, help="Download pretrained models for RVC v2.", ) prerequisites_parser.add_argument( "--pretraineds_v1_nof0", type=lambda x: bool(strtobool(x)), choices=[True, False], default=False, help="Download non f0 pretrained models for RVC v1.", ) prerequisites_parser.add_argument( "--pretraineds_v2_nof0", type=lambda x: bool(strtobool(x)), choices=[True, False], default=False, help="Download non f0 pretrained models for RVC v2.", ) prerequisites_parser.add_argument( "--models", type=lambda x: bool(strtobool(x)), choices=[True, False], default=True, help="Download additional models.", ) prerequisites_parser.add_argument( "--exe", type=lambda x: bool(strtobool(x)), choices=[True, False], default=True, help="Download required executables.", ) # Parser for 'audio_analyzer' mode audio_analyzer = subparsers.add_parser( "audio_analyzer", help="Analyze an audio file." ) audio_analyzer.add_argument( "--input_path", type=str, help="Path to the input audio file.", required=True ) return parser.parse_args() def main(): if len(sys.argv) == 1: print("Please run the script with '-h' for more information.") sys.exit(1) args = parse_arguments() try: if args.mode == "infer": run_infer_script( pitch=args.pitch, filter_radius=args.filter_radius, index_rate=args.index_rate, volume_envelope=args.volume_envelope, protect=args.protect, hop_length=args.hop_length, f0_method=args.f0_method, input_path=args.input_path, output_path=args.output_path, pth_path=args.pth_path, index_path=args.index_path, split_audio=args.split_audio, f0_autotune=args.f0_autotune, f0_autotune_strength=args.f0_autotune_strength, clean_audio=args.clean_audio, clean_strength=args.clean_strength, export_format=args.export_format, embedder_model=args.embedder_model, embedder_model_custom=args.embedder_model_custom, f0_file=args.f0_file, formant_shifting=args.formant_shifting, formant_qfrency=args.formant_qfrency, formant_timbre=args.formant_timbre, sid=args.sid, post_process=args.post_process, reverb=args.reverb, pitch_shift=args.pitch_shift, limiter=args.limiter, gain=args.gain, distortion=args.distortion, chorus=args.chorus, bitcrush=args.bitcrush, clipping=args.clipping, compressor=args.compressor, delay=args.delay, reverb_room_size=args.reverb_room_size, reverb_damping=args.reverb_damping, reverb_wet_gain=args.reverb_wet_gain, reverb_dry_gain=args.reverb_dry_gain, reverb_width=args.reverb_width, reverb_freeze_mode=args.reverb_freeze_mode, pitch_shift_semitones=args.pitch_shift_semitones, limiter_threshold=args.limiter_threshold, limiter_release_time=args.limiter_release_time, gain_db=args.gain_db, distortion_gain=args.distortion_gain, chorus_rate=args.chorus_rate, chorus_depth=args.chorus_depth, chorus_center_delay=args.chorus_center_delay, chorus_feedback=args.chorus_feedback, chorus_mix=args.chorus_mix, bitcrush_bit_depth=args.bitcrush_bit_depth, clipping_threshold=args.clipping_threshold, compressor_threshold=args.compressor_threshold, compressor_ratio=args.compressor_ratio, compressor_attack=args.compressor_attack, compressor_release=args.compressor_release, delay_seconds=args.delay_seconds, delay_feedback=args.delay_feedback, delay_mix=args.delay_mix, ) elif args.mode == "batch_infer": run_batch_infer_script( pitch=args.pitch, filter_radius=args.filter_radius, index_rate=args.index_rate, volume_envelope=args.volume_envelope, protect=args.protect, hop_length=args.hop_length, f0_method=args.f0_method, input_folder=args.input_folder, output_folder=args.output_folder, pth_path=args.pth_path, index_path=args.index_path, split_audio=args.split_audio, f0_autotune=args.f0_autotune, f0_autotune_strength=args.f0_autotune_strength, clean_audio=args.clean_audio, clean_strength=args.clean_strength, export_format=args.export_format, embedder_model=args.embedder_model, embedder_model_custom=args.embedder_model_custom, f0_file=args.f0_file, formant_shifting=args.formant_shifting, formant_qfrency=args.formant_qfrency, formant_timbre=args.formant_timbre, sid=args.sid, post_process=args.post_process, reverb=args.reverb, pitch_shift=args.pitch_shift, limiter=args.limiter, gain=args.gain, distortion=args.distortion, chorus=args.chorus, bitcrush=args.bitcrush, clipping=args.clipping, compressor=args.compressor, delay=args.delay, reverb_room_size=args.reverb_room_size, reverb_damping=args.reverb_damping, reverb_wet_gain=args.reverb_wet_gain, reverb_dry_gain=args.reverb_dry_gain, reverb_width=args.reverb_width, reverb_freeze_mode=args.reverb_freeze_mode, pitch_shift_semitones=args.pitch_shift_semitones, limiter_threshold=args.limiter_threshold, limiter_release_time=args.limiter_release_time, gain_db=args.gain_db, distortion_gain=args.distortion_gain, chorus_rate=args.chorus_rate, chorus_depth=args.chorus_depth, chorus_center_delay=args.chorus_center_delay, chorus_feedback=args.chorus_feedback, chorus_mix=args.chorus_mix, bitcrush_bit_depth=args.bitcrush_bit_depth, clipping_threshold=args.clipping_threshold, compressor_threshold=args.compressor_threshold, compressor_ratio=args.compressor_ratio, compressor_attack=args.compressor_attack, compressor_release=args.compressor_release, delay_seconds=args.delay_seconds, delay_feedback=args.delay_feedback, delay_mix=args.delay_mix, ) elif args.mode == "tts": run_tts_script( tts_file=args.tts_file, tts_text=args.tts_text, tts_voice=args.tts_voice, tts_rate=args.tts_rate, pitch=args.pitch, filter_radius=args.filter_radius, index_rate=args.index_rate, volume_envelope=args.volume_envelope, protect=args.protect, hop_length=args.hop_length, f0_method=args.f0_method, output_tts_path=args.output_tts_path, output_rvc_path=args.output_rvc_path, pth_path=args.pth_path, index_path=args.index_path, split_audio=args.split_audio, f0_autotune=args.f0_autotune, f0_autotune_strength=args.f0_autotune_strength, clean_audio=args.clean_audio, clean_strength=args.clean_strength, export_format=args.export_format, embedder_model=args.embedder_model, embedder_model_custom=args.embedder_model_custom, f0_file=args.f0_file, ) elif args.mode == "tensorboard": run_tensorboard_script() elif args.mode == "download": run_download_script( model_link=args.model_link, ) elif args.mode == "prerequisites": run_prerequisites_script( pretraineds_v1_f0=args.pretraineds_v1_f0, pretraineds_v1_nof0=args.pretraineds_v1_nof0, pretraineds_v2_f0=args.pretraineds_v2_f0, pretraineds_v2_nof0=args.pretraineds_v2_nof0, models=args.models, exe=args.exe, ) elif args.mode == "audio_analyzer": run_audio_analyzer_script( input_path=args.input_path, ) except Exception as error: print(f"An error occurred during execution: {error}") import traceback traceback.print_exc() if __name__ == "__main__": main()