# Will be fixed soon, but meanwhile: import os if os.getenv('SPACES_ZERO_GPU') == "true": os.environ['SPACES_ZERO_GPU'] = "1" import gradio as gr import random import torch import os from torch import inference_mode from typing import Optional, List import numpy as np from models import load_model import utils import spaces import huggingface_hub from inversion_utils import inversion_forward_process, inversion_reverse_process LDM2 = "cvssp/audioldm2" MUSIC = "cvssp/audioldm2-music" LDM2_LARGE = "cvssp/audioldm2-large" STABLEAUD = "chaowenguo/stable-audio-open-1.0" device = torch.device("cuda" if torch.cuda.is_available() else "cpu") ldm2 = load_model(model_id=LDM2, device=device) ldm2_large = load_model(model_id=LDM2_LARGE, device=device) ldm2_music = load_model(model_id=MUSIC, device=device) ldm_stableaud = load_model(model_id=STABLEAUD, device=device, token=os.getenv('PRIV_TOKEN')) def randomize_seed_fn(seed, randomize_seed): if randomize_seed: seed = random.randint(0, np.iinfo(np.int32).max) torch.manual_seed(seed) return seed def invert(ldm_stable, x0, prompt_src, num_diffusion_steps, cfg_scale_src, duration, save_compute): # ldm_stable.model.scheduler.set_timesteps(num_diffusion_steps, device=device) with inference_mode(): w0 = ldm_stable.vae_encode(x0) # find Zs and wts - forward process _, zs, wts, extra_info = inversion_forward_process(ldm_stable, w0, etas=1, prompts=[prompt_src], cfg_scales=[cfg_scale_src], num_inference_steps=num_diffusion_steps, numerical_fix=True, duration=duration, save_compute=save_compute) return zs, wts, extra_info def sample(ldm_stable, zs, wts, extra_info, prompt_tar, tstart, cfg_scale_tar, duration, save_compute): # reverse process (via Zs and wT) tstart = torch.tensor(tstart, dtype=torch.int) w0, _ = inversion_reverse_process(ldm_stable, xT=wts, tstart=tstart, etas=1., prompts=[prompt_tar], neg_prompts=[""], cfg_scales=[cfg_scale_tar], zs=zs[:int(tstart)], duration=duration, extra_info=extra_info, save_compute=save_compute) # vae decode image with inference_mode(): x0_dec = ldm_stable.vae_decode(w0) if 'stable-audio' not in ldm_stable.model_id: if x0_dec.dim() < 4: x0_dec = x0_dec[None, :, :, :] with torch.no_grad(): audio = ldm_stable.decode_to_mel(x0_dec) else: audio = x0_dec.squeeze(0).T return (ldm_stable.get_sr(), audio.squeeze().cpu().numpy()) def get_duration(input_audio, model_id: str, do_inversion: bool, wts: Optional[torch.Tensor], zs: Optional[torch.Tensor], extra_info: Optional[List], saved_inv_model: str, source_prompt: str = "", target_prompt: str = "", steps: int = 200, cfg_scale_src: float = 3.5, cfg_scale_tar: float = 12, t_start: int = 45, randomize_seed: bool = True, save_compute: bool = True, oauth_token: Optional[gr.OAuthToken] = None): if model_id == LDM2: factor = 1 elif model_id == LDM2_LARGE: factor = 2.5 elif model_id == STABLEAUD: factor = 3.2 else: # MUSIC factor = 1 forwards = 0 if do_inversion or randomize_seed: forwards = steps if source_prompt == "" else steps * 2 # x2 when there is a prompt text forwards += int(t_start / 100 * steps) * 2 duration = min(utils.get_duration(input_audio), utils.MAX_DURATION) time_for_maxlength = factor * forwards * 0.15 # 0.25 is the time per forward pass if model_id != STABLEAUD: time_for_maxlength = time_for_maxlength / utils.MAX_DURATION * duration print('expected time:', time_for_maxlength) spare_time = 5 return max(10, time_for_maxlength + spare_time) def verify_model_params(model_id: str, input_audio, src_prompt: str, tar_prompt: str, cfg_scale_src: float, oauth_token: gr.OAuthToken | None): if input_audio is None: raise gr.Error('Input audio missing!') if tar_prompt == "": raise gr.Error("Please provide a target prompt to edit the audio.") if src_prompt != "": if model_id == STABLEAUD and cfg_scale_src != 1: gr.Info("Consider using Source Guidance Scale=1 for Stable Audio Open 1.0.") elif model_id != STABLEAUD and cfg_scale_src != 3: gr.Info(f"Consider using Source Guidance Scale=3 for {model_id}.") if model_id == STABLEAUD: if oauth_token is None: raise gr.Error("You must be logged in to use Stable Audio Open 1.0. Please log in and try again.") try: huggingface_hub.get_hf_file_metadata(huggingface_hub.hf_hub_url(STABLEAUD, 'transformer/config.json'), token=oauth_token.token) print('Has Access') # except huggingface_hub.utils._errors.GatedRepoError: except huggingface_hub.errors.GatedRepoError: raise gr.Error("You need to accept the license agreement to use Stable Audio Open 1.0. " "Visit the " "model page to get access.") @spaces.GPU(duration=get_duration) def edit(input_audio, model_id: str, do_inversion: bool, wts: Optional[torch.Tensor], zs: Optional[torch.Tensor], extra_info: Optional[List], saved_inv_model: str, source_prompt: str = "", target_prompt: str = "", steps: int = 200, cfg_scale_src: float = 3.5, cfg_scale_tar: float = 12, t_start: int = 45, randomize_seed: bool = True, save_compute: bool = True, oauth_token: Optional[gr.OAuthToken] = None): print(model_id) if model_id == LDM2: ldm_stable = ldm2 elif model_id == LDM2_LARGE: ldm_stable = ldm2_large elif model_id == STABLEAUD: ldm_stable = ldm_stableaud else: # MUSIC ldm_stable = ldm2_music ldm_stable.model.scheduler.set_timesteps(steps, device=device) # If the inversion was done for a different model, we need to re-run the inversion if not do_inversion and (saved_inv_model is None or saved_inv_model != model_id): do_inversion = True if input_audio is None: raise gr.Error('Input audio missing!') x0, _, duration = utils.load_audio(input_audio, ldm_stable.get_fn_STFT(), device=device, stft=('stable-audio' not in ldm_stable.model_id), model_sr=ldm_stable.get_sr()) if wts is None or zs is None: do_inversion = True if do_inversion or randomize_seed: # always re-run inversion zs_tensor, wts_tensor, extra_info_list = invert(ldm_stable=ldm_stable, x0=x0, prompt_src=source_prompt, num_diffusion_steps=steps, cfg_scale_src=cfg_scale_src, duration=duration, save_compute=save_compute) wts = wts_tensor zs = zs_tensor extra_info = extra_info_list saved_inv_model = model_id do_inversion = False else: wts_tensor = wts.to(device) zs_tensor = zs.to(device) extra_info_list = [e.to(device) for e in extra_info if e is not None] output = sample(ldm_stable, zs_tensor, wts_tensor, extra_info_list, prompt_tar=target_prompt, tstart=int(t_start / 100 * steps), cfg_scale_tar=cfg_scale_tar, duration=duration, save_compute=save_compute) return output, wts.cpu(), zs.cpu(), [e.cpu() for e in extra_info if e is not None], saved_inv_model, do_inversion # return output, wtszs_file, saved_inv_model, do_inversion def get_example(): case = [ ['Examples/Beethoven.mp3', '', 'A recording of an arcade game soundtrack.', 45, 'cvssp/audioldm2-music', '27s', 'Examples/Beethoven_arcade.mp3', ], ['Examples/Beethoven.mp3', 'A high quality recording of wind instruments and strings playing.', 'A high quality recording of a piano playing.', 45, 'cvssp/audioldm2-music', '27s', 'Examples/Beethoven_piano.mp3', ], ['Examples/Beethoven.mp3', '', 'Heavy Rock.', 40, 'stabilityai/stable-audio-open-1.0', '27s', 'Examples/Beethoven_rock.mp3', ], ['Examples/ModalJazz.mp3', 'Trumpets playing alongside a piano, bass and drums in an upbeat old-timey cool jazz song.', 'A banjo playing alongside a piano, bass and drums in an upbeat old-timey cool country song.', 45, 'cvssp/audioldm2-music', '106s', 'Examples/ModalJazz_banjo.mp3',], ['Examples/Shadows.mp3', '', '8-bit arcade game soundtrack.', 40, 'stabilityai/stable-audio-open-1.0', '34s', 'Examples/Shadows_arcade.mp3',], ['Examples/Cat.mp3', '', 'A dog barking.', 75, 'cvssp/audioldm2-large', '10s', 'Examples/Cat_dog.mp3',] ] return case intro = """
For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings.
NEW - 15.10.24: You can now edit using Stable Audio Open 1.0. You must be logged in after accepting the license agreement to use it.
T-start=40%
.Source Guidance Scale=1
,
and you can try fewer timesteps (even 20!).
NEW - 15.10.24: Parallel editing is enabled by default.
To disable, uncheck Efficient editing
under "More Options".
Saves a bit of time.
MAX_DURATION
parameter
inside utils.py
to None
.