Commit: dropdown>radio, t-start percentage, intro text change

app.py (CHANGED)
@@ -20,7 +20,7 @@ LDM2_LARGE = "cvssp/audioldm2-large"
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 ldm2 = load_model(model_id=LDM2, device=device)
 ldm2_large = load_model(model_id=LDM2_LARGE, device=device)
-ldm2_music = load_model(model_id=
+ldm2_music = load_model(model_id=MUSIC, device=device)


 def randomize_seed_fn(seed, randomize_seed):
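Note on this hunk: all three AudioLDM2 checkpoints are loaded once at startup, so the Radio control added further down can switch models without reloading weights mid-request. As a rough illustration only, a helper like load_model could be a thin wrapper over diffusers' AudioLDM2Pipeline; the Space's actual load_model may build a custom inversion-ready pipeline instead:

    import torch
    from diffusers import AudioLDM2Pipeline

    def load_model(model_id: str, device: torch.device):
        # Hypothetical sketch of the loader used above; not this repo's code.
        pipe = AudioLDM2Pipeline.from_pretrained(model_id)
        return pipe.to(device)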
@@ -46,7 +46,6 @@ def invert(ldm_stable, x0, prompt_src, num_diffusion_steps, cfg_scale_src): # ,
     return zs, wts


-
 def sample(ldm_stable, zs, wts, steps, prompt_tar, tstart, cfg_scale_tar): # , ldm_stable):
     # reverse process (via Zs and wT)
     tstart = torch.tensor(tstart, dtype=torch.int)
@@ -71,14 +70,16 @@ def sample(ldm_stable, zs, wts, steps, prompt_tar, tstart, cfg_scale_tar): # ,

     return f.name

-def change_tstart_range(t_start, steps):
-    maximum = int(0.8 * steps)
-    minimum = int(0.15 * steps)
-    if t_start > maximum:
-        t_start = maximum
-    elif t_start < minimum:
-        t_start = minimum
-    return t_start
+
+# def change_tstart_range(t_start, steps):
+#     maximum = int(0.8 * steps)
+#     minimum = int(0.15 * steps)
+#     if t_start > maximum:
+#         t_start = maximum
+#     elif t_start < minimum:
+#         t_start = minimum
+#     return t_start
+

 def edit(input_audio,
          model_id: str,
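Note on this hunk: the retired change_tstart_range helper clamped an absolute T-start step count into [0.15*steps, 0.8*steps]. After this commit the bound is enforced in the UI instead: T-start is a percentage slider limited to 15-85 and converted to steps at call time (see the edit() hunk below). A small illustrative comparison, assuming steps=200:

    steps = 200
    # Old: clamp an absolute step index into [30, 160].
    minimum, maximum = int(0.15 * steps), int(0.8 * steps)
    t_start_old = min(max(25, minimum), maximum)    # 25 is clamped up to 30
    # New: the slider is already bounded to 15-85 percent, so the
    # conversion used after this commit always yields a value in [30, 170].
    t_start_new = int(45 / 100 * steps)             # default 45% -> step 90

(The new upper bound, 85%, is slightly looser than the old 0.8*steps clamp.)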
@@ -89,7 +90,7 @@ def edit(input_audio,
          steps=200,
          cfg_scale_src=3.5,
          cfg_scale_tar=12,
-         t_start=
+         t_start=45,
          randomize_seed=True):

     # global ldm_stable, current_loaded_model
@@ -104,10 +105,8 @@ def edit(input_audio,
         ldm_stable = ldm2
     elif model_id == LDM2_LARGE:
         ldm_stable = ldm2_large
-    else:
+    else: # MUSIC
         ldm_stable = ldm2_music
-
-

     # If the inversion was done for a different model, we need to re-run the inversion
     if not do_inversion and (saved_inv_model is None or saved_inv_model != model_id):
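Side note: this if/elif/else dispatch relies on every checkpoint having been preloaded at the top of the file. A dictionary lookup would express the same selection more compactly; a hypothetical equivalent, reusing this file's names:

    # ldm2, ldm2_large and ldm2_music are the pipelines loaded at startup.
    pipelines = {LDM2: ldm2, LDM2_LARGE: ldm2_large, MUSIC: ldm2_music}
    ldm_stable = pipelines[model_id]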
@@ -123,25 +122,22 @@ def edit(input_audio,
        zs = gr.State(value=zs_tensor)
        saved_inv_model = model_id
        do_inversion = False
-
+
    # make sure t_start is in the right limit
-    t_start = change_tstart_range(t_start, steps)
+    # t_start = change_tstart_range(t_start, steps)

-    output = sample(ldm_stable, zs.value, wts.value, steps, prompt_tar=target_prompt,
-                    cfg_scale_tar=cfg_scale_tar)
+    output = sample(ldm_stable, zs.value, wts.value, steps, prompt_tar=target_prompt,
+                    tstart=int(t_start / 100 * steps), cfg_scale_tar=cfg_scale_tar)

    return output, wts, zs, saved_inv_model, do_inversion


-
-
-
 def get_example():
     case = [
         ['Examples/Beethoven.wav',
          '',
          'A recording of an arcade game soundtrack.',
-
+         45,
          'cvssp/audioldm2-music',
          '27s',
          'Examples/Beethoven_arcade.wav',
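The functional core of this commit is in the hunk above: t_start now arrives from the UI as a percentage and is converted to an absolute diffusion step index when sample() is called. A worked example with this file's defaults:

    steps, t_start = 200, 45              # defaults in this diff
    tstart = int(t_start / 100 * steps)   # 45% of 200 steps -> 90
    # A higher percentage starts the reverse (editing) process from a
    # noisier timestep, which is why the slider's info text says a higher
    # T-start gives a stronger edit.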
@@ -149,7 +145,7 @@ def get_example():
         ['Examples/Beethoven.wav',
          'A high quality recording of wind instruments and strings playing.',
          'A high quality recording of a piano playing.',
-
+         45,
          'cvssp/audioldm2-music',
          '27s',
          'Examples/Beethoven_piano.wav',
@@ -157,14 +153,14 @@ def get_example():
         ['Examples/ModalJazz.wav',
          'Trumpets playing alongside a piano, bass and drums in an upbeat old-timey cool jazz song.',
          'A banjo playing alongside a piano, bass and drums in an upbeat old-timey cool country song.',
-
+         45,
          'cvssp/audioldm2-music',
          '106s',
          'Examples/ModalJazz_banjo.wav',],
         ['Examples/Cat.wav',
          '',
          'A dog barking.',
-
+         75,
          'cvssp/audioldm2-large',
          '10s',
          'Examples/Cat_dog.wav',]
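Reading the updated examples: the three music-to-music edits use the default 45% T-start, while the cat-to-dog edit uses 75%, consistent with the slider guidance that a higher T-start produces a stronger, less source-faithful edit.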
@@ -173,15 +169,15 @@ def get_example():


 intro = """
-<h1 style="font-weight: 1400; text-align: center; margin-bottom: 7px;">
-<h2 style="font-weight: 1400; text-align: center; margin-bottom: 7px;"> Audio
+<h1 style="font-weight: 1400; text-align: center; margin-bottom: 7px;"> ZETA Editing 🎧 </h1>
+<h2 style="font-weight: 1400; text-align: center; margin-bottom: 7px;"> Zero-Shot Text-Based Audio Editing Using DDPM Inversion 🎛️ </h2>
 <h3 style="margin-bottom: 10px; text-align: center;">
 <a href="https://arxiv.org/abs/2402.10009">[Paper]</a> |
 <a href="https://hilamanor.github.io/AudioEditing/">[Project page]</a> |
 <a href="https://github.com/HilaManor/AudioEditingCode">[Code]</a>
 </h3>
 <p style="font-size:large">
-Demo for the method introduced in:
+Demo for the text-based editing method introduced in:
 <b <a href="https://arxiv.org/abs/2402.10009" style="text-decoration: underline;" target="_blank"> Zero-Shot Unsupervised and Text-Based Audio Editing Using DDPM Inversion </a> </b>
 </p>
 <p style="font-size:larger">
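One pre-existing quirk left untouched by this commit: the context line beginning <b <a href=...> is malformed HTML (the <b tag is never closed with >), though browsers generally recover from it.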
@@ -228,22 +224,24 @@ with gr.Blocks(css='style.css') as demo:
            output_audio = gr.Audio(label="Edited Audio", interactive=False, scale=1)

        with gr.Row():
-
-
-
+            tar_prompt = gr.Textbox(label="Prompt", info="Describe your desired edited output", placeholder="a recording of a happy upbeat arcade game soundtrack",
+                                    lines=2, interactive=True)
+
+        with gr.Row():
+            t_start = gr.Slider(minimum=15, maximum=85, value=45, step=1, label="T-start (%)", interactive=True, scale=3,
+                                info="Higher T-start -> stronger edit. Lower T-start -> closer to original audio.")
+            # model_id = gr.Dropdown(label="AudioLDM2 Version",
+            model_id = gr.Radio(label="AudioLDM2 Version",
+                                choices=["cvssp/audioldm2",
+                                         "cvssp/audioldm2-large",
+                                         "cvssp/audioldm2-music"],
+                                info="Choose a checkpoint suitable for your intended audio and edit",
+                                value="cvssp/audioldm2-music", interactive=True, type="value", scale=2)

        with gr.Row():
            with gr.Column():
                submit = gr.Button("Edit")

-        with gr.Row():
-            t_start = gr.Slider(minimum=10, maximum=240, value=30, step=1, label="T-start", interactive=True, scale=3,
-                                info="Higher T-start -> stronger edit. Lower T-start -> closer to original audio")
-            model_id = gr.Dropdown(label="AudioLDM2 Version", choices=["cvssp/audioldm2",
-                                                                       "cvssp/audioldm2-large",
-                                                                       "cvssp/audioldm2-music"],
-                                   info="Choose a checkpoint suitable for your intended audio and edit",
-                                   value="cvssp/audioldm2-music", interactive=True, type="value", scale=2)
        with gr.Accordion("More Options", open=False):
            with gr.Row():
                src_prompt = gr.Textbox(label="Source Prompt", lines=2, interactive=True, info= "Optional: Describe the original audio input",
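Finally, to see how the swapped-in controls behave end to end, here is a minimal self-contained Gradio sketch (not this Space's code) wiring a Radio and a percentage Slider into a callback the way app.py now does:

    import gradio as gr

    def edit_stub(model_id, t_start):
        # Stand-in for edit(): just echoes what the real callback receives.
        return f"model={model_id}, t_start={t_start}% of the diffusion steps"

    with gr.Blocks() as demo:
        model_id = gr.Radio(label="AudioLDM2 Version",
                            choices=["cvssp/audioldm2",
                                     "cvssp/audioldm2-large",
                                     "cvssp/audioldm2-music"],
                            value="cvssp/audioldm2-music")
        t_start = gr.Slider(minimum=15, maximum=85, value=45, step=1, label="T-start (%)")
        out = gr.Textbox(label="Result")
        gr.Button("Edit").click(fn=edit_stub, inputs=[model_id, t_start], outputs=out)

    demo.launch()

A Radio makes all three checkpoints visible at once, which suits a three-way choice better than a Dropdown that hides two options behind a click.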