hilamanor committed on
Commit
4397c18
•
1 Parent(s): 53217a2

dropdown>radio, t-start percentage, intro text change

Browse files
Files changed (1) hide show
  1. app.py +37 -39
app.py CHANGED
@@ -20,7 +20,7 @@ LDM2_LARGE = "cvssp/audioldm2-large"
20
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
21
  ldm2 = load_model(model_id=LDM2, device=device)
22
  ldm2_large = load_model(model_id=LDM2_LARGE, device=device)
23
- ldm2_music = load_model(model_id= MUSIC, device=device)
24
 
25
 
26
  def randomize_seed_fn(seed, randomize_seed):
@@ -46,7 +46,6 @@ def invert(ldm_stable, x0, prompt_src, num_diffusion_steps, cfg_scale_src): # ,
46
  return zs, wts
47
 
48
 
49
-
50
  def sample(ldm_stable, zs, wts, steps, prompt_tar, tstart, cfg_scale_tar): # , ldm_stable):
51
  # reverse process (via Zs and wT)
52
  tstart = torch.tensor(tstart, dtype=torch.int)
@@ -71,14 +70,16 @@ def sample(ldm_stable, zs, wts, steps, prompt_tar, tstart, cfg_scale_tar): # ,
71
 
72
  return f.name
73
 
74
- def change_tstart_range(t_start, steps):
75
- maximum = int(0.8 * steps)
76
- minimum = int(0.15 * steps)
77
- if t_start > maximum:
78
- t_start = maximum
79
- elif t_start < minimum:
80
- t_start = minimum
81
- return t_start
 
 
82
 
83
  def edit(input_audio,
84
  model_id: str,
@@ -89,7 +90,7 @@ def edit(input_audio,
89
  steps=200,
90
  cfg_scale_src=3.5,
91
  cfg_scale_tar=12,
92
- t_start=90,
93
  randomize_seed=True):
94
 
95
  # global ldm_stable, current_loaded_model
@@ -104,10 +105,8 @@ def edit(input_audio,
104
  ldm_stable = ldm2
105
  elif model_id == LDM2_LARGE:
106
  ldm_stable = ldm2_large
107
- else: # MUSIC
108
  ldm_stable = ldm2_music
109
-
110
-
111
 
112
  # If the inversion was done for a different model, we need to re-run the inversion
113
  if not do_inversion and (saved_inv_model is None or saved_inv_model != model_id):
@@ -123,25 +122,22 @@ def edit(input_audio,
123
  zs = gr.State(value=zs_tensor)
124
  saved_inv_model = model_id
125
  do_inversion = False
126
-
127
  # make sure t_start is in the right limit
128
- t_start = change_tstart_range(t_start, steps)
129
 
130
- output = sample(ldm_stable, zs.value, wts.value, steps, prompt_tar=target_prompt, tstart=t_start,
131
- cfg_scale_tar=cfg_scale_tar)
132
 
133
  return output, wts, zs, saved_inv_model, do_inversion
134
 
135
 
136
-
137
-
138
-
139
  def get_example():
140
  case = [
141
  ['Examples/Beethoven.wav',
142
  '',
143
  'A recording of an arcade game soundtrack.',
144
- 90,
145
  'cvssp/audioldm2-music',
146
  '27s',
147
  'Examples/Beethoven_arcade.wav',
@@ -149,7 +145,7 @@ def get_example():
149
  ['Examples/Beethoven.wav',
150
  'A high quality recording of wind instruments and strings playing.',
151
  'A high quality recording of a piano playing.',
152
- 90,
153
  'cvssp/audioldm2-music',
154
  '27s',
155
  'Examples/Beethoven_piano.wav',
@@ -157,14 +153,14 @@ def get_example():
157
  ['Examples/ModalJazz.wav',
158
  'Trumpets playing alongside a piano, bass and drums in an upbeat old-timey cool jazz song.',
159
  'A banjo playing alongside a piano, bass and drums in an upbeat old-timey cool country song.',
160
- 90,
161
  'cvssp/audioldm2-music',
162
  '106s',
163
  'Examples/ModalJazz_banjo.wav',],
164
  ['Examples/Cat.wav',
165
  '',
166
  'A dog barking.',
167
- 150,
168
  'cvssp/audioldm2-large',
169
  '10s',
170
  'Examples/Cat_dog.wav',]
@@ -173,15 +169,15 @@ def get_example():
173
 
174
 
175
  intro = """
176
- <h1 style="font-weight: 1400; text-align: center; margin-bottom: 7px;"> AUDI 🎧 </h1>
177
- <h2 style="font-weight: 1400; text-align: center; margin-bottom: 7px;"> Audio editing Using DDPM Inversion 🎛️ </h2>
178
  <h3 style="margin-bottom: 10px; text-align: center;">
179
  <a href="https://arxiv.org/abs/2402.10009">[Paper]</a>&nbsp;|&nbsp;
180
  <a href="https://hilamanor.github.io/AudioEditing/">[Project page]</a>&nbsp;|&nbsp;
181
  <a href="https://github.com/HilaManor/AudioEditingCode">[Code]</a>
182
  </h3>
183
  <p style="font-size:large">
184
- Demo for the method introduced in:
185
  <b <a href="https://arxiv.org/abs/2402.10009" style="text-decoration: underline;" target="_blank"> Zero-Shot Unsupervised and Text-Based Audio Editing Using DDPM Inversion </a> </b>
186
  </p>
187
  <p style="font-size:larger">
@@ -228,22 +224,24 @@ with gr.Blocks(css='style.css') as demo:
228
  output_audio = gr.Audio(label="Edited Audio", interactive=False, scale=1)
229
 
230
  with gr.Row():
231
- tar_prompt = gr.Textbox(label="Prompt", info="Describe your desired edited output", placeholder="a recording of a happy upbeat arcade game soundtrack",
232
- lines=2, interactive=True)
233
-
 
 
 
 
 
 
 
 
 
 
234
 
235
  with gr.Row():
236
  with gr.Column():
237
  submit = gr.Button("Edit")
238
 
239
- with gr.Row():
240
- t_start = gr.Slider(minimum=10, maximum=240, value=30, step=1, label="T-start", interactive=True, scale=3,
241
- info="Higher T-start -> stronger edit. Lower T-start -> closer to original audio")
242
- model_id = gr.Dropdown(label="AudioLDM2 Version", choices=["cvssp/audioldm2",
243
- "cvssp/audioldm2-large",
244
- "cvssp/audioldm2-music"],
245
- info="Choose a checkpoint suitable for your intended audio and edit",
246
- value="cvssp/audioldm2-music", interactive=True, type="value", scale=2)
247
  with gr.Accordion("More Options", open=False):
248
  with gr.Row():
249
  src_prompt = gr.Textbox(label="Source Prompt", lines=2, interactive=True, info= "Optional: Describe the original audio input",
 
20
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
21
  ldm2 = load_model(model_id=LDM2, device=device)
22
  ldm2_large = load_model(model_id=LDM2_LARGE, device=device)
23
+ ldm2_music = load_model(model_id=MUSIC, device=device)
24
 
25
 
26
  def randomize_seed_fn(seed, randomize_seed):
 
46
  return zs, wts
47
 
48
 
 
49
  def sample(ldm_stable, zs, wts, steps, prompt_tar, tstart, cfg_scale_tar): # , ldm_stable):
50
  # reverse process (via Zs and wT)
51
  tstart = torch.tensor(tstart, dtype=torch.int)
 
70
 
71
  return f.name
72
 
73
+
74
+ # def change_tstart_range(t_start, steps):
75
+ # maximum = int(0.8 * steps)
76
+ # minimum = int(0.15 * steps)
77
+ # if t_start > maximum:
78
+ # t_start = maximum
79
+ # elif t_start < minimum:
80
+ # t_start = minimum
81
+ # return t_start
82
+
83
 
84
  def edit(input_audio,
85
  model_id: str,
 
90
  steps=200,
91
  cfg_scale_src=3.5,
92
  cfg_scale_tar=12,
93
+ t_start=45,
94
  randomize_seed=True):
95
 
96
  # global ldm_stable, current_loaded_model
 
105
  ldm_stable = ldm2
106
  elif model_id == LDM2_LARGE:
107
  ldm_stable = ldm2_large
108
+ else: # MUSIC
109
  ldm_stable = ldm2_music
 
 
110
 
111
  # If the inversion was done for a different model, we need to re-run the inversion
112
  if not do_inversion and (saved_inv_model is None or saved_inv_model != model_id):
 
122
  zs = gr.State(value=zs_tensor)
123
  saved_inv_model = model_id
124
  do_inversion = False
125
+
126
  # make sure t_start is in the right limit
127
+ # t_start = change_tstart_range(t_start, steps)
128
 
129
+ output = sample(ldm_stable, zs.value, wts.value, steps, prompt_tar=target_prompt,
130
+ tstart=int(t_start / 100 * steps), cfg_scale_tar=cfg_scale_tar)
131
 
132
  return output, wts, zs, saved_inv_model, do_inversion
133
 
134
 
 
 
 
135
  def get_example():
136
  case = [
137
  ['Examples/Beethoven.wav',
138
  '',
139
  'A recording of an arcade game soundtrack.',
140
+ 45,
141
  'cvssp/audioldm2-music',
142
  '27s',
143
  'Examples/Beethoven_arcade.wav',
 
145
  ['Examples/Beethoven.wav',
146
  'A high quality recording of wind instruments and strings playing.',
147
  'A high quality recording of a piano playing.',
148
+ 45,
149
  'cvssp/audioldm2-music',
150
  '27s',
151
  'Examples/Beethoven_piano.wav',
 
153
  ['Examples/ModalJazz.wav',
154
  'Trumpets playing alongside a piano, bass and drums in an upbeat old-timey cool jazz song.',
155
  'A banjo playing alongside a piano, bass and drums in an upbeat old-timey cool country song.',
156
+ 45,
157
  'cvssp/audioldm2-music',
158
  '106s',
159
  'Examples/ModalJazz_banjo.wav',],
160
  ['Examples/Cat.wav',
161
  '',
162
  'A dog barking.',
163
+ 75,
164
  'cvssp/audioldm2-large',
165
  '10s',
166
  'Examples/Cat_dog.wav',]
 
169
 
170
 
171
  intro = """
172
+ <h1 style="font-weight: 1400; text-align: center; margin-bottom: 7px;"> ZETA Editing 🎧 </h1>
173
+ <h2 style="font-weight: 1400; text-align: center; margin-bottom: 7px;"> Zero-Shot Text-Based Audio Editing Using DDPM Inversion 🎛️ </h2>
174
  <h3 style="margin-bottom: 10px; text-align: center;">
175
  <a href="https://arxiv.org/abs/2402.10009">[Paper]</a>&nbsp;|&nbsp;
176
  <a href="https://hilamanor.github.io/AudioEditing/">[Project page]</a>&nbsp;|&nbsp;
177
  <a href="https://github.com/HilaManor/AudioEditingCode">[Code]</a>
178
  </h3>
179
  <p style="font-size:large">
180
+ Demo for the text-based editing method introduced in:
181
  <b <a href="https://arxiv.org/abs/2402.10009" style="text-decoration: underline;" target="_blank"> Zero-Shot Unsupervised and Text-Based Audio Editing Using DDPM Inversion </a> </b>
182
  </p>
183
  <p style="font-size:larger">
 
224
  output_audio = gr.Audio(label="Edited Audio", interactive=False, scale=1)
225
 
226
  with gr.Row():
227
+ tar_prompt = gr.Textbox(label="Prompt", info="Describe your desired edited output", placeholder="a recording of a happy upbeat arcade game soundtrack",
228
+ lines=2, interactive=True)
229
+
230
+ with gr.Row():
231
+ t_start = gr.Slider(minimum=15, maximum=85, value=45, step=1, label="T-start (%)", interactive=True, scale=3,
232
+ info="Higher T-start -> stronger edit. Lower T-start -> closer to original audio.")
233
+ # model_id = gr.Dropdown(label="AudioLDM2 Version",
234
+ model_id = gr.Radio(label="AudioLDM2 Version",
235
+ choices=["cvssp/audioldm2",
236
+ "cvssp/audioldm2-large",
237
+ "cvssp/audioldm2-music"],
238
+ info="Choose a checkpoint suitable for your intended audio and edit",
239
+ value="cvssp/audioldm2-music", interactive=True, type="value", scale=2)
240
 
241
  with gr.Row():
242
  with gr.Column():
243
  submit = gr.Button("Edit")
244
 
 
 
 
 
 
 
 
 
245
  with gr.Accordion("More Options", open=False):
246
  with gr.Row():
247
  src_prompt = gr.Textbox(label="Source Prompt", lines=2, interactive=True, info= "Optional: Describe the original audio input",