ylacombe (HF staff) committed
Commit 6acebe0
1 Parent(s): 2d82fd0

Update app.py

Files changed (1)
  1. app.py +62 -113
app.py CHANGED
@@ -1,6 +1,3 @@
-
-
-
 import torch

 import gradio as gr
@@ -17,9 +14,6 @@ import os
 import time
 import demucs.api

-import tqdm
-
-os.environ["GRADIO_TEMP_DIR"] = "/home/yoach/spaces/tmp"


 MODEL_NAME = "openai/whisper-large-v3"
@@ -45,32 +39,30 @@ def separate_vocal(path):
     return path


-def transcribe(inputs_path, task, use_demucs, dataset_name, oauth_token: gr.OAuthToken | None, progress=gr.Progress(track_tqdm=True)):
+
+# def separate_vocal(path, track_name, output_folder, demucs_model_name = "htdemucs_ft"):
+#
+#     os.system(f"python3 -m demucs.separate --two-stems=vocals -n {demucs_model_name} {path} -o {output_folder}")
+#
+#     return os.path.join(output_folder, demucs_model_name, track_name, "vocals.wav")
+
+
+def transcribe(inputs_path, task, use_demucs, dataset_name, oauth_token: gr.OAuthToken | None):
     if inputs_path is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

-    pbar = tqdm.tqdm(total=4, desc="Overall progression")
-
     sampling_rate, inputs = wavfile.read(inputs_path)

-    pbar.update(1)
-    pbar.set_description("Transcribe using Whisper.")
     out = pipe(inputs_path, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)

     text = out["text"]

-    pbar.update(1)
-    pbar.set_description("Merge chunks.")
     chunks = naive_postprocess_whisper_chunks(out["chunks"], inputs, sampling_rate)

-    pbar.update(1)
-    pbar.set_description("Create dataset.")
-
-
     transcripts = []
     audios = []
     with tempfile.TemporaryDirectory() as tmpdirname:
-        for i,chunk in tqdm.tqdm(enumerate(chunks), desc="Creating dataset (and clean audio if asked for)"):
+        for i,chunk in enumerate(chunks):

             # TODO: make sure 1D or 2D?
             arr = chunk["audio"]
@@ -87,12 +79,10 @@ def transcribe(inputs_path, task, use_demucs, dataset_name, oauth_token: gr.OAut

         dataset = Dataset.from_dict({"audio": audios, "transcript": transcripts}).cast_column("audio", Audio())

-        pbar.update(1)
-        pbar.set_description("Push dataset.")
-        dataset.push_to_hub(dataset_name, token=oauth_token.token if oauth_token else oauth_token)
+
+        dataset.push_to_hub(dataset_name, token=oauth_token.token)

-    pbar.close()
-    return text, [[transcript] for transcript in transcripts]
+    return text


 def _return_yt_html_embed(yt_url):
@@ -135,18 +125,11 @@ def download_yt_audio(yt_url, filename):
         raise gr.Error(str(err))


-def yt_transcribe(yt_url, task, use_demucs, dataset_name, oauth_token: gr.OAuthToken | None, max_filesize=75.0, dataset_sampling_rate = 24000,
-                  progress=gr.Progress(track_tqdm=True)):
-
-    pbar = tqdm.tqdm(total=5, desc="Overall progression")
-
+def yt_transcribe(yt_url, task, use_demucs, dataset_name, oauth_token: gr.OAuthToken | None, max_filesize=75.0, dataset_sampling_rate = 24000):
     html_embed_str = _return_yt_html_embed(yt_url)

     with tempfile.TemporaryDirectory() as tmpdirname:
         filepath = os.path.join(tmpdirname, "video.mp4")
-        pbar.update(1)
-        pbar.set_description("Download Youtube video.")
-
         download_yt_audio(yt_url, filepath)
         with open(filepath, "rb") as f:
             inputs_path = f.read()
@@ -154,25 +137,18 @@ def yt_transcribe(yt_url, task, use_demucs, dataset_name, oauth_token: gr.OAuthT
     inputs = ffmpeg_read(inputs_path, pipe.feature_extractor.sampling_rate)
     inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}

-    pbar.update(1)
-    pbar.set_description("Transcribe using Whisper.")
     out = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)

     text = out["text"]

     inputs = ffmpeg_read(inputs_path, dataset_sampling_rate)

-    pbar.update(1)
-    pbar.set_description("Merge chunks.")
     chunks = naive_postprocess_whisper_chunks(out["chunks"], inputs, dataset_sampling_rate)

-    pbar.update(1)
-    pbar.set_description("Create dataset.")
-
     transcripts = []
     audios = []
     with tempfile.TemporaryDirectory() as tmpdirname:
-        for i,chunk in tqdm.tqdm(enumerate(chunks), desc="Creating dataset (and clean audio if asked for)."):
+        for i,chunk in enumerate(chunks):

             # TODO: make sure 1D or 2D?
             arr = chunk["audio"]
@@ -189,28 +165,23 @@ def yt_transcribe(yt_url, task, use_demucs, dataset_name, oauth_token: gr.OAuthT

         dataset = Dataset.from_dict({"audio": audios, "transcript": transcripts}).cast_column("audio", Audio())

-        pbar.update(1)
-        pbar.set_description("Push dataset.")
-        dataset.push_to_hub(dataset_name, token=oauth_token.token if oauth_token else oauth_token)
+
+        dataset.push_to_hub(dataset_name, token=oauth_token.token)

-    pbar.close()

-    return html_embed_str, text, [[transcript] for transcript in transcripts]
+    return html_embed_str, text


 def naive_postprocess_whisper_chunks(chunks, audio_array, sampling_rate, stop_chars = ".!:;?", min_duration = 5):
     # merge chunks as long as merged audio duration is lower than min_duration and that a stop character is not met
     # return list of dictionnaries (text, audio)
     # min duration is in seconds
-    pbar = tqdm.tqdm(total=len(chunks), desc="Post-processing transcribed chunks")
+
     min_duration = int(min_duration * sampling_rate)
-

     new_chunks = []
     while chunks:
         current_chunk = chunks.pop(0)
-        pbar.update(1)
-
         begin, end = current_chunk["timestamp"]
         begin, end = int(begin*sampling_rate), int(end*sampling_rate)

@@ -222,7 +193,7 @@ def naive_postprocess_whisper_chunks(chunks, audio_array, sampling_rate, stop_c
         chunk_to_concat = [audio_array[begin:end]]
         while chunks and (text[-1] not in stop_chars or (current_dur<min_duration)):
             ch = chunks.pop(0)
-            pbar.update(1)
+
             begin, end = ch["timestamp"]
             begin, end = int(begin*sampling_rate), int(end*sampling_rate)
             current_dur += end-begin
@@ -238,75 +209,53 @@ def naive_postprocess_whisper_chunks(chunks, audio_array, sampling_rate, stop_c
             "audio": np.concatenate(chunk_to_concat),
             })
         print(f"LENGTH CHUNK #{len(new_chunks)}: {current_dur/sampling_rate}s")
-
-    pbar.close()

     return new_chunks


+
+mf_transcribe = gr.Interface(
+    fn=transcribe,
+    inputs=[
+        gr.Audio(type="filepath"),
+        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
+        gr.Radio(["no-post-processing", "separate-audio"], label="Audio separation and cleaning (takes longer - use it if your samples are not cleaned (background noise and music))", value="separate-audio"),
+        gr.Textbox(lines=1, placeholder="Place your new dataset name here", label="Dataset name"),
+    ],
+    outputs="text",
+    theme="huggingface",
+    title="Create your own TTS dataset using your own recordings",
+    description=(
+        "This demo allows use to create a text-to-speech dataset from an input audio snippet and push it to hub to keep track of it."
+        f"Demo uses the checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to automatically transcribe audio files"
+        " of arbitrary length. It then merge chunks of audio and push it to the hub."
+    ),
+    allow_flagging="never",
+)
+
+yt_transcribe = gr.Interface(
+    fn=yt_transcribe,
+    inputs=[
+        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
+        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
+        gr.Radio(["no-post-processing", "separate-audio"], label="Audio separation and cleaning (takes longer - use it if your samples are not cleaned (background noise and music))", value="separate-audio"),
+        gr.Textbox(lines=1, placeholder="Place your new dataset name here", label="Dataset name"),
+    ],
+    outputs=["html", "text"],
+    theme="huggingface",
+    title="Create your own TTS dataset using Youtube",
+    description=(
+        "This demo allows use to create a text-to-speech dataset from an input audio snippet and push it to hub to keep track of it."
+        f"Demo uses the checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to automatically transcribe audio files"
+        " of arbitrary length. It then merge chunks of audio and push it to the hub."
+    ),
+    allow_flagging="never",
+)

-css = """
-#container{
-    margin: 0 auto;
-    max-width: 80rem;
-}
-#intro{
-    max-width: 100%;
-    text-align: center;
-    margin: 0 auto;
-}
-"""
-with gr.Blocks(css=css) as demo:
+with gr.Blocks() as demo:
     with gr.Row():
         gr.LoginButton().activate()
         gr.LogoutButton()
-
-    with gr.Tab("Microphone or Audio file"):
-        gr.Markdown("Create your own TTS dataset using your own recordings", elem_id="intro")
-        gr.Markdown(
-            "This demo allows use to create a text-to-speech dataset from an input audio snippet and push it to hub to keep track of it."
-            f"Demo uses the checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to automatically transcribe audio files"
-            " of arbitrary length. It then merge chunks of audio and push it to the hub."
-        )
-        with gr.Column():
-            audio_file = gr.Audio(type="filepath")
-            task_file = gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
-            cleaning_file = gr.Radio(["no-post-processing", "separate-audio"], label="Audio separation and cleaning (takes longer - use it if your samples are not cleaned (background noise and music))", value="separate-audio")
-            textbox_file = gr.Textbox(lines=1, placeholder="Place your new dataset name here", label="Dataset name")
-
-            with gr.Row():
-                clear_file = gr.ClearButton([audio_file, task_file, cleaning_file, textbox_file])
-                submit_file = gr.Button("Submit")
-
-        with gr.Column():
-            transcript_file = gr.Textbox(label="Transcription")
-            dataset_file = gr.Dataset(components=["text"], headers=["Transcripts"])
-
-
-    with gr.Tab("YouTube"):
-        gr.Markdown("Create your own TTS dataset using Youtube", elem_id="intro")
-        gr.Markdown(
-            "This demo allows use to create a text-to-speech dataset from an input audio snippet and push it to hub to keep track of it."
-            f"Demo uses the checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to automatically transcribe audio files"
-            " of arbitrary length. It then merge chunks of audio and push it to the hub."
-        )
-        with gr.Column():
-            audio_youtube = gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")
-            task_youtube = gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
-            cleaning_youtube = gr.Radio(["no-post-processing", "separate-audio"], label="Audio separation and cleaning (takes longer - use it if your samples are not cleaned (background noise and music))", value="separate-audio")
-            textbox_youtube = gr.Textbox(lines=1, placeholder="Place your new dataset name here", label="Dataset name")
-
-            with gr.Row():
-                clear_youtube = gr.ClearButton([audio_youtube, task_youtube, cleaning_youtube, textbox_youtube])
-                submit_youtube = gr.Button("Submit")
-
-        with gr.Column():
-            html_youtube = gr.HTML()
-            transcript_youtube = gr.Textbox(label="Transcription")
-            dataset_youtube = gr.Dataset(components=["text"], headers=["Transcripts"])
-
-
-    submit_file.click(transcribe, inputs=[audio_file, task_file, cleaning_file, textbox_file], outputs=[transcript_file, dataset_file])
-    submit_youtube.click(yt_transcribe, inputs=[audio_youtube, task_youtube, cleaning_youtube, textbox_youtube], outputs=[html_youtube, transcript_youtube, dataset_youtube])
-
-demo.launch(debug=True)
+    gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Microphone or Audio file", "YouTube"])
+
+demo.launch(debug=True)
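Once a run completes, the dataset pushed by transcribe/yt_transcribe can be loaded back with 🤗 Datasets. A minimal sketch, assuming the run pushed to a repository named "username/my-tts-dataset" (a placeholder for whatever was entered in the "Dataset name" textbox):

from datasets import load_dataset

# "username/my-tts-dataset" is a placeholder; use the name entered in the app.
ds = load_dataset("username/my-tts-dataset", split="train")

sample = ds[0]
print(sample["transcript"])              # merged Whisper transcription for this chunk
print(sample["audio"]["sampling_rate"])  # audio decoded via the Audio() feature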