ylacombe committed
Commit 3551573
1 parent: 000caf9

Update app.py

Files changed (1):
  1. app.py +73 -25
app.py CHANGED
@@ -12,9 +12,13 @@ from transformers.pipelines.audio_utils import ffmpeg_read
 import tempfile
 import os
 import time
+import demucs.api
+
+os.environ["GRADIO_TEMP_DIR"] = "/home/yoach/spaces/tmp"
 
 
 MODEL_NAME = "openai/whisper-large-v3"
+DEMUCS_MODEL_NAME = "htdemucs_ft"
 BATCH_SIZE = 8
 FILE_LIMIT_MB = 1000
 YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files
@@ -28,8 +32,23 @@ pipe = pipeline(
     device=device,
 )
 
+separator = demucs.api.Separator(model=DEMUCS_MODEL_NAME)
+
+def separate_vocal(path):
+    origin, separated = separator.separate_audio_file(path)
+    demucs.api.save_audio(separated["vocals"], path, samplerate=separator.samplerate)
+    return path
+
+
+# CLI fallback kept for reference:
+# def separate_vocal(path, track_name, output_folder, demucs_model_name="htdemucs_ft"):
+#
+#     os.system(f"python3 -m demucs.separate --two-stems=vocals -n {demucs_model_name} {path} -o {output_folder}")
+#
+#     return os.path.join(output_folder, demucs_model_name, track_name, "vocals.wav")
+
 
-def transcribe(inputs_path, task, dataset_name, oauth_token: gr.OAuthToken):
+def transcribe(inputs_path, task, use_demucs, dataset_name, oauth_token: gr.OAuthToken):
     if inputs_path is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
 
@@ -39,18 +58,23 @@ def transcribe(inputs_path, task, dataset_name, oauth_token: gr.OAuthToken):
 
     text = out["text"]
 
-    chunks = naive_postprocess_whisper_chunks(out["chunks"])
-
+    chunks = naive_postprocess_whisper_chunks(out["chunks"], inputs, sampling_rate)
+
     transcripts = []
     audios = []
     with tempfile.TemporaryDirectory() as tmpdirname:
         for i, chunk in enumerate(chunks):
-            begin, end = chunk["timestamp"]
-            begin, end = int(begin*sampling_rate), int(end*sampling_rate)
+
             # TODO: make sure 1D or 2D?
-            arr = inputs[begin:end]
+            arr = chunk["audio"]
             path = os.path.join(tmpdirname, f"{i}.wav")
             wavfile.write(path, sampling_rate, arr)
+
+            if use_demucs == "separate-audio":
+                # use demucs to separate vocals
+                print(f"Separating vocals #{i}")
+                path = separate_vocal(path)
+
             audios.append(path)
             transcripts.append(chunk["text"])
 
@@ -102,7 +126,7 @@ def download_yt_audio(yt_url, filename):
         raise gr.Error(str(err))
 
 
-def yt_transcribe(yt_url, task, dataset_name, oauth_token: gr.OAuthToken, max_filesize=75.0, dataset_sampling_rate = 24000):
+def yt_transcribe(yt_url, task, use_demucs, dataset_name, oauth_token: gr.OAuthToken, max_filesize=75.0, dataset_sampling_rate = 24000):
     html_embed_str = _return_yt_html_embed(yt_url)
 
     with tempfile.TemporaryDirectory() as tmpdirname:
@@ -117,21 +141,26 @@ def yt_transcribe(yt_url, task, dataset_name, oauth_token: gr.OAuthToken, max_filesize=75.0, dataset_sampling_rate = 24000):
         out = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
 
     text = out["text"]
-
-    chunks = naive_postprocess_whisper_chunks(out["chunks"])
-
+
     inputs = ffmpeg_read(inputs_path, dataset_sampling_rate)
 
+    chunks = naive_postprocess_whisper_chunks(out["chunks"], inputs, dataset_sampling_rate)
+
     transcripts = []
     audios = []
     with tempfile.TemporaryDirectory() as tmpdirname:
         for i, chunk in enumerate(chunks):
-            begin, end = chunk["timestamp"]
-            begin, end = int(begin*dataset_sampling_rate), int(end*dataset_sampling_rate)
+
             # TODO: make sure 1D or 2D?
-            arr = inputs[begin:end]
+            arr = chunk["audio"]
             path = os.path.join(tmpdirname, f"{i}.wav")
             wavfile.write(path, dataset_sampling_rate, arr)
+
+            if use_demucs == "separate-audio":
+                # use demucs to separate vocals
+                print(f"Separating vocals #{i}")
+                path = separate_vocal(path)
+
             audios.append(path)
             transcripts.append(chunk["text"])
 
@@ -144,39 +173,57 @@ def yt_transcribe(yt_url, task, dataset_name, oauth_token: gr.OAuthToken, max_filesize=75.0, dataset_sampling_rate = 24000):
     return html_embed_str, text
 
 
-def naive_postprocess_whisper_chunks(chunks, stop_chars=".!:;?", min_duration=5):
-    new_chunks = []
+def naive_postprocess_whisper_chunks(chunks, audio_array, sampling_rate, stop_chars=".!:;?", min_duration=5):
+    # merge chunks as long as the merged audio duration is below min_duration and no stop character has been met
+    # returns a list of dictionaries (text, audio)
+    # min_duration is in seconds
 
+    min_duration = int(min_duration * sampling_rate)
+
+    new_chunks = []
     while chunks:
         current_chunk = chunks.pop(0)
         begin, end = current_chunk["timestamp"]
+        begin, end = int(begin*sampling_rate), int(end*sampling_rate)
+
+        current_dur = end-begin
+
         text = current_chunk["text"]
 
-        while chunks and (text[-1] not in stop_chars or (end-begin<min_duration)):
+        print("new audio", begin/sampling_rate, end/sampling_rate)
+
+        chunk_to_concat = [audio_array[begin:end]]
+        while chunks and (text[-1] not in stop_chars or (current_dur<min_duration)):
             ch = chunks.pop(0)
-            end = ch["timestamp"][1]
+
+            begin, end = ch["timestamp"]
+            begin, end = int(begin*sampling_rate), int(end*sampling_rate)
+            current_dur += end-begin
+
             text = "".join([text, ch["text"]])
 
+            # TODO: add silence?
+            chunk_to_concat.append(audio_array[begin:end])
+
+            print("adding audio chunk", begin/sampling_rate, end/sampling_rate, len(audio_array[begin:end])/sampling_rate)
+            print(ch["timestamp"])
+
         new_chunks.append({
             "text": text.strip(),
-            "timestamp": (begin, end),
+            "audio": np.concatenate(chunk_to_concat),
         })
-        print(f"LENGTH CHUNK #{len(new_chunks)}: {end-begin}s")
+        print(f"LENGTH CHUNK #{len(new_chunks)}: {current_dur/sampling_rate}s")
 
     return new_chunks
 
 
 
-
-
-
-demo = gr.Blocks()
-
 mf_transcribe = gr.Interface(
     fn=transcribe,
     inputs=[
         gr.Audio(type="filepath"),
         gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
+        gr.Radio(["no-post-processing", "separate-audio"], label="Audio separation and cleaning (takes longer - use it if your samples are not cleaned (background noise and music))", value="separate-audio"),
         gr.Textbox(lines=1, placeholder="Place your new dataset name here", label="Dataset name"),
     ],
     outputs="text",
@@ -195,6 +242,7 @@ yt_transcribe = gr.Interface(
     inputs=[
         gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
         gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
+        gr.Radio(["no-post-processing", "separate-audio"], label="Audio separation and cleaning (takes longer - use it if your samples are not cleaned (background noise and music))", value="separate-audio"),
         gr.Textbox(lines=1, placeholder="Place your new dataset name here", label="Dataset name"),
     ],
     outputs=["html", "text"],
@@ -208,7 +256,7 @@ yt_transcribe = gr.Interface(
     allow_flagging="never",
 )
 
-with demo:
+with gr.Blocks() as demo:
     with gr.Row():
         gr.LoginButton()
         gr.LogoutButton()
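
A note for anyone exercising the new vocal-separation path outside this Space: the sketch below reuses the demucs.api calls the commit introduces (Separator, separate_audio_file, save_audio), with explanatory comments added; the chunk file name is hypothetical. As committed, the helper overwrites its input file with the extracted vocal stem.

import demucs.api

separator = demucs.api.Separator(model="htdemucs_ft")

def separate_vocal(path):
    # separate_audio_file returns the original waveform and a dict mapping
    # stem names ("vocals", "drums", ...) to separated waveforms
    origin, separated = separator.separate_audio_file(path)
    # keep only the vocal stem, writing it back over the input file
    demucs.api.save_audio(separated["vocals"], path, samplerate=separator.samplerate)
    return path

separate_vocal("0.wav")  # hypothetical chunk file, as written by transcribe()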
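To see what the reworked naive_postprocess_whisper_chunks does, here is a toy trace on made-up inputs; the function body mirrors the committed one with the debug prints removed. Chunks keep merging until the accumulated text ends in a stop character and the accumulated audio reaches min_duration seconds.

import numpy as np

def naive_postprocess_whisper_chunks(chunks, audio_array, sampling_rate, stop_chars=".!:;?", min_duration=5):
    min_duration = int(min_duration * sampling_rate)
    new_chunks = []
    while chunks:
        current_chunk = chunks.pop(0)
        begin, end = current_chunk["timestamp"]
        begin, end = int(begin * sampling_rate), int(end * sampling_rate)
        current_dur = end - begin
        text = current_chunk["text"]
        chunk_to_concat = [audio_array[begin:end]]
        while chunks and (text[-1] not in stop_chars or current_dur < min_duration):
            ch = chunks.pop(0)
            begin, end = ch["timestamp"]
            begin, end = int(begin * sampling_rate), int(end * sampling_rate)
            current_dur += end - begin
            text = "".join([text, ch["text"]])
            chunk_to_concat.append(audio_array[begin:end])
        new_chunks.append({"text": text.strip(), "audio": np.concatenate(chunk_to_concat)})
    return new_chunks

sampling_rate = 16000
audio = np.zeros(10 * sampling_rate, dtype=np.float32)  # 10 s of silence as a stand-in
chunks = [
    {"timestamp": (0.0, 2.0), "text": " Hello"},   # no stop char -> keep merging
    {"timestamp": (2.0, 4.0), "text": " world."},  # stop char, but only 4 s < 5 s
    {"timestamp": (4.0, 6.0), "text": " Bye."},    # stop char and >= 5 s -> close chunk
]
merged = naive_postprocess_whisper_chunks(chunks, audio, sampling_rate)
print(merged[0]["text"], len(merged[0]["audio"]) / sampling_rate)
# -> "Hello world. Bye." 6.0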
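Finally, the new Radio component simply becomes the third positional argument of transcribe and yt_transcribe. A stripped-down, standalone sketch (labels shortened, oauth_token and the dataset push omitted) of how the choice reaches the function:

import gradio as gr

def transcribe(inputs_path, task, use_demucs, dataset_name):
    # gr.Interface passes the input components' values positionally,
    # so the Radio value lands in use_demucs
    return f"task={task}, use_demucs={use_demucs}, dataset={dataset_name}"

demo = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
        gr.Radio(["no-post-processing", "separate-audio"], label="Audio separation and cleaning", value="separate-audio"),
        gr.Textbox(lines=1, label="Dataset name"),
    ],
    outputs="text",
)

if __name__ == "__main__":
    demo.launch()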