Merlintxu committed on
Commit
d8b06bf
1 Parent(s): 55c7d23

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -67
app.py CHANGED
@@ -10,6 +10,11 @@ from transformers import logging
10
  import math
11
  import json
12
  import tempfile
 
 
 
 
 
13
 
14
  # Suprimir advertencias
15
  warnings.filterwarnings("ignore")
@@ -31,21 +36,34 @@ MODELS = {
31
  "facebook/wav2vec2-large-xlsr-53-portuguese",
32
  "openai/whisper-medium",
33
  "jonatasgrosman/wav2vec2-xlsr-53-portuguese"
 
 
 
34
  ]
35
  }
36
 
 
 
 
 
 
 
 
 
37
  # Función para verificar si ffmpeg está instalado
38
  def verify_ffmpeg_installation():
39
  try:
40
  subprocess.run(["ffmpeg", "-version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
41
  except subprocess.CalledProcessError as e:
42
- print("ffmpeg no está instalado o no se puede ejecutar correctamente.")
43
  raise e
44
 
45
  def convert_audio_to_wav(audio_path):
46
  if os.path.isdir(audio_path):
47
  raise ValueError(f"La ruta proporcionada es un directorio, no un archivo: {audio_path}")
48
- wav_path = "converted_audio.wav"
 
 
49
 
50
  # Añadir la opción '-y' para sobrescribir el archivo existente sin preguntar
51
  command = ["ffmpeg", "-y", "-i", audio_path, "-ac", "1", "-ar", "16000", wav_path]
@@ -53,8 +71,8 @@ def convert_audio_to_wav(audio_path):
53
  process = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
54
 
55
  # Imprimir resultados para depuración
56
- print(process.stdout.decode()) # Ver salida estándar
57
- print(process.stderr.decode()) # Ver errores
58
 
59
  if process.returncode != 0:
60
  raise ValueError(f"Error al convertir el archivo de audio a wav: {process.stderr.decode()}")
@@ -68,7 +86,7 @@ def detect_language(audio_path):
68
  raise ValueError(f"Error al cargar el archivo de audio con librosa: {e}")
69
 
70
  processor = WhisperProcessor.from_pretrained("openai/whisper-base")
71
- model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
72
 
73
  input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features
74
  predicted_ids = model.generate(input_features)
@@ -90,45 +108,26 @@ def transcribe_audio_stream(audio, model_name):
90
  duration = len(speech) / rate
91
 
92
  transcriptions = []
93
-
94
- if "whisper" in model_name:
95
- processor = WhisperProcessor.from_pretrained(model_name)
96
- model = WhisperForConditionalGeneration.from_pretrained(model_name)
97
-
98
- chunk_duration = 30 # segundos
99
-
100
- for i in range(0, int(duration), chunk_duration):
101
- end = min(i + chunk_duration, duration)
102
- chunk = speech[int(i * rate):int(end * rate)]
103
-
104
- input_features = processor(chunk, sampling_rate=16000, return_tensors="pt").input_features
105
- predicted_ids = model.generate(input_features)
106
- transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
107
-
108
- progress = min(100, (end / duration) * 100)
109
- transcriptions.append({
110
- "start_time": i,
111
- "end_time": end,
112
- "text": transcription
113
- })
114
- yield transcriptions, progress
115
- else:
116
- transcriber = pipeline("automatic-speech-recognition", model=model_name)
117
 
118
- chunk_duration = 10 # segundos
 
 
119
 
120
- for i in range(0, int(duration), chunk_duration):
121
- end = min(i + chunk_duration, duration)
122
- chunk = speech[int(i * rate):int(end * rate)]
123
- result = transcriber(chunk)
124
-
125
- progress = min(100, (end / duration) * 100)
126
- transcriptions.append({
127
- "start_time": i,
128
- "end_time": end,
129
- "text": result["text"]
130
- })
131
- yield transcriptions, progress
132
 
133
  def detect_and_select_model(audio):
134
  wav_audio = convert_audio_to_wav(audio)
@@ -146,62 +145,66 @@ def save_transcription(transcriptions, file_format):
146
  for entry in transcriptions:
147
  tmp.write(f"{entry['start_time']:.2f},{entry['end_time']:.2f},{entry['text']}\n".encode())
148
  file_path = tmp.name
149
- print(f"Archivo de transcripción guardado en: {file_path}")
150
  return file_path
151
 
152
- def combined_interface(audio, file_format):
153
  try:
154
- print(f"Ruta del archivo de audio subido: {audio}")
155
- verify_ffmpeg_installation() # Verifica si ffmpeg está instalado
156
 
157
  language, model_options = detect_and_select_model(audio)
158
- selected_model = model_options[0]
159
- print(f"Idioma detectado: {language}")
160
- print(f"Modelos disponibles: {model_options}")
 
 
 
 
 
 
 
 
 
161
 
162
  # Primer yield: Añadir None para la séptima salida (Archivo de Descarga)
163
- yield language, model_options, selected_model, "", 0, "Initializing...", None
164
 
165
  transcriptions = []
166
- for partial_transcriptions, progress in transcribe_audio_stream(audio, selected_model):
167
  transcriptions = partial_transcriptions
168
  full_transcription = " ".join([t["text"] for t in transcriptions])
169
  progress_int = math.floor(progress)
170
  status = f"Transcribing... {progress_int}% complete"
171
- print(f"Progreso: {progress_int}%")
172
- # Yield con None para el archivo de descarga hasta que esté completo
173
- yield language, model_options, selected_model, full_transcription.strip(), progress_int, status, None
174
 
175
- print("Guardando transcripción.")
176
- # Guardar transcripción
177
  file_path = save_transcription(transcriptions, file_format)
178
- print(f"Transcripción guardada en: {file_path}")
179
 
180
- # Verificar que file_path no es un directorio
181
  if os.path.isdir(file_path):
182
  raise ValueError(f"El archivo de transcripción debería ser un archivo, pero es un directorio: {file_path}")
183
 
184
- # Verificar que el archivo existe
185
  if not os.path.isfile(file_path):
186
  raise ValueError(f"El archivo de transcripción no existe: {file_path}")
187
 
188
- # Limpiar archivos temporales
189
  os.remove("converted_audio.wav")
190
- print("Archivos temporales limpiados.")
191
 
192
- # Yield final con el archivo de descarga
193
- yield language, model_options, selected_model, full_transcription.strip(), 100, "Transcription complete! Download the file below.", file_path
194
 
195
  except Exception as e:
196
- print(f"Error: {e}")
197
- # Asegurarse de que el yield de error también devuelva 7 valores
198
  yield str(e), [], "", "An error occurred during processing.", 0, "Error", ""
199
 
200
  iface = gr.Interface(
201
  fn=combined_interface,
202
  inputs=[
203
  gr.Audio(type="filepath", label="Upload Audio File"),
204
- gr.Radio(choices=["JSON", "TXT"], label="Choose output format")
 
 
 
205
  ],
206
  outputs=[
207
  gr.Textbox(label="Detected Language"),
@@ -213,7 +216,7 @@ iface = gr.Interface(
213
  gr.File(label="Download Transcription")
214
  ],
215
  title="Multilingual Audio Transcriber with Real-time Display and Progress Indicator",
216
- description="Upload an audio file to detect the language, select the transcription model, and get the transcription in real-time. Optimized for Spanish, English, and Portuguese.",
217
  live=True
218
  )
219
 
 
10
  import math
11
  import json
12
  import tempfile
13
+ import logging
14
+ import concurrent.futures
15
+
16
+ # Configurar logging
17
+ logging.basicConfig(level=logging.INFO)
18
 
19
  # Suprimir advertencias
20
  warnings.filterwarnings("ignore")
 
36
  "facebook/wav2vec2-large-xlsr-53-portuguese",
37
  "openai/whisper-medium",
38
  "jonatasgrosman/wav2vec2-xlsr-53-portuguese"
39
+ ],
40
+ "fr": [
41
+ "jonatasgrosman/wav2vec2-large-xlsr-53-french"
42
  ]
43
  }
44
 
45
+ # Cache de modelos para evitar múltiples cargas
46
+ model_cache = {}
47
+
48
+ def get_model(model_name):
49
+ if model_name not in model_cache:
50
+ model_cache[model_name] = WhisperForConditionalGeneration.from_pretrained(model_name)
51
+ return model_cache[model_name]
52
+
53
  # Función para verificar si ffmpeg está instalado
54
  def verify_ffmpeg_installation():
55
  try:
56
  subprocess.run(["ffmpeg", "-version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
57
  except subprocess.CalledProcessError as e:
58
+ logging.error("ffmpeg no está instalado o no se puede ejecutar correctamente.")
59
  raise e
60
 
61
  def convert_audio_to_wav(audio_path):
62
  if os.path.isdir(audio_path):
63
  raise ValueError(f"La ruta proporcionada es un directorio, no un archivo: {audio_path}")
64
+
65
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
66
+ wav_path = tmp.name
67
 
68
  # Añadir la opción '-y' para sobrescribir el archivo existente sin preguntar
69
  command = ["ffmpeg", "-y", "-i", audio_path, "-ac", "1", "-ar", "16000", wav_path]
 
71
  process = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
72
 
73
  # Imprimir resultados para depuración
74
+ logging.info(process.stdout.decode())
75
+ logging.error(process.stderr.decode())
76
 
77
  if process.returncode != 0:
78
  raise ValueError(f"Error al convertir el archivo de audio a wav: {process.stderr.decode()}")
 
86
  raise ValueError(f"Error al cargar el archivo de audio con librosa: {e}")
87
 
88
  processor = WhisperProcessor.from_pretrained("openai/whisper-base")
89
+ model = get_model("openai/whisper-base")
90
 
91
  input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features
92
  predicted_ids = model.generate(input_features)
 
108
  duration = len(speech) / rate
109
 
110
  transcriptions = []
111
+ processor = WhisperProcessor.from_pretrained(model_name)
112
+ model = get_model(model_name)
113
+
114
+ chunk_duration = 30 # segundos
115
+
116
+ for i in range(0, int(duration), chunk_duration):
117
+ end = min(i + chunk_duration, duration)
118
+ chunk = speech[int(i * rate):int(end * rate)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
+ input_features = processor(chunk, sampling_rate=16000, return_tensors="pt").input_features
121
+ predicted_ids = model.generate(input_features)
122
+ transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
123
 
124
+ progress = min(100, (end / duration) * 100)
125
+ transcriptions.append({
126
+ "start_time": i,
127
+ "end_time": end,
128
+ "text": transcription
129
+ })
130
+ yield transcriptions, progress
 
 
 
 
 
131
 
132
  def detect_and_select_model(audio):
133
  wav_audio = convert_audio_to_wav(audio)
 
145
  for entry in transcriptions:
146
  tmp.write(f"{entry['start_time']:.2f},{entry['end_time']:.2f},{entry['text']}\n".encode())
147
  file_path = tmp.name
148
+ logging.info(f"Archivo de transcripción guardado en: {file_path}")
149
  return file_path
150
 
151
+ def combined_interface(audio, file_format, confirmed_language, chosen_model):
152
  try:
153
+ logging.info(f"Ruta del archivo de audio subido: {audio}")
154
+ verify_ffmpeg_installation()
155
 
156
  language, model_options = detect_and_select_model(audio)
157
+
158
+ # Si el usuario ha confirmado el idioma, lo usamos; si no, usamos el idioma detectado automáticamente
159
+ if not confirmed_language:
160
+ confirmed_language = language
161
+
162
+ # Sugerimos un modelo, pero permitimos que el usuario elija uno
163
+ if not chosen_model:
164
+ chosen_model = model_options[0]
165
+
166
+ logging.info(f"Idioma detectado: {confirmed_language}")
167
+ logging.info(f"Modelos disponibles: {model_options}")
168
+ logging.info(f"Modelo seleccionado: {chosen_model}")
169
 
170
  # Primer yield: Añadir None para la séptima salida (Archivo de Descarga)
171
+ yield confirmed_language, model_options, chosen_model, "", 0, "Initializing...", None
172
 
173
  transcriptions = []
174
+ for partial_transcriptions, progress in transcribe_audio_stream(audio, chosen_model):
175
  transcriptions = partial_transcriptions
176
  full_transcription = " ".join([t["text"] for t in transcriptions])
177
  progress_int = math.floor(progress)
178
  status = f"Transcribing... {progress_int}% complete"
179
+ logging.info(f"Progreso: {progress_int}%")
180
+ yield confirmed_language, model_options, chosen_model, full_transcription.strip(), progress_int, status, None
 
181
 
182
+ logging.info("Guardando transcripción.")
 
183
  file_path = save_transcription(transcriptions, file_format)
 
184
 
 
185
  if os.path.isdir(file_path):
186
  raise ValueError(f"El archivo de transcripción debería ser un archivo, pero es un directorio: {file_path}")
187
 
 
188
  if not os.path.isfile(file_path):
189
  raise ValueError(f"El archivo de transcripción no existe: {file_path}")
190
 
 
191
  os.remove("converted_audio.wav")
192
+ logging.info("Archivos temporales limpiados.")
193
 
194
+ yield confirmed_language, model_options, chosen_model, full_transcription.strip(), 100, "Transcription complete! Download the file below.", file_path
 
195
 
196
  except Exception as e:
197
+ logging.error(f"Error: {e}")
 
198
  yield str(e), [], "", "An error occurred during processing.", 0, "Error", ""
199
 
200
  iface = gr.Interface(
201
  fn=combined_interface,
202
  inputs=[
203
  gr.Audio(type="filepath", label="Upload Audio File"),
204
+ gr.Radio(choices=["JSON", "TXT"], label="Choose output format"),
205
+ gr.Dropdown(choices=["", "es", "en", "pt", "fr"], label="Confirm detected language (optional)"),
206
+ gr.Dropdown(choices=["", "openai/whisper-large-v3", "facebook/wav2vec2-large-xlsr-53-spanish",
207
+ "jonatasgrosman/wav2vec2-xls-r-1b-spanish", "microsoft/wav2vec2-base-960h"], label="Choose model (optional)")
208
  ],
209
  outputs=[
210
  gr.Textbox(label="Detected Language"),
 
216
  gr.File(label="Download Transcription")
217
  ],
218
  title="Multilingual Audio Transcriber with Real-time Display and Progress Indicator",
219
+ description="Upload an audio file to detect the language, confirm the detection or choose a model, and get the transcription in real-time. Optimized for Spanish, English, and Portuguese.",
220
  live=True
221
  )
222