Update app.py
app.py CHANGED
@@ -10,6 +10,11 @@ from transformers import logging
 import math
 import json
 import tempfile
+import logging
+import concurrent.futures
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
 
 # Suppress warnings
 warnings.filterwarnings("ignore")
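Note: the hunk context shows this file already does "from transformers import logging", so the new stdlib "import logging" rebinds that name for everything below it. A minimal sketch of keeping the two apart, assuming the transformers logger is still wanted (the alias is illustrative, not part of this commit):

    import logging
    from transformers import logging as hf_logging

    logging.basicConfig(level=logging.INFO)  # stdlib logger for the app's own messages
    hf_logging.set_verbosity_error()         # transformers' separate verbosity control

The diff also adds "import concurrent.futures", which no visible hunk uses yet.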
@@ -31,21 +36,34 @@ MODELS = {
         "facebook/wav2vec2-large-xlsr-53-portuguese",
         "openai/whisper-medium",
         "jonatasgrosman/wav2vec2-xlsr-53-portuguese"
+    ],
+    "fr": [
+        "jonatasgrosman/wav2vec2-large-xlsr-53-french"
     ]
 }
 
+# Cache models to avoid loading them multiple times
+model_cache = {}
+
+def get_model(model_name):
+    if model_name not in model_cache:
+        model_cache[model_name] = WhisperForConditionalGeneration.from_pretrained(model_name)
+    return model_cache[model_name]
+
 # Function to check whether ffmpeg is installed
 def verify_ffmpeg_installation():
     try:
         subprocess.run(["ffmpeg", "-version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
     except subprocess.CalledProcessError as e:
-
+        logging.error("ffmpeg is not installed or cannot be run correctly.")
         raise e
 
 def convert_audio_to_wav(audio_path):
     if os.path.isdir(audio_path):
         raise ValueError(f"The provided path is a directory, not a file: {audio_path}")
-    wav_path = "converted_audio.wav"
+
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+        wav_path = tmp.name
 
     # Add the '-y' option to overwrite an existing file without asking
     command = ["ffmpeg", "-y", "-i", audio_path, "-ac", "1", "-ar", "16000", wav_path]
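The model_cache/get_model pair added here is plain memoization; functools.lru_cache expresses the same thing in one decorator. A sketch of that equivalent, shown for comparison rather than as the commit's code:

    from functools import lru_cache
    from transformers import WhisperForConditionalGeneration

    @lru_cache(maxsize=None)
    def get_model(model_name: str):
        # First call per name loads the weights; later calls reuse the cached instance.
        return WhisperForConditionalGeneration.from_pretrained(model_name)

Either way, every cached checkpoint stays resident for the life of the process, which adds up quickly with the larger Whisper models.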
@@ -53,8 +71,8 @@ def convert_audio_to_wav(audio_path):
     process = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 
     # Print results for debugging
-    print(process.stdout.decode())
-    print(process.stderr.decode())
+    logging.info(process.stdout.decode())
+    logging.error(process.stderr.decode())
 
     if process.returncode != 0:
         raise ValueError(f"Error converting the audio file to wav: {process.stderr.decode()}")
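As an aside, a presence check that avoids spawning a subprocess is shutil.which; a hypothetical variant of verify_ffmpeg_installation, not what the commit uses:

    import shutil

    def verify_ffmpeg_installation():
        # which() returns the executable path, or None when ffmpeg is not on PATH.
        if shutil.which("ffmpeg") is None:
            raise RuntimeError("ffmpeg is not installed or not on PATH")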
@@ -68,7 +86,7 @@ def detect_language(audio_path):
         raise ValueError(f"Error loading the audio file with librosa: {e}")
 
     processor = WhisperProcessor.from_pretrained("openai/whisper-base")
-    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
+    model = get_model("openai/whisper-base")
 
     input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features
     predicted_ids = model.generate(input_features)
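For context, Whisper emits the detected language as a special token at the start of generation, so the code after this hunk presumably decodes it from predicted_ids. A sketch of that step under that assumption (the regex is a heuristic for the two- or three-letter language tokens):

    import re

    decoded = processor.batch_decode(predicted_ids, skip_special_tokens=False)[0]
    # decoded begins like "<|startoftranscript|><|es|><|transcribe|>..."
    match = re.search(r"<\|([a-z]{2,3})\|>", decoded)
    language = match.group(1) if match else "en"  # fall back to English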
@@ -90,45 +108,26 @@ def transcribe_audio_stream(audio, model_name):
     duration = len(speech) / rate
 
     transcriptions = []
-
-    if "whisper" in model_name:
-        processor = WhisperProcessor.from_pretrained(model_name)
-        model = WhisperForConditionalGeneration.from_pretrained(model_name)
-
-        chunk_duration = 30  # seconds
-
-        for i in range(0, int(duration), chunk_duration):
-            end = min(i + chunk_duration, duration)
-            chunk = speech[int(i * rate):int(end * rate)]
-
-            input_features = processor(chunk, sampling_rate=16000, return_tensors="pt").input_features
-            predicted_ids = model.generate(input_features)
-            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-
-            progress = min(100, (end / duration) * 100)
-            transcriptions.append({
-                "start_time": i,
-                "end_time": end,
-                "text": transcription
-            })
-            yield transcriptions, progress
-    else:
-        transcriber = pipeline("automatic-speech-recognition", model=model_name)
+    processor = WhisperProcessor.from_pretrained(model_name)
+    model = get_model(model_name)
+
+    chunk_duration = 30  # seconds
+
+    for i in range(0, int(duration), chunk_duration):
+        end = min(i + chunk_duration, duration)
+        chunk = speech[int(i * rate):int(end * rate)]
 
-
+        input_features = processor(chunk, sampling_rate=16000, return_tensors="pt").input_features
+        predicted_ids = model.generate(input_features)
+        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
 
-
-
-
-
-
-
-            transcriptions.append({
-                "start_time": i,
-                "end_time": end,
-                "text": result["text"]
-            })
-            yield transcriptions, progress
+        progress = min(100, (end / duration) * 100)
+        transcriptions.append({
+            "start_time": i,
+            "end_time": end,
+            "text": transcription
+        })
+        yield transcriptions, progress
 
 def detect_and_select_model(audio):
     wav_audio = convert_audio_to_wav(audio)
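The 30-second chunk_duration is not arbitrary: Whisper's encoder consumes fixed 30-second windows, so each chunk maps onto a single generate call. A minimal consumer of this generator, assuming a local sample.wav exists:

    for partial, pct in transcribe_audio_stream("sample.wav", "openai/whisper-medium"):
        latest = partial[-1]
        print(f"{pct:5.1f}%  [{latest['start_time']}s-{latest['end_time']:.0f}s] {latest['text']}")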
@@ -146,62 +145,66 @@ def save_transcription(transcriptions, file_format):
         for entry in transcriptions:
             tmp.write(f"{entry['start_time']:.2f},{entry['end_time']:.2f},{entry['text']}\n".encode())
         file_path = tmp.name
-
+    logging.info(f"Transcription file saved to: {file_path}")
     return file_path
 
-def combined_interface(audio, file_format):
+def combined_interface(audio, file_format, confirmed_language, chosen_model):
     try:
-
-        verify_ffmpeg_installation()
+        logging.info(f"Path of the uploaded audio file: {audio}")
+        verify_ffmpeg_installation()
 
         language, model_options = detect_and_select_model(audio)
-        selected_model = model_options[0]
-
-
+
+        # If the user confirmed a language, use it; otherwise keep the auto-detected one
+        if not confirmed_language:
+            confirmed_language = language
+
+        # Suggest a model, but let the user choose their own
+        if not chosen_model:
+            chosen_model = model_options[0]
+
+        logging.info(f"Detected language: {confirmed_language}")
+        logging.info(f"Available models: {model_options}")
+        logging.info(f"Selected model: {chosen_model}")
 
         # First yield: add None for the seventh output (Download File)
-        yield language, model_options, selected_model, "", 0, "Initializing...", None
+        yield confirmed_language, model_options, chosen_model, "", 0, "Initializing...", None
 
         transcriptions = []
-        for partial_transcriptions, progress in transcribe_audio_stream(audio, selected_model):
+        for partial_transcriptions, progress in transcribe_audio_stream(audio, chosen_model):
             transcriptions = partial_transcriptions
             full_transcription = " ".join([t["text"] for t in transcriptions])
             progress_int = math.floor(progress)
             status = f"Transcribing... {progress_int}% complete"
-
-
-            yield language, model_options, selected_model, full_transcription.strip(), progress_int, status, None
+            logging.info(f"Progress: {progress_int}%")
+            yield confirmed_language, model_options, chosen_model, full_transcription.strip(), progress_int, status, None
 
-
-        # Save the transcription
+        logging.info("Saving transcription.")
         file_path = save_transcription(transcriptions, file_format)
-        print(f"Transcription saved to: {file_path}")
 
-        # Check that file_path is not a directory
         if os.path.isdir(file_path):
             raise ValueError(f"The transcription file should be a file, but it is a directory: {file_path}")
 
-        # Check that the file exists
         if not os.path.isfile(file_path):
             raise ValueError(f"The transcription file does not exist: {file_path}")
 
-        # Clean up temporary files
         os.remove("converted_audio.wav")
-
+        logging.info("Temporary files cleaned up.")
 
-
-        yield language, model_options, selected_model, full_transcription.strip(), 100, "Transcription complete! Download the file below.", file_path
+        yield confirmed_language, model_options, chosen_model, full_transcription.strip(), 100, "Transcription complete! Download the file below.", file_path
 
     except Exception as e:
-
-        # Make sure the error yield also returns 7 values
+        logging.error(f"Error: {e}")
         yield str(e), [], "", "An error occurred during processing.", 0, "Error", ""
 
 iface = gr.Interface(
     fn=combined_interface,
     inputs=[
         gr.Audio(type="filepath", label="Upload Audio File"),
-        gr.Radio(choices=["JSON", "TXT"], label="Choose output format")
+        gr.Radio(choices=["JSON", "TXT"], label="Choose output format"),
+        gr.Dropdown(choices=["", "es", "en", "pt", "fr"], label="Confirm detected language (optional)"),
+        gr.Dropdown(choices=["", "openai/whisper-large-v3", "facebook/wav2vec2-large-xlsr-53-spanish",
+                             "jonatasgrosman/wav2vec2-xls-r-1b-spanish", "microsoft/wav2vec2-base-960h"], label="Choose model (optional)")
     ],
     outputs=[
         gr.Textbox(label="Detected Language"),
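One loose end in this hunk: convert_audio_to_wav now writes to a NamedTemporaryFile, but the cleanup still removes the hard-coded "converted_audio.wav", which no longer exists and will raise FileNotFoundError into the except branch. A guarded cleanup sketch (safe_remove is a hypothetical helper, not in the commit):

    import os

    def safe_remove(path):
        # Tolerate an already-missing temp file instead of failing the whole run.
        if path and os.path.isfile(path):
            os.remove(path)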
@@ -213,7 +216,7 @@ iface = gr.Interface(
         gr.File(label="Download Transcription")
     ],
     title="Multilingual Audio Transcriber with Real-time Display and Progress Indicator",
-    description="Upload an audio file to detect the language,
+    description="Upload an audio file to detect the language, confirm the detection or choose a model, and get the transcription in real-time. Optimized for Spanish, English, and Portuguese.",
     live=True
 )
 
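Since combined_interface is a generator, its intermediate yields only stream to the UI when Gradio's queue is enabled; a minimal launch snippet, assuming Gradio's standard API:

    if __name__ == "__main__":
        iface.queue()   # needed so each yield streams to the browser
        iface.launch()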