|
import gradio as gr |
|
import torch |
|
import numpy as np |
|
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification |
|
from safetensors.torch import load_file |
|
|
|
|
|
# Directory that holds the fine-tuned checkpoint: the processor config and
# the safetensors weights are both read from here.
model_name = "results"

# Feature extractor / tokenizer pair that turns raw waveforms into model inputs.
processor = Wav2Vec2Processor.from_pretrained(model_name)

# The weights are read explicitly from the safetensors file and handed to
# from_pretrained, so the model is built from exactly this state dict.
state_dict = load_file(f"{model_name}/model.safetensors")
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    model_name,
    state_dict=state_dict,
)
|
|
|
def classify_accent(audio):
    """Classify a short audio clip as "Español" or "Otro".

    Args:
        audio: The clip to classify. Accepted forms are the
            ``(sample_rate, samples)`` tuple that Gradio documents for
            ``gr.Audio(type="numpy")``, or a mapping with ``"array"`` and
            ``"sampling_rate"`` keys. ``None`` means nothing was recorded.

    Returns:
        The predicted label (``"Español"`` or ``"Otro"``), or a Spanish error
        message string when the input is missing, empty, or cannot be
        processed. Exceptions are never propagated to the caller.
    """
    if audio is None:
        return "Error: No se recibió audio"

    # Debug traces kept from the original implementation.
    print(f"Tipo de entrada de audio: {type(audio)}")
    print(f"Entrada de audio recibida: {audio}")

    try:
        # Support both input layouts: the dict form the function was
        # originally written for and the tuple form the numpy Audio
        # component actually sends.
        if isinstance(audio, dict):
            audio_array = audio["array"]
            sample_rate = audio["sampling_rate"]
        else:
            sample_rate, audio_array = audio

        audio_array = np.asarray(audio_array)
        print(f"Forma del audio: {audio_array.shape}, Frecuencia de muestreo: {sample_rate}")

        if audio_array.size == 0:
            return "Error: El audio está vacío"

        # Integer PCM (e.g. int16 from the microphone) is scaled into
        # [-1.0, 1.0]; floating-point input is only cast to float32.
        if np.issubdtype(audio_array.dtype, np.integer):
            limits = np.iinfo(audio_array.dtype)
            full_scale = float(max(abs(int(limits.min)), int(limits.max)))
            audio_array = audio_array.astype(np.float32) / full_scale
        else:
            audio_array = audio_array.astype(np.float32)

        # Downmix multi-channel audio to mono before resampling.
        # NOTE(review): assumes a (samples, channels) layout for 2-D input,
        # as Gradio documents for numpy audio -- confirm for other callers.
        if audio_array.ndim > 1:
            audio_array = audio_array.mean(axis=-1)

        # The processor is called with a fixed 16 kHz rate, so anything else
        # is resampled first. librosa is imported lazily because it is only
        # needed on this path.
        if sample_rate != 16000:
            import librosa

            audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)

        input_values = processor(audio_array, return_tensors="pt", sampling_rate=16000).input_values

        # Inference only: no gradients needed.
        with torch.no_grad():
            logits = model(input_values).logits

        predicted_ids = torch.argmax(logits, dim=-1).item()

        # Index order must match the class ids the model was trained with.
        labels = ["Español", "Otro"]
        return labels[predicted_ids]

    except Exception as e:
        # UI boundary: report the failure as text instead of crashing the app.
        return f"Error al procesar el audio: {str(e)}"
|
|
|
|
|
# HTML blurb rendered below the title: usage hint plus author links.
description_html = """
<p>Prueba con grabación o cargando un archivo de audio. Para probar, recomiendo una palabra.</p>
<p>Ramon Mayor Martins: <a href="https://rmayormartins.github.io/" target="_blank">Website</a> | <a href="https://huggingface.co/rmayormartins" target="_blank">Spaces</a></p>
"""

# Wire the classifier to a microphone input and a label output.
# The first three arguments are fn, inputs and outputs, passed positionally.
interface = gr.Interface(
    classify_accent,
    gr.Audio(type="numpy", source="microphone"),
    "label",
    title="Clasificador de Sotaques (Español vs Otro)",
    description=description_html,
)

# Start the web app as soon as the script runs.
interface.launch()
|
|