"""Gradio demo for speaker verification based on TitaNet voice prints.

The app exposes two tabs:

* ``app``: turns an audio file into a unit-length speaker embedding
  (a "voice print") and shows it as a JSON list of floats.
* ``loader``: compares a voice print (pasted as a JSON list) against the
  voice prints stored in an uploaded JSON file and reports the best matches.
"""
import json

import gradio as gr
import torch
from nemo.collections.asr.models import EncDecSpeakerLabelModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Minimum rescaled similarity (range [0, 1], see compare_voice_print) that a
# stored speaker must reach to be reported as a match.
THRESHOLD = 0.60

model_name = "nvidia/speakerverification_en_titanet_large"
model = EncDecSpeakerLabelModel.from_pretrained(model_name).to(device)


def create_voice_print(audio):
    """Compute the L2-normalised speaker embedding of an audio file.

    Args:
        audio: Path to the audio file, as delivered by ``gr.Audio`` with
            ``type="filepath"``; falsy when nothing was uploaded.

    Returns:
        A list of floats (the unit-length embedding), which ``gr.JSON`` can
        serialise and which can be pasted into the ``loader`` tab. When no
        audio is given, a JSON string containing an ``"error"`` entry.
    """
    if not audio:
        return json.dumps({"error": "no se proporciono un audio"})

    embedding = model.get_embedding(audio).squeeze()
    unit_embedding = embedding / torch.linalg.norm(embedding)
    # A raw tensor is not JSON-serialisable; hand the UI a plain list instead.
    return unit_embedding.detach().cpu().tolist()


def compare_voice_print(X, Y):
    """Return the cosine similarity of two voice prints rescaled to [0, 1].

    Args:
        X: 1-D tensor holding the first voice print.
        Y: 1-D tensor holding the second voice print (same length as ``X``).

    Returns:
        A Python float: ``(cos(X, Y) + 1) / 2``.
    """
    cosine = torch.dot(X, Y) / ((torch.dot(X, X) * torch.dot(Y, Y)) ** 0.5)
    # Map the cosine range [-1, 1] onto [0, 1].
    rescaled = (cosine + 1) / 2
    return rescaled.item()


def _parse_voice_print(raw):
    """Turn a voice print (JSON text or a sequence of numbers) into a float tensor."""
    values = json.loads(raw) if isinstance(raw, str) else raw
    # Force float32 so integer-only lists still work with torch.dot.
    return torch.tensor(values, dtype=torch.float32)


# TODO: figure out how to run the voice print.
def find_matches(file, voice_print):
    """Find the stored speakers whose voice print best matches the given one.

    Args:
        file: Path to a JSON file shaped like
            ``{"data": [{"voice_print": ..., ...}, ...]}``. Each stored
            ``voice_print`` may be JSON text or a plain list of numbers.
        voice_print: The query voice print as JSON text (a list of numbers).

    Returns:
        Up to three ``{"speaker": ..., "similarity_score": ...}`` dicts whose
        score is at least ``THRESHOLD``, ordered from best to worst. When the
        input is missing or invalid, a JSON string containing an ``"error"``
        entry.

    Raises:
        KeyError: If a speaker entry lacks the ``"voice_print"`` key.
        ValueError, TypeError, RuntimeError: If a stored voice print is
            malformed or its length differs from the query's.
    """
    if not file:
        return json.dumps({"error": "No se proporcionó un archivo JSON"})

    try:
        # The context manager closes the handle even if parsing fails.
        with open(file, encoding="utf-8") as handle:
            json_content = json.load(handle)
    except (json.JSONDecodeError, UnicodeDecodeError):
        return json.dumps({"error": "El archivo JSON no es válido"})

    # The top level must be an object for the "data" lookup to make sense.
    if not isinstance(json_content, dict):
        return json.dumps({"error": "El archivo JSON no es válido"})

    data = json_content.get("data", [])

    # Convert the query to a tensor; an empty or malformed text box is
    # reported as an error payload instead of crashing the request.
    try:
        query = _parse_voice_print(voice_print)
    except (ValueError, TypeError, RuntimeError):
        return json.dumps({"error": "La huella de voz no es válida"})

    matches = []
    for speaker in data:
        speaker_voice_print = _parse_voice_print(speaker["voice_print"])
        similarity_score = compare_voice_print(query, speaker_voice_print)
        print(similarity_score)
        if similarity_score >= THRESHOLD:
            matches.append(
                {
                    "speaker": speaker,
                    "similarity_score": similarity_score,
                }
            )

    matches.sort(key=lambda match: match["similarity_score"], reverse=True)
    return matches[:3]


voice_print_maker = gr.Interface(
    fn=create_voice_print,
    inputs=[gr.Audio(type="filepath")],
    outputs=gr.JSON(),
)

voice_prints_loader = gr.Interface(
    fn=find_matches,
    inputs=[
        gr.File(type="filepath", label="Upload JSON file"),
        gr.TextArea(),
    ],
    outputs=gr.JSON(),
)

demo = gr.TabbedInterface(
    [voice_print_maker, voice_prints_loader],
    ["app", "loader"],
)
demo.launch()