import functools
import re

import requests
import pandas as pd
import plotly.express as px
import torch
import gradio as gr
from transformers import pipeline, Wav2Vec2ProcessorWithLM
from pyannote.audio import Pipeline
from librosa import load, resample
import whisperx

# Regex fragments used by `split` to tell sentence-ending periods apart from
# periods inside abbreviations, initials, acronyms and domain names.
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"

# Temporary placeholders inserted while splitting: `_PRD` stands in for a
# period that must NOT end a sentence, `_STOP` marks a sentence boundary.
_PRD = "<prd>"
_STOP = "<stop>"


def split(text: str) -> list[str]:
    """Split *text* into a list of sentences.

    Periods that belong to titles ("Dr."), initials, acronyms, company
    suffixes ("Inc.") and common domain names are protected with a
    placeholder so that only genuine sentence terminators (".", "?", "!")
    produce a split. Newlines are treated as spaces.

    Args:
        text: The text to split.

    Returns:
        The sentences in order, each stripped of surrounding whitespace.
        A trailing fragment without terminal punctuation is kept as the
        last element; empty input yields an empty list.
    """
    text = " " + text + " "
    text = text.replace("\n", " ")

    # Protect periods that are not sentence boundaries.
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    if "Ph.D" in text:
        text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    # An acronym directly followed by a typical sentence starter does end
    # the sentence.
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(
        alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
        r"\1<prd>\2<prd>\3<prd>",
        text,
    )
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)

    # Move terminators outside closing quotes so the quote stays attached
    # to the sentence it closes.
    if "”" in text:
        text = text.replace(".”", "”.")
    if "\"" in text:
        text = text.replace(".\"", "\".")
    if "!" in text:
        text = text.replace("!\"", "\"!")
    if "?" in text:
        text = text.replace("?\"", "\"?")

    # Every remaining terminator is a real boundary.
    text = text.replace(".", "." + _STOP)
    text = text.replace("?", "?" + _STOP)
    text = text.replace("!", "!" + _STOP)
    # Restore the protected periods.
    text = text.replace(_PRD, ".")

    sentences = [piece.strip() for piece in text.split(_STOP)]
    # The piece after the final boundary is usually just padding; drop it
    # only when empty so unterminated trailing text is not lost.
    if sentences and not sentences[-1]:
        sentences.pop()
    return sentences


def speech_to_text(speech_file, speaker_segmentation, whisper, alignment_model, metadata, whisper_device):
    """Transcribe *speech_file* and group the words by speaker turn.

    The audio is diarized with ``speaker_segmentation``, transcribed with
    ``whisper.transcribe`` and word-aligned with ``whisperx.align``. Words
    are then assigned, in order, to the first diarization turn whose end
    time is not earlier than the word's end time.

    Args:
        speech_file: Audio input passed unchanged to the diarizer, the
            transcriber and the aligner.
        speaker_segmentation: Callable returning an object that exposes
            ``itertracks(yield_label=True)`` (presumably a pyannote
            pipeline -- confirm against the caller).
        whisper: Object with a ``transcribe`` method whose result has a
            ``"segments"`` entry.
        alignment_model: Alignment model forwarded to ``whisperx.align``.
        metadata: Alignment metadata forwarded to ``whisperx.align``.
        whisper_device: Device argument forwarded to ``whisperx.align``.

    Returns:
        A flat list of 2-tuples. Each turn that received at least one word
        contributes ``(text, speaker)`` followed by
        ``("from <start>-<end>", None)``, where ``speaker`` alternates
        between ``"Customer"`` and ``"Support"``.
    """
    speaker_output = speaker_segmentation(speech_file)
    result = whisper.transcribe(speech_file)
    aligned = whisperx.align(
        result["segments"], alignment_model, metadata, speech_file, whisper_device
    )
    chunks = aligned["word_segments"]

    diarized_output = []
    next_chunk = 0
    speaker_counter = 0

    # One iteration per diarization turn.
    for turn, _, _ in speaker_output.itertracks(yield_label=True):
        speaker = "Customer" if speaker_counter % 2 == 0 else "Support"

        # Consume every not-yet-assigned word that ends within this turn.
        words = []
        while next_chunk < len(chunks) and chunks[next_chunk]["end"] <= turn.end:
            words.append(chunks[next_chunk]["text"])
            next_chunk += 1

        if words:
            # Each word is followed by a single space, including the last.
            diarized = " ".join(words) + " "
            # Punctuation restoration (rpunct) is currently disabled.
            diarized_output.extend(
                [
                    (diarized, speaker),
                    (f"from {turn.start:.2f}-{turn.end:.2f}", None),
                ]
            )
            # NOTE(review): the label alternates only after a turn that
            # produced text; the collapsed source does not show whether
            # this increment was nested under the `if` -- confirm.
            speaker_counter += 1

    return diarized_output