Duplicate from enoreyes/call-sentiment-demo
Co-authored-by: Eno Reyes <enoreyes@users.noreply.huggingface.co>
- .gitattributes +31 -0
- Customer_Support_Call.wav +3 -0
- README.md +13 -0
- app.py +117 -0
- example_audio.wav +3 -0
- packages.txt +2 -0
- requirements.txt +12 -0
- short-take-1.wav +3 -0
- utils.py +116 -0
.gitattributes
ADDED
@@ -0,0 +1,31 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bin.* filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zstandard filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+example_audio.wav filter=lfs diff=lfs merge=lfs -text
+short-take-1.wav filter=lfs diff=lfs merge=lfs -text
+Customer_Support_Call.wav filter=lfs diff=lfs merge=lfs -text
Customer_Support_Call.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db6489658bb04f84503531d628a67028de9d754ee0b18cf229f39deec7828001
+size 31497612
README.md
ADDED
@@ -0,0 +1,13 @@
+---
+title: Call Sentiment Blocks 2
+emoji: 🐠
+colorFrom: blue
+colorTo: green
+sdk: gradio
+sdk_version: 3.11.0
+app_file: app.py
+pinned: false
+duplicated_from: enoreyes/call-sentiment-demo
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
app.py
ADDED
@@ -0,0 +1,117 @@
+import os
+import re
+import functools
+from functools import partial
+
+import requests
+import pandas as pd
+import plotly.express as px
+
+import torch
+import gradio as gr
+from transformers import pipeline, Wav2Vec2ProcessorWithLM
+from pyannote.audio import Pipeline
+import whisperx
+
+from utils import split, create_fig
+from utils import speech_to_text as stt
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+device = 0 if torch.cuda.is_available() else -1
+
+# display an emotion only if its score is above these thresholds
+thresholds = {"joy": 0.99, "anger": 0.95, "surprise": 0.95, "sadness": 0.98, "fear": 0.95, "love": 0.99}
+
+color_map = {"joy": "green", "anger": "red", "surprise": "yellow", "sadness": "blue", "fear": "orange", "love": "purple"}
+
+# Audio components
+whisper_device = "cuda" if torch.cuda.is_available() else "cpu"
+whisper = whisperx.load_model("tiny.en", whisper_device)
+alignment_model, metadata = whisperx.load_align_model(language_code="en", device=whisper_device)
+speaker_segmentation = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1",
+                                                use_auth_token=os.environ['ENO_TOKEN'])
+
+
+# Text components
+emotion_pipeline = pipeline(
+    "text-classification",
+    model="bhadresh-savani/distilbert-base-uncased-emotion",
+    device=device,
+)
+summarization_pipeline = pipeline(
+    "summarization",
+    model="knkarthick/MEETING_SUMMARY",
+    device=device
+)
+
+EXAMPLES = [["Customer_Support_Call.wav"]]
+
+
+speech_to_text = partial(
+    stt,
+    speaker_segmentation=speaker_segmentation,
+    whisper=whisper,
+    alignment_model=alignment_model,
+    metadata=metadata,
+    whisper_device=whisper_device
+)
+
+def summarize(diarized, summarization_pipeline):
+    text = ""
+    for d in diarized:
+        text += f"\n{d[1]}: {d[0]}"
+
+    return summarization_pipeline(text)[0]["summary_text"]
+
+def sentiment(diarized, emotion_pipeline):
+    customer_sentiments = []
+
+    for i in range(0, len(diarized), 2):
+        speaker_speech, speaker_id = diarized[i]
+        sentences = split(speaker_speech)
+
+        if "Customer" in speaker_id:
+            outputs = emotion_pipeline(sentences)
+            for idx, (o, t) in enumerate(zip(outputs, sentences)):
+                if o["score"] > thresholds[o["label"]]:
+                    customer_sentiments.append((t, o["label"]))
+
+    return customer_sentiments
+
+with gr.Blocks() as demo:
+
+    with gr.Row():
+        with gr.Column():
+            audio = gr.Audio(label="Audio file", type="filepath")
+            btn = gr.Button("Transcribe and Diarize")
+
+            gr.Markdown("**Call Transcript:**")
+            diarized = gr.HighlightedText(label="Call Transcript")
+            gr.Markdown("Summarize Speaker")
+            sum_btn = gr.Button("Get Summary")
+            summary = gr.Textbox(lines=4)
+            sentiment_btn = gr.Button("Get Customer Sentiment")
+            analyzed = gr.HighlightedText(color_map=color_map)
+
+        with gr.Column():
+            gr.Markdown("## Example Files")
+            gr.Examples(
+                examples=EXAMPLES,
+                inputs=[audio],
+                outputs=[diarized],
+                fn=speech_to_text,
+                cache_examples=True
+            )
+    # when the Transcribe and Diarize button is clicked, convert the audio file to text and diarize it by speaker
+    btn.click(
+        fn=speech_to_text,
+        inputs=audio,
+        outputs=diarized,
+    )
+    # when the Get Summary button is clicked, summarize the diarized transcript
+    sum_btn.click(fn=partial(summarize, summarization_pipeline=summarization_pipeline), inputs=[diarized], outputs=summary)
+
+    # when the Get Customer Sentiment button is clicked, display the customer's high-confidence emotions as highlighted text
+    sentiment_btn.click(fn=partial(sentiment, emotion_pipeline=emotion_pipeline), inputs=diarized, outputs=[analyzed])
+
+demo.launch(debug=1)
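For reference, the flow that app.py wires into the Gradio UI (pyannote diarization, whisperx transcription and alignment, then summarization and emotion tagging) can also be exercised directly. The sketch below is illustrative only and not part of the Space; it assumes the model-loading portion of app.py (everything above the `gr.Blocks` section) has already been run in the same session and that the `ENO_TOKEN` environment variable is set for pyannote.

```python
# Minimal sketch: call the same helpers app.py uses, on a bundled example file.
# Assumes speech_to_text, summarize, sentiment, summarization_pipeline and
# emotion_pipeline already exist from running the setup portion of app.py.
turns = speech_to_text("Customer_Support_Call.wav")   # [(text, "Customer"/"Support"/None), ...]

for text, speaker in turns:
    if speaker is not None:                            # None entries carry the "from X-Y" timestamps
        print(f"{speaker}: {text}")

print(summarize(turns, summarization_pipeline))        # abstractive summary of the whole call
print(sentiment(turns, emotion_pipeline))              # [(sentence, emotion_label), ...] for Customer turns
```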
example_audio.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:43166418f743e61807c7681944bf344c4720924adb4e5879dfa954dc7ecc82b2
+size 3202638
packages.txt
ADDED
@@ -0,0 +1,2 @@
+libsndfile1
+ffmpeg
requirements.txt
ADDED
@@ -0,0 +1,12 @@
+torch==1.11
+transformers==4.26.1
+torchvision==0.12.0
+torchaudio==0.11.0
+torchtext==0.12.0
+speechbrain==0.5.12
+pyannote.audio
+librosa
+requests
+speechbrain
+plotly
+git+https://github.com/m-bain/whisperx.git
short-take-1.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf15193510fc5a5680fdfdffda6c7cc5b8595bdde3d267b9ef5223e62035a952
+size 20079500
utils.py
ADDED
@@ -0,0 +1,116 @@
+import re
+import functools
+import requests
+import pandas as pd
+import plotly.express as px
+import torch
+import gradio as gr
+from transformers import pipeline, Wav2Vec2ProcessorWithLM
+from pyannote.audio import Pipeline
+from librosa import load, resample
+import whisperx
+
+import re
+alphabets = "([A-Za-z])"
+prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
+suffixes = "(Inc|Ltd|Jr|Sr|Co)"
+starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
+acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
+websites = "[.](com|net|org|io|gov)"
+
+def split(text):
+    text = " " + text + " "
+    text = text.replace("\n"," ")
+    text = re.sub(prefixes,"\\1<prd>",text)
+    text = re.sub(websites,"<prd>\\1",text)
+    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
+    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
+    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
+    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
+    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
+    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
+    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
+    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
+    if "”" in text: text = text.replace(".”","”.")
+    if "\"" in text: text = text.replace(".\"","\".")
+    if "!" in text: text = text.replace("!\"","\"!")
+    if "?" in text: text = text.replace("?\"","\"?")
+    text = text.replace(".",".<stop>")
+    text = text.replace("?","?<stop>")
+    text = text.replace("!","!<stop>")
+    text = text.replace("<prd>",".")
+    sentences = text.split("<stop>")
+    sentences = sentences[:-1]
+    sentences = [s.strip() for s in sentences]
+    return sentences
+
+def create_fig(x_min, x_max, to_plot, plot_sentences):
+    x, y = list(zip(*to_plot))
+
+    x_min -= 5
+    x_max += 5
+
+    plot_df = pd.DataFrame(
+        data={
+            "x": x,
+            "y": y,
+            "sentence": plot_sentences,
+        }
+    )
+
+    fig = px.line(
+        plot_df,
+        x="x",
+        y="y",
+        hover_data={
+            "sentence": True,
+            "x": True,
+            "y": False,
+        },
+        labels={"x": "time (seconds)", "y": "sentiment"},
+        title=f"Customer sentiment over time",
+        markers=True,
+    )
+
+    fig = fig.update_yaxes(categoryorder="category ascending")
+    fig = fig.update_layout(
+        font=dict(
+            size=18,
+        ),
+        xaxis_range=[x_min, x_max],
+    )
+
+    return fig
+
+def speech_to_text(speech_file, speaker_segmentation, whisper, alignment_model, metadata, whisper_device):
+    speaker_output = speaker_segmentation(speech_file)
+    result = whisper.transcribe(speech_file)
+
+    chunks = whisperx.align(result["segments"], alignment_model, metadata, speech_file, whisper_device)["word_segments"]
+
+    diarized_output = []
+    i = 0
+    speaker_counter = 0
+
+    # New iteration every time the speaker changes
+    for turn, _, _ in speaker_output.itertracks(yield_label=True):
+
+        speaker = "Customer" if speaker_counter % 2 == 0 else "Support"
+        diarized = ""
+        while i < len(chunks) and chunks[i]["end"] <= turn.end:
+            diarized += chunks[i]["text"] + " "
+            i += 1
+
+        if diarized != "":
+            # diarized = rpunct.punctuate(re.sub(eng_pattern, "", diarized), lang="en")
+
+            diarized_output.extend(
+                [
+                    (diarized, speaker),
+                    ("from {:.2f}-{:.2f}".format(turn.start, turn.end), None),
+                ]
+            )
+
+        speaker_counter += 1
+
+    return diarized_output
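The `split` helper above is a lightweight regex-based sentence splitter used to chop each customer turn into sentences before emotion classification, protecting abbreviations such as "Dr." or "Inc." from being treated as sentence boundaries. A quick illustration of its output, as an example only:

```python
# Example only: utils.split breaks a transcript chunk into sentences.
from utils import split

print(split("Thanks for calling Dr. Smith's office. How can I help you today? I need to reschedule."))
# ["Thanks for calling Dr. Smith's office.", "How can I help you today?", "I need to reschedule."]
```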