dwarkesh committed on
Commit 626e00c
1 Parent(s): c8693c4
Files changed (2)
  1. app.py +98 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,98 @@
+import whisper
+import gradio as gr
+import datetime
+
+import subprocess
+
+import torch
+import pyannote.audio
+from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
+
+from pyannote.audio import Audio
+from pyannote.core import Segment
+
+import wave
+import contextlib
+import math
+
+from sklearn.cluster import AgglomerativeClustering
+import numpy as np
+
+model = whisper.load_model("large-v2")
+embedding_model = PretrainedSpeakerEmbedding(
+    "speechbrain/spkrec-ecapa-voxceleb",
+    device=torch.device("cuda"))
+
+def transcribe(audio, num_speakers):
+    path = convert_to_wav(audio)
+    result = model.transcribe(path)
+    segments = result["segments"]
+    num_speakers = max(round(num_speakers), 1)
+    if len(segments) < num_speakers:
+        num_speakers = len(segments)
+    if len(segments) == 1:
+        segments[0]['speaker'] = 'SPEAKER 1'
+    else:
+        duration = get_duration(path)
+        embeddings = make_embeddings(path, segments, duration)
+        add_speaker_labels(segments, embeddings, num_speakers)
+    output = get_output(segments)
+    return output
+
+def convert_to_wav(path):
+    if path[-3:] != 'wav':
+        subprocess.call(['ffmpeg', '-i', path, 'audio.wav', '-y'])
+        path = 'audio.wav'
+    return path
+
+def get_duration(path):
+    with contextlib.closing(wave.open(path, 'r')) as f:
+        frames = f.getnframes()
+        rate = f.getframerate()
+        return frames / float(rate)
+
+def make_embeddings(path, segments, duration):
+    embeddings = np.zeros(shape=(len(segments), 192))
+    for i, segment in enumerate(segments):
+        embeddings[i] = segment_embedding(path, segment, duration)
+    return np.nan_to_num(embeddings)
+
+audio = Audio()
+
+def segment_embedding(path, segment, duration):
+    start = segment["start"]
+    # Whisper overshoots the end timestamp in the last segment
+    end = min(duration, segment["end"])
+    clip = Segment(start, end)
+    waveform, sample_rate = audio.crop(path, clip)
+    return embedding_model(waveform[None])
+
+def add_speaker_labels(segments, embeddings, num_speakers):
+    clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
+    labels = clustering.labels_
+    for i in range(len(segments)):
+        segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
+
+def time(secs):
+    return datetime.timedelta(seconds=round(secs))
+
+def get_output(segments):
+    output = ''
+    for (i, segment) in enumerate(segments):
+        if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
+            output += "\n" + segment["speaker"] + ' ' + str(time(segment["start"])) + '\n'
+        output += segment["text"][1:] + ' '
+    return output[1:]
+
+gr.Interface(
+    title='Whisper with Speaker Recognition',
+    fn=transcribe,
+    inputs=[
+        gr.inputs.Audio(source="upload", type="filepath"),
+        gr.inputs.Number(default=2, label="Number of Speakers")
+
+    ],
+    outputs=[
+        gr.outputs.Textbox(label='Transcript')
+    ],
+    debug=True).launch()
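
Note: the core of app.py is the diarization step. Whisper produces timestamped transcript segments, the pyannote/speechbrain "spkrec-ecapa-voxceleb" model turns each segment into a 192-dimensional speaker embedding, and AgglomerativeClustering groups those embeddings into the requested number of speakers. A minimal, self-contained sketch of that clustering-and-labeling step (random placeholder embeddings stand in for real ones, so the labels are illustrative only):

import numpy as np
from sklearn.cluster import AgglomerativeClustering

# Placeholder embeddings: 6 transcript segments, one 192-dim vector each.
rng = np.random.default_rng(0)
embeddings = rng.normal(size=(6, 192))
num_speakers = 2

# Cluster the segment embeddings and turn cluster ids into speaker tags,
# mirroring add_speaker_labels() in app.py above.
labels = AgglomerativeClustering(n_clusters=num_speakers).fit(embeddings).labels_
for i, label in enumerate(labels):
    print(f"segment {i}: SPEAKER {label + 1}")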
requirements.txt ADDED
@@ -0,0 +1,3 @@
+git+https://github.com/pyannote/pyannote-audio
+git+https://github.com/openai/whisper.git
+gradio
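
Note: to run the Space locally, these dependencies can be installed with pip install -r requirements.txt. ffmpeg must also be available on the system PATH, since app.py shells out to it for format conversion (and Whisper relies on it for audio decoding), and the embedding model is loaded with device=torch.device("cuda"), so a CUDA-capable GPU is assumed.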