Spaces:
Build error
Build error
pustozerov
committed on
Commit
•
02dca0a
0
Parent(s):
Temporarily remove samples.
Browse files
- .gitattributes +2 -0
- .gitignore +2 -0
- .idea/.gitignore +3 -0
- .idea/PoCCallTranscription.iml +11 -0
- .idea/inspectionProfiles/Project_Default.xml +75 -0
- .idea/inspectionProfiles/profiles_settings.xml +6 -0
- .idea/misc.xml +4 -0
- .idea/modules.xml +8 -0
- .idea/vcs.xml +6 -0
- Dockerfile +13 -0
- Interface.py +75 -0
- info/configs/manifests/external_vad_manifest.json +20 -0
- info/configs/manifests/input_manifest.json +1 -0
- info/configs/offline_diarization_asr.yaml +64 -0
- modules/diarization/nemo_diarization.py +76 -0
- packages.txt +4 -0
- requirements.txt +42 -0
.gitattributes
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
*.wav filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.ogg filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
/data/user_data/
|
2 |
+
/info/transcripts/
|
.idea/.gitignore
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
# Default ignored files
|
2 |
+
/shelf/
|
3 |
+
/workspace.xml
|
.idea/PoCCallTranscription.iml
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<module type="PYTHON_MODULE" version="4">
|
3 |
+
<component name="NewModuleRootManager">
|
4 |
+
<content url="file://$MODULE_DIR$">
|
5 |
+
<excludeFolder url="file://$MODULE_DIR$/data/user_data" />
|
6 |
+
<excludeFolder url="file://$MODULE_DIR$/info/transcripts" />
|
7 |
+
</content>
|
8 |
+
<orderEntry type="inheritedJdk" />
|
9 |
+
<orderEntry type="sourceFolder" forTests="false" />
|
10 |
+
</component>
|
11 |
+
</module>
|
.idea/inspectionProfiles/Project_Default.xml
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<component name="InspectionProjectProfileManager">
|
2 |
+
<profile version="1.0">
|
3 |
+
<option name="myName" value="Project Default" />
|
4 |
+
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
|
5 |
+
<option name="ignoredPackages">
|
6 |
+
<value>
|
7 |
+
<list size="23">
|
8 |
+
<item index="0" class="java.lang.String" itemvalue="parselmouth" />
|
9 |
+
<item index="1" class="java.lang.String" itemvalue="torchvision" />
|
10 |
+
<item index="2" class="java.lang.String" itemvalue="opencv-python-headless" />
|
11 |
+
<item index="3" class="java.lang.String" itemvalue="pyannote-audio" />
|
12 |
+
<item index="4" class="java.lang.String" itemvalue="pyannote" />
|
13 |
+
<item index="5" class="java.lang.String" itemvalue="scipy" />
|
14 |
+
<item index="6" class="java.lang.String" itemvalue="nemo_toolkit" />
|
15 |
+
<item index="7" class="java.lang.String" itemvalue="scikit-learn" />
|
16 |
+
<item index="8" class="java.lang.String" itemvalue="scikit_learn" />
|
17 |
+
<item index="9" class="java.lang.String" itemvalue="aiogram" />
|
18 |
+
<item index="10" class="java.lang.String" itemvalue="wget" />
|
19 |
+
<item index="11" class="java.lang.String" itemvalue="sklearn" />
|
20 |
+
<item index="12" class="java.lang.String" itemvalue="nemo" />
|
21 |
+
<item index="13" class="java.lang.String" itemvalue="pydub" />
|
22 |
+
<item index="14" class="java.lang.String" itemvalue="numpy" />
|
23 |
+
<item index="15" class="java.lang.String" itemvalue="omegaconf" />
|
24 |
+
<item index="16" class="java.lang.String" itemvalue="pandas" />
|
25 |
+
<item index="17" class="java.lang.String" itemvalue="importlib" />
|
26 |
+
<item index="18" class="java.lang.String" itemvalue="spacy" />
|
27 |
+
<item index="19" class="java.lang.String" itemvalue="matplotlib" />
|
28 |
+
<item index="20" class="java.lang.String" itemvalue="librosa" />
|
29 |
+
<item index="21" class="java.lang.String" itemvalue="xgboost" />
|
30 |
+
<item index="22" class="java.lang.String" itemvalue="torchaudio" />
|
31 |
+
</list>
|
32 |
+
</value>
|
33 |
+
</option>
|
34 |
+
</inspection_tool>
|
35 |
+
<inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
|
36 |
+
<option name="ignoredErrors">
|
37 |
+
<list>
|
38 |
+
<option value="E722" />
|
39 |
+
<option value="E402" />
|
40 |
+
</list>
|
41 |
+
</option>
|
42 |
+
</inspection_tool>
|
43 |
+
<inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
|
44 |
+
<option name="ignoredErrors">
|
45 |
+
<list>
|
46 |
+
<option value="N806" />
|
47 |
+
<option value="N803" />
|
48 |
+
</list>
|
49 |
+
</option>
|
50 |
+
</inspection_tool>
|
51 |
+
<inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
|
52 |
+
<option name="ignoredIdentifiers">
|
53 |
+
<list>
|
54 |
+
<option value="parselmouth" />
|
55 |
+
<option value="tuple.mean" />
|
56 |
+
<option value="tuple.variance" />
|
57 |
+
<option value="tuple.minmax" />
|
58 |
+
<option value="tuple.kurtosis" />
|
59 |
+
<option value="tuple.skewness" />
|
60 |
+
<option value="list.__getitem__" />
|
61 |
+
<option value="numpy.core._multiarray_umath.ndarray.A" />
|
62 |
+
<option value="cv2.remap" />
|
63 |
+
<option value="cv2.INTER_LINEAR" />
|
64 |
+
<option value="cv2.initUndistortRectifyMap" />
|
65 |
+
<option value="cv2.getOptimalNewCameraMatrix" />
|
66 |
+
<option value="cv2.resize" />
|
67 |
+
<option value="modules.data_generator.svgs2ttf.fontforge" />
|
68 |
+
<option value="modules.data_generator.ttf2pngs.fontforge" />
|
69 |
+
<option value="os.sys" />
|
70 |
+
<option value="typing.TextIO.__getitem__" />
|
71 |
+
</list>
|
72 |
+
</option>
|
73 |
+
</inspection_tool>
|
74 |
+
</profile>
|
75 |
+
</component>
|
.idea/inspectionProfiles/profiles_settings.xml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<component name="InspectionProjectProfileManager">
|
2 |
+
<settings>
|
3 |
+
<option name="USE_PROJECT_PROFILE" value="false" />
|
4 |
+
<version value="1.0" />
|
5 |
+
</settings>
|
6 |
+
</component>
|
.idea/misc.xml
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10" project-jdk-type="Python SDK" />
|
4 |
+
</project>
|
.idea/modules.xml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="ProjectModuleManager">
|
4 |
+
<modules>
|
5 |
+
<module fileurl="file://$PROJECT_DIR$/.idea/PoCCallTranscription.iml" filepath="$PROJECT_DIR$/.idea/PoCCallTranscription.iml" />
|
6 |
+
</modules>
|
7 |
+
</component>
|
8 |
+
</project>
|
.idea/vcs.xml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="VcsDirectoryMappings">
|
4 |
+
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
5 |
+
</component>
|
6 |
+
</project>
|
Dockerfile
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.10-slim
WORKDIR /app

# System packages:
#   * libsndfile1 and ffmpeg are the runtime libraries listed in packages.txt
#     (needed by the SoundFile and pydub Python packages respectively).
#   * git and build-essential are required by pip for the git+https and
#     source-archive entries in requirements.txt; the slim base image ships
#     neither.
RUN apt-get -y update \
    && apt-get -y upgrade \
    && apt-get -y install --no-install-recommends libsndfile1 ffmpeg git build-essential \
    && rm -rf /var/lib/apt/lists/*

# Install every package one after another to track time.
# requirements.txt is copied on its own first so that editing application
# sources does not invalidate the (slow) dependency layer.
COPY requirements.txt .
RUN python -m pip install --upgrade pip
RUN pip install -r requirements.txt

COPY . .

# Interface.py is a Streamlit script, so it has to be launched through the
# Streamlit CLI; running it with plain `python` does not start the web server.
EXPOSE 8501
CMD ["streamlit", "run", "Interface.py", "--server.address=0.0.0.0"]
# Next commands are: docker build -t pustozerov/poc-call-transcription:1.0 .
|
Interface.py
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import glob
|
2 |
+
import random
|
3 |
+
import os
|
4 |
+
import soundfile as sf
|
5 |
+
import streamlit as st
|
6 |
+
from pydub import AudioSegment
|
7 |
+
|
8 |
+
from modules.diarization.nemo_diarization import diarization
|
9 |
+
|
10 |
+
def _play_audio(path):
    """Render an audio player for the file at *path* and close the file afterwards."""
    with open(path, 'rb') as audio_file:
        st.audio(audio_file.read())


def _estimated_seconds(path, realtime_factor):
    """Return clip length in seconds divided by *realtime_factor*.

    The clip length is taken from the frame count and sample rate reported by
    ``soundfile`` for the file at *path*.
    """
    with sf.SoundFile(path) as sound_file:
        return sound_file.frames / (sound_file.samplerate * realtime_factor)


def _transcribe_and_report(path, widget_key):
    """Run diarization on *path*, show summary statistics and offer the transcript.

    Args:
        path: Audio file handed to ``diarization``.
        widget_key: Unique Streamlit key for the download button. Both page
            sections can render a button in the same run, and two widgets
            with identical arguments need distinct keys.
    """
    file_name = os.path.basename(path).split(".")[0]
    result = diarization(path)
    with open("info/transcripts/pred_rttms/" + file_name + ".txt") as transcript_file:
        transcript = transcript_file.read()
    st.write("Transcription completed.")
    st.write("Number of speakers: %s" % result[file_name]["speaker_count"])
    st.write("Sentences: %s" % len(result[file_name]["sentences"]))
    st.write("Words: %s" % len(result[file_name]["words"]))
    st.download_button(
        label="Download audio transcript",
        data=transcript,
        file_name='transcript.txt',
        mime='text/plain',  # the transcript is plain text, not CSV
        key=widget_key,
    )


st.title('Call Transcription demo')
st.subheader('This simple demo shows the possibilities of the ASR and NLP in the task of '
             'automatic speech recognition and diarization. It works with mp3, ogg and wav files. You can randomly '
             'pick a recording from the built-in database or try uploading your own files.')


if st.button('Try random samples from the database'):
    folder = "data/datasets/crema_d_diarization_chunks"
    os.makedirs(folder, exist_ok=True)
    list_all_audio = glob.glob(folder + "/*.wav")
    if not list_all_audio:
        # random.choice/random.sample raise on an empty population; tell the
        # user instead of crashing the page when no samples are shipped.
        st.warning("No sample recordings are available in the built-in database. "
                   "Please upload your own file instead.")
    else:
        chosen_file = random.choice(list_all_audio)
        _play_audio(chosen_file)
        st.write("Starting transcription. Estimated processing time: %0.1f seconds"
                 % _estimated_seconds(chosen_file, 5))
        _transcribe_and_report(chosen_file, "download_sample_transcript")

uploaded_file = st.file_uploader("Choose your recording with a speech",
                                 accept_multiple_files=False, type=["mp3", "wav", "ogg"])
if uploaded_file is not None:
    folder = "data/user_data/"
    os.makedirs(folder, exist_ok=True)
    # Only one user recording is kept at a time: drop whatever was there before.
    for f in glob.glob(folder + '*'):
        os.remove(f)
    # Decide the decoder from the uploaded file's *name*; the upload object
    # itself is a byte stream, so testing `".mp3" in uploaded_file` never matches.
    stem, extension = os.path.splitext(os.path.basename(uploaded_file.name))
    extension = extension.lower()
    if extension == ".mp3":
        sound = AudioSegment.from_mp3(uploaded_file)
    elif extension == ".ogg":
        sound = AudioSegment.from_ogg(uploaded_file)
    else:
        sound = AudioSegment.from_wav(uploaded_file)
    # The export is always mono WAV, so store it under a .wav name. The part of
    # the name before the first dot (used to locate the transcript) is unchanged.
    save_path = folder + stem + ".wav"
    sound.export(save_path, format="wav", parameters=["-ac", "1"])
    _play_audio(save_path)
    estimate = _estimated_seconds(save_path, 3)
    st.write("Starting transcription. Estimated processing time: %0.0f minutes and %02.0f seconds"
             % (estimate // 60, estimate % 60))
    _transcribe_and_report(save_path, "download_upload_transcript")
|
info/configs/manifests/external_vad_manifest.json
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 0.9703125, "duration": 1.1475000000000004, "label": "SPEECH", "uniq_id": "26"}
|
2 |
+
{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 3.2146875, "duration": 1.7381250000000001, "label": "SPEECH", "uniq_id": "26"}
|
3 |
+
{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 5.340937500000001, "duration": 1.8899999999999988, "label": "SPEECH", "uniq_id": "26"}
|
4 |
+
{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 8.4965625, "duration": 1.0293750000000017, "label": "SPEECH", "uniq_id": "26"}
|
5 |
+
{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 10.454062500000003, "duration": 11.930625, "label": "SPEECH", "uniq_id": "26"}
|
6 |
+
{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 23.0934375, "duration": 1.6537500000000023, "label": "SPEECH", "uniq_id": "26"}
|
7 |
+
{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 26.9746875, "duration": 1.5187500000000007, "label": "SPEECH", "uniq_id": "26"}
|
8 |
+
{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 29.7253125, "duration": 3.526875000000004, "label": "SPEECH", "uniq_id": "26"}
|
9 |
+
{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 34.9228125, "duration": 1.2150000000000034, "label": "SPEECH", "uniq_id": "26"}
|
10 |
+
{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 37.2009375, "duration": 1.4343750000000028, "label": "SPEECH", "uniq_id": "26"}
|
11 |
+
{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 40.5084375, "duration": 1.8225000000000051, "label": "SPEECH", "uniq_id": "26"}
|
12 |
+
{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 42.972187500000004, "duration": 3.628124999999997, "label": "SPEECH", "uniq_id": "26"}
|
13 |
+
{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 47.7478125, "duration": 1.822499999999998, "label": "SPEECH", "uniq_id": "26"}
|
14 |
+
{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 50.5153125, "duration": 1.9575000000000031, "label": "SPEECH", "uniq_id": "26"}
|
15 |
+
{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 53.1478125, "duration": 1.7212500000000048, "label": "SPEECH", "uniq_id": "26"}
|
16 |
+
{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 55.3246875, "duration": 1.4681250000000006, "label": "SPEECH", "uniq_id": "26"}
|
17 |
+
{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 58.6153125, "duration": 1.7718750000000014, "label": "SPEECH", "uniq_id": "26"}
|
18 |
+
{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 60.9271875, "duration": 1.8900000000000006, "label": "SPEECH", "uniq_id": "26"}
|
19 |
+
{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 64.01531250000001, "duration": 1.4681250000000006, "label": "SPEECH", "uniq_id": "26"}
|
20 |
+
{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 66.4115625, "duration": 1.991250000000008, "label": "SPEECH", "uniq_id": "26"}
|
info/configs/manifests/input_manifest.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 0, "duration": null, "label": "infer", "text": "-", "num_speakers": 2, "rttm_filepath": null, "uem_filepath": null}
|
info/configs/offline_diarization_asr.yaml
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: ClusterDiarizer
|
2 |
+
num_workers: 0
|
3 |
+
sample_rate: 16000
|
4 |
+
batch_size: 64
|
5 |
+
diarizer:
|
6 |
+
manifest_filepath: ???
|
7 |
+
out_dir: ???
|
8 |
+
oracle_vad: false
|
9 |
+
collar: 0.25
|
10 |
+
ignore_overlap: true
|
11 |
+
vad:
|
12 |
+
model_path: null
|
13 |
+
external_vad_manifest: null
|
14 |
+
parameters:
|
15 |
+
window_length_in_sec: 0.15
|
16 |
+
shift_length_in_sec: 0.01
|
17 |
+
smoothing: median
|
18 |
+
overlap: 0.875
|
19 |
+
onset: 0.4
|
20 |
+
offset: 0.7
|
21 |
+
pad_onset: 0.05
|
22 |
+
pad_offset: -0.1
|
23 |
+
min_duration_on: 0.2
|
24 |
+
min_duration_off: 0.2
|
25 |
+
filter_speech_first: true
|
26 |
+
speaker_embeddings:
|
27 |
+
model_path: ???
|
28 |
+
parameters:
|
29 |
+
window_length_in_sec: 1.5
|
30 |
+
shift_length_in_sec: 0.75
|
31 |
+
multiscale_weights: null
|
32 |
+
save_embeddings: false
|
33 |
+
clustering:
|
34 |
+
parameters:
|
35 |
+
oracle_num_speakers: false
|
36 |
+
max_num_speakers: 20
|
37 |
+
enhanced_count_thres: 80
|
38 |
+
max_rp_threshold: 0.25
|
39 |
+
sparse_search_volume: 30
|
40 |
+
maj_vote_spk_count: false
|
41 |
+
asr:
|
42 |
+
model_path: ???
|
43 |
+
parameters:
|
44 |
+
asr_based_vad: false
|
45 |
+
asr_based_vad_threshold: 0.05
|
46 |
+
asr_batch_size: null
|
47 |
+
lenient_overlap_WDER: true
|
48 |
+
decoder_delay_in_sec: null
|
49 |
+
word_ts_anchor_offset: null
|
50 |
+
word_ts_anchor_pos: start
|
51 |
+
fix_word_ts_with_VAD: false
|
52 |
+
colored_text: false
|
53 |
+
print_time: true
|
54 |
+
break_lines: false
|
55 |
+
ctc_decoder_parameters:
|
56 |
+
pretrained_language_model: null
|
57 |
+
beam_width: 32
|
58 |
+
alpha: 0.5
|
59 |
+
beta: 2.5
|
60 |
+
realigning_lm_parameters:
|
61 |
+
arpa_language_model: null
|
62 |
+
min_number_of_words: 3
|
63 |
+
max_number_of_words: 10
|
64 |
+
logprob_diff_threshold: 1.2
|
modules/diarization/nemo_diarization.py
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
from nemo.collections.asr.parts.utils.decoder_timestamps_utils import ASR_TIMESTAMPS
|
4 |
+
from nemo.collections.asr.parts.utils.diarization_utils import ASR_DIAR_OFFLINE
|
5 |
+
from omegaconf import OmegaConf
|
6 |
+
from pyannote.audio import Pipeline
|
7 |
+
|
8 |
+
# All working directories are anchored on the directory the process was
# started from; the model config path is kept relative to it.
ROOT = os.getcwd()
MODEL_CONFIG = "info/configs/offline_diarization_asr.yaml"
data_dir = os.path.join(ROOT, 'info/configs/')      # configs and generated manifests
output_dir = os.path.join(ROOT, 'info/transcripts/')  # diarizer output directory

# Create both directories up front (no error if they already exist).
for _directory in (data_dir, output_dir):
    os.makedirs(_directory, exist_ok=True)
|
14 |
+
|
15 |
+
|
16 |
+
def diarization(file_path):
    """Run external VAD, ASR and speaker diarization on one audio file.

    Steps, in order:
      1. Write ``manifests/input_manifest.json`` under ``data_dir`` describing
         *file_path*.
      2. Run the pretrained ``pyannote/voice-activity-detection`` pipeline on
         the file and write its speech segments to
         ``manifests/external_vad_manifest.json`` (one JSON object per line).
      3. Load ``MODEL_CONFIG``, fill in model names, manifest paths and the
         output directory, then run ``ASR_TIMESTAMPS`` followed by
         ``ASR_DIAR_OFFLINE``.

    Args:
        file_path: Path to the audio file to process.

    Returns:
        The value returned by
        ``ASR_DIAR_OFFLINE.get_transcript_with_speaker_labels`` for the
        diarization and ASR hypotheses.
    """
    # Create a manifest for input with below format.
    # {'audio_filepath': /path/to/audio_file, 'offset': 0, 'duration':None, 'label': 'infer', 'text': '-',
    # 'num_speakers': None, 'rttm_filepath': /path/to/rttm/file, 'uem_filepath'='/path/to/uem/filepath'}
    import json

    manifest_dir = os.path.join(data_dir, 'manifests/')
    input_manifest_path = os.path.join(manifest_dir, 'input_manifest.json')
    vad_manifest_path = os.path.join(manifest_dir, 'external_vad_manifest.json')

    meta = {
        'audio_filepath': file_path,
        'offset': 0,
        'duration': None,
        'label': 'infer',
        'text': '-',
        'num_speakers': 2,
        'rttm_filepath': None,
        'uem_filepath': None
    }
    with open(input_manifest_path, 'w') as fp:
        json.dump(meta, fp)
        fp.write('\n')

    # Make a manifest with an external VAD
    pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection")
    output = pipeline(file_path)
    initial_json = output.for_json()
    output_json = [
        {
            "audio_filepath": file_path,
            "offset": segment["segment"]["start"],
            "duration": segment["segment"]["end"] - segment["segment"]["start"],
            "label": "SPEECH",
            "uniq_id": initial_json["uri"],
        }
        for segment in initial_json["content"]
    ]
    with open(vad_manifest_path, 'w') as f:
        for item in output_json:
            # json.dumps produces valid JSON for any value (quotes inside
            # paths, None, booleans); rewriting the dict repr by swapping
            # quote characters does not.
            f.write(json.dumps(item) + '\n')

    config2 = OmegaConf.load(MODEL_CONFIG)
    config2.diarizer.asr.model_path = 'QuartzNet15x5Base-En'
    config2.diarizer.manifest_filepath = input_manifest_path
    config2.diarizer.speaker_embeddings.model_path = 'titanet_large'
    config2.diarizer.vad.external_vad_manifest = vad_manifest_path
    config2.diarizer.out_dir = output_dir
    config2.num_workers = 0

    asr_ts_decoder = ASR_TIMESTAMPS(**config2.diarizer)
    asr_model = asr_ts_decoder.set_asr_model()
    word_hyp, word_ts_hyp = asr_ts_decoder.run_ASR(asr_model)
    print(word_hyp)
    print(word_ts_hyp)

    asr_diar_offline = ASR_DIAR_OFFLINE(**config2.diarizer)
    # Reuse the anchor offset determined by the ASR timestamp decoder.
    asr_diar_offline.word_ts_anchor_offset = asr_ts_decoder.word_ts_anchor_offset
    diar_hyp, diar_score = asr_diar_offline.run_diarization(config2, word_ts_hyp)
    print("Diarization hypothesis output: \n", diar_hyp)
    result = asr_diar_offline.get_transcript_with_speaker_labels(diar_hyp, word_hyp, word_ts_hyp)

    # Path of the transcript inside the diarizer output directory. The file
    # name is the audio base name up to its first dot plus ".txt";
    # os.path.basename handles both "/" and the platform's own separator.
    file_stem = os.path.basename(file_path).split(".")[0]
    file_to_show = os.path.join(output_dir, 'pred_rttms', file_stem + '.txt')
    print(file_to_show)
    print(diar_hyp)
    return result
|
packages.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
libsndfile1
|
2 |
+
ffmpeg
|
3 |
+
python3-pip
|
4 |
+
python3-dev
|
requirements.txt
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
astroid
|
2 |
+
braceexpand==0.1.7
|
3 |
+
editdistance==0.6.0
|
4 |
+
einops==0.3.2
|
5 |
+
h5py==3.7.0
|
6 |
+
hydra-core==1.1.2
|
7 |
+
ijson==3.1.4
|
8 |
+
inflect==5.6.0
|
9 |
+
ipadic==1.0.0
|
10 |
+
ipython==8.4.0
|
11 |
+
jieba==0.42.1
|
12 |
+
kenlm @ https://github.com/kpu/kenlm/archive/master.zip
|
13 |
+
librosa==0.9.2
|
14 |
+
mecab-python3==1.0.5
|
15 |
+
nemo-toolkit @ git+https://github.com/NVIDIA/NeMo.git@6442e339a47d30a106d869d1ef29cc1294753b75
|
16 |
+
omegaconf==2.1.2
|
17 |
+
OpenCC==1.1.1
|
18 |
+
pangu==4.0.6.1
|
19 |
+
praat-parselmouth==0.4.1
|
20 |
+
protobuf==3.19.4
|
21 |
+
psutil==5.9.1
|
22 |
+
pyannote.audio @ https://github.com/pyannote/pyannote-audio/archive/develop.zip
|
23 |
+
pyannote.core==4.4
|
24 |
+
pyannote.database==4.1.3
|
25 |
+
pyannote.metrics==3.2
|
26 |
+
pyannote.pipeline==2.3
|
27 |
+
pyctcdecode==0.3.0
|
28 |
+
pydub==0.25.1
|
29 |
+
pytorch-lightning==1.6.5
|
30 |
+
sacrebleu==2.1.0
|
31 |
+
sacremoses==0.0.53
|
32 |
+
sentencepiece==0.1.96
|
33 |
+
SoundFile==0.10.3.post1
|
34 |
+
spacy==3.4.0
|
35 |
+
speechbrain @ git+https://github.com/speechbrain/speechbrain.git
|
36 |
+
streamlit==1.10.0
|
37 |
+
torch==1.12.0
|
38 |
+
torchaudio==0.12.0
|
39 |
+
transformers==4.20.0
|
40 |
+
webdataset==0.1.62
|
41 |
+
Cython==0.29.14
|
42 |
+
youtokentome
|