Spaces:

pustozerov
/

poc_call_transcription

Build error

App Files Files Community

pustozerov committed on Aug 10, 2022

Commit

02dca0a

•

0 Parent(s):

Temporarily remove samples.

Browse files

Files changed (17) hide show

.gitattributes +2 -0
.gitignore +2 -0
.idea/.gitignore +3 -0
.idea/PoCCallTranscription.iml +11 -0
.idea/inspectionProfiles/Project_Default.xml +75 -0
.idea/inspectionProfiles/profiles_settings.xml +6 -0
.idea/misc.xml +4 -0
.idea/modules.xml +8 -0
.idea/vcs.xml +6 -0
Dockerfile +13 -0
Interface.py +75 -0
info/configs/manifests/external_vad_manifest.json +20 -0
info/configs/manifests/input_manifest.json +1 -0
info/configs/offline_diarization_asr.yaml +64 -0
modules/diarization/nemo_diarization.py +76 -0
packages.txt +4 -0
requirements.txt +42 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ *.wav filter=lfs diff=lfs merge=lfs -text
2	+ *.ogg filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ /data/user_data/
2	+ /info/transcripts/

.idea/.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+# Default ignored files
+/shelf/
+/workspace.xml

.idea/PoCCallTranscription.iml ADDED Viewed

	@@ -0,0 +1,11 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$">
+      <excludeFolder url="file://$MODULE_DIR$/data/user_data" />
+      <excludeFolder url="file://$MODULE_DIR$/info/transcripts" />
+    </content>
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>

.idea/inspectionProfiles/Project_Default.xml ADDED Viewed

	@@ -0,0 +1,75 @@

+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ignoredPackages">
+        <value>
+          <list size="23">
+            <item index="0" class="java.lang.String" itemvalue="parselmouth" />
+            <item index="1" class="java.lang.String" itemvalue="torchvision" />
+            <item index="2" class="java.lang.String" itemvalue="opencv-python-headless" />
+            <item index="3" class="java.lang.String" itemvalue="pyannote-audio" />
+            <item index="4" class="java.lang.String" itemvalue="pyannote" />
+            <item index="5" class="java.lang.String" itemvalue="scipy" />
+            <item index="6" class="java.lang.String" itemvalue="nemo_toolkit" />
+            <item index="7" class="java.lang.String" itemvalue="scikit-learn" />
+            <item index="8" class="java.lang.String" itemvalue="scikit_learn" />
+            <item index="9" class="java.lang.String" itemvalue="aiogram" />
+            <item index="10" class="java.lang.String" itemvalue="wget" />
+            <item index="11" class="java.lang.String" itemvalue="sklearn" />
+            <item index="12" class="java.lang.String" itemvalue="nemo" />
+            <item index="13" class="java.lang.String" itemvalue="pydub" />
+            <item index="14" class="java.lang.String" itemvalue="numpy" />
+            <item index="15" class="java.lang.String" itemvalue="omegaconf" />
+            <item index="16" class="java.lang.String" itemvalue="pandas" />
+            <item index="17" class="java.lang.String" itemvalue="importlib" />
+            <item index="18" class="java.lang.String" itemvalue="spacy" />
+            <item index="19" class="java.lang.String" itemvalue="matplotlib" />
+            <item index="20" class="java.lang.String" itemvalue="librosa" />
+            <item index="21" class="java.lang.String" itemvalue="xgboost" />
+            <item index="22" class="java.lang.String" itemvalue="torchaudio" />
+          </list>
+        </value>
+      </option>
+    </inspection_tool>
+    <inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+      <option name="ignoredErrors">
+        <list>
+          <option value="E722" />
+          <option value="E402" />
+        </list>
+      </option>
+    </inspection_tool>
+    <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+      <option name="ignoredErrors">
+        <list>
+          <option value="N806" />
+          <option value="N803" />
+        </list>
+      </option>
+    </inspection_tool>
+    <inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ignoredIdentifiers">
+        <list>
+          <option value="parselmouth" />
+          <option value="tuple.mean" />
+          <option value="tuple.variance" />
+          <option value="tuple.minmax" />
+          <option value="tuple.kurtosis" />
+          <option value="tuple.skewness" />
+          <option value="list.__getitem__" />
+          <option value="numpy.core._multiarray_umath.ndarray.A" />
+          <option value="cv2.remap" />
+          <option value="cv2.INTER_LINEAR" />
+          <option value="cv2.initUndistortRectifyMap" />
+          <option value="cv2.getOptimalNewCameraMatrix" />
+          <option value="cv2.resize" />
+          <option value="modules.data_generator.svgs2ttf.fontforge" />
+          <option value="modules.data_generator.ttf2pngs.fontforge" />
+          <option value="os.sys" />
+          <option value="typing.TextIO.__getitem__" />
+        </list>
+      </option>
+    </inspection_tool>
+  </profile>
+</component>

.idea/inspectionProfiles/profiles_settings.xml ADDED Viewed

	@@ -0,0 +1,6 @@

+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>

.idea/misc.xml ADDED Viewed

	@@ -0,0 +1,4 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10" project-jdk-type="Python SDK" />
+</project>

.idea/modules.xml ADDED Viewed

	@@ -0,0 +1,8 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/PoCCallTranscription.iml" filepath="$PROJECT_DIR$/.idea/PoCCallTranscription.iml" />
+    </modules>
+  </component>
+</project>

.idea/vcs.xml ADDED Viewed

	@@ -0,0 +1,6 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>

Dockerfile ADDED Viewed

	@@ -0,0 +1,13 @@

+FROM python:3.10-slim
+WORKDIR /app
+# System packages, installed in a single layer so the package index is never stale:
+#   git             - requirements.txt installs nemo-toolkit and speechbrain from git+https URLs
+#   build-essential - several requirements (e.g. kenlm) are built from source archives
+#   libsndfile1     - native library listed in packages.txt (used by SoundFile)
+#   ffmpeg          - listed in packages.txt (used by pydub to decode mp3/ogg)
+RUN apt-get -y update \
+    && apt-get -y upgrade \
+    && apt-get -y install --no-install-recommends git build-essential libsndfile1 ffmpeg \
+    && rm -rf /var/lib/apt/lists/*
+RUN python -m pip install --upgrade pip
+# Copy only the dependency list first: the slow install layer below is then reused
+# from cache whenever just the application sources change.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+COPY . .
+# Streamlit serves on port 8501 by default.
+EXPOSE 8501
+# A Streamlit app has to be started through the Streamlit CLI; plain `python Interface.py`
+# executes the script once without starting the web server.
+CMD ["streamlit", "run", "./Interface.py", "--server.address=0.0.0.0"]
+# Next commands are: docker build -t pustozerov/poc-call-transcription:1.0 .

Interface.py ADDED Viewed

	@@ -0,0 +1,75 @@

+import glob
+import random
+import os
+import soundfile as sf
+import streamlit as st
+from pydub import AudioSegment
+from modules.diarization.nemo_diarization import diarization
+st.title('Call Transcription demo')
+st.subheader('This simple demo shows the possibilities of the ASR and NLP in the task of '
+             'automatic speech recognition and diarization. It works with mp3, ogg and wav files. You can randomly '
+             'pickup a set of images from the built-in database or try uploading your own files.')
+if st.button('Try random samples from the database'):
+    folder = "data/datasets/crema_d_diarization_chunks"
+    os.makedirs(folder, exist_ok=True)
+    list_all_audio = glob.glob("data/datasets/crema_d_diarization_chunks/*.wav")
+    chosen_files = sorted(random.sample(list_all_audio, 1))
+    file_name = os.path.basename(chosen_files[0]).split(".")[0]
+    audio_file = open(chosen_files[0], 'rb')
+    audio_bytes = audio_file.read()
+    st.audio(audio_bytes)
+    f = sf.SoundFile(chosen_files[0])
+    st.write("Starting transcription. Estimated processing time: %0.1f seconds" % (f.frames / (f.samplerate * 5)))
+    result = diarization(chosen_files[0])
+    with open("info/transcripts/pred_rttms/" + file_name + ".txt") as f:
+        transcript = f.read()
+    st.write("Transcription completed.")
+    st.write("Number of speakers: %s" % result[file_name]["speaker_count"])
+    st.write("Sentences: %s" % len(result[file_name]["sentences"]))
+    st.write("Words: %s" % len(result[file_name]["words"]))
+    st.download_button(
+        label="Download audio transcript",
+        data=transcript,
+        file_name='transcript.txt',
+        mime='text/csv',
+    )
+uploaded_file = st.file_uploader("Choose your recording with a speech",
+                                 accept_multiple_files=False, type=["mp3", "wav", "ogg"])
+if uploaded_file is not None:
+    folder = "data/user_data/"
+    os.makedirs(folder, exist_ok=True)
+    for f in glob.glob(folder + '*'):
+        os.remove(f)
+    save_path = folder + uploaded_file.name
+    if ".mp3" in uploaded_file:
+        sound = AudioSegment.from_mp3(uploaded_file)
+    elif ".ogg" in uploaded_file:
+        sound = AudioSegment.from_ogg(uploaded_file)
+    else:
+        sound = AudioSegment.from_wav(uploaded_file)
+    sound.export(save_path, format="wav", parameters=["-ac", "1"])
+    file_name = os.path.basename(save_path).split(".")[0]
+    audio_file = open(save_path, 'rb')
+    audio_bytes = audio_file.read()
+    st.audio(audio_bytes)
+    f = sf.SoundFile(save_path)
+    st.write("Starting transcription. Estimated processing time: %0.0f minutes and %02.0f seconds"
+             % ((f.frames / (f.samplerate * 3) // 60), (f.frames / (f.samplerate * 3) % 60)))
+    result = diarization(save_path)
+    with open("info/transcripts/pred_rttms/" + file_name + ".txt") as f:
+        transcript = f.read()
+    st.write("Transcription completed.")
+    st.write("Number of speakers: %s" % result[file_name]["speaker_count"])
+    st.write("Sentences: %s" % len(result[file_name]["sentences"]))
+    st.write("Words: %s" % len(result[file_name]["words"]))
+    st.download_button(
+        label="Download audio transcript",
+        data=transcript,
+        file_name='transcript.txt',
+        mime='text/csv',
+    )

info/configs/manifests/external_vad_manifest.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 0.9703125, "duration": 1.1475000000000004, "label": "SPEECH", "uniq_id": "26"}
+{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 3.2146875, "duration": 1.7381250000000001, "label": "SPEECH", "uniq_id": "26"}
+{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 5.340937500000001, "duration": 1.8899999999999988, "label": "SPEECH", "uniq_id": "26"}
+{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 8.4965625, "duration": 1.0293750000000017, "label": "SPEECH", "uniq_id": "26"}
+{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 10.454062500000003, "duration": 11.930625, "label": "SPEECH", "uniq_id": "26"}
+{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 23.0934375, "duration": 1.6537500000000023, "label": "SPEECH", "uniq_id": "26"}
+{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 26.9746875, "duration": 1.5187500000000007, "label": "SPEECH", "uniq_id": "26"}
+{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 29.7253125, "duration": 3.526875000000004, "label": "SPEECH", "uniq_id": "26"}
+{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 34.9228125, "duration": 1.2150000000000034, "label": "SPEECH", "uniq_id": "26"}
+{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 37.2009375, "duration": 1.4343750000000028, "label": "SPEECH", "uniq_id": "26"}
+{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 40.5084375, "duration": 1.8225000000000051, "label": "SPEECH", "uniq_id": "26"}
+{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 42.972187500000004, "duration": 3.628124999999997, "label": "SPEECH", "uniq_id": "26"}
+{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 47.7478125, "duration": 1.822499999999998, "label": "SPEECH", "uniq_id": "26"}
+{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 50.5153125, "duration": 1.9575000000000031, "label": "SPEECH", "uniq_id": "26"}
+{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 53.1478125, "duration": 1.7212500000000048, "label": "SPEECH", "uniq_id": "26"}
+{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 55.3246875, "duration": 1.4681250000000006, "label": "SPEECH", "uniq_id": "26"}
+{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 58.6153125, "duration": 1.7718750000000014, "label": "SPEECH", "uniq_id": "26"}
+{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 60.9271875, "duration": 1.8900000000000006, "label": "SPEECH", "uniq_id": "26"}
+{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 64.01531250000001, "duration": 1.4681250000000006, "label": "SPEECH", "uniq_id": "26"}
+{"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 66.4115625, "duration": 1.991250000000008, "label": "SPEECH", "uniq_id": "26"}

info/configs/manifests/input_manifest.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"audio_filepath": "data/datasets/crema_d_diarization_chunks\\26.wav", "offset": 0, "duration": null, "label": "infer", "text": "-", "num_speakers": 2, "rttm_filepath": null, "uem_filepath": null}

info/configs/offline_diarization_asr.yaml ADDED Viewed

	@@ -0,0 +1,64 @@

+name: ClusterDiarizer
+num_workers: 0
+sample_rate: 16000
+batch_size: 64
+diarizer:
+  manifest_filepath: ???
+  out_dir: ???
+  oracle_vad: false
+  collar: 0.25
+  ignore_overlap: true
+  vad:
+    model_path: null
+    external_vad_manifest: null
+    parameters:
+      window_length_in_sec: 0.15
+      shift_length_in_sec: 0.01
+      smoothing: median
+      overlap: 0.875
+      onset: 0.4
+      offset: 0.7
+      pad_onset: 0.05
+      pad_offset: -0.1
+      min_duration_on: 0.2
+      min_duration_off: 0.2
+      filter_speech_first: true
+  speaker_embeddings:
+    model_path: ???
+    parameters:
+      window_length_in_sec: 1.5
+      shift_length_in_sec: 0.75
+      multiscale_weights: null
+      save_embeddings: false
+  clustering:
+    parameters:
+      oracle_num_speakers: false
+      max_num_speakers: 20
+      enhanced_count_thres: 80
+      max_rp_threshold: 0.25
+      sparse_search_volume: 30
+      maj_vote_spk_count: false
+  asr:
+    model_path: ???
+    parameters:
+      asr_based_vad: false
+      asr_based_vad_threshold: 0.05
+      asr_batch_size: null
+      lenient_overlap_WDER: true
+      decoder_delay_in_sec: null
+      word_ts_anchor_offset: null
+      word_ts_anchor_pos: start
+      fix_word_ts_with_VAD: false
+      colored_text: false
+      print_time: true
+      break_lines: false
+    ctc_decoder_parameters:
+      pretrained_language_model: null
+      beam_width: 32
+      alpha: 0.5
+      beta: 2.5
+    realigning_lm_parameters:
+      arpa_language_model: null
+      min_number_of_words: 3
+      max_number_of_words: 10
+      logprob_diff_threshold: 1.2

modules/diarization/nemo_diarization.py ADDED Viewed

	@@ -0,0 +1,76 @@

+import os
+from nemo.collections.asr.parts.utils.decoder_timestamps_utils import ASR_TIMESTAMPS
+from nemo.collections.asr.parts.utils.diarization_utils import ASR_DIAR_OFFLINE
+from omegaconf import OmegaConf
+from pyannote.audio import Pipeline
+ROOT = os.getcwd()  # all paths below are resolved against the directory the process was started in
+MODEL_CONFIG = "info/configs/offline_diarization_asr.yaml"  # relative path, loaded with OmegaConf in diarization()
+data_dir = os.path.join(ROOT, 'info/configs/')  # holds the YAML config and the generated manifests
+os.makedirs(data_dir, exist_ok=True)  # executed at import time
+output_dir = os.path.join(ROOT, 'info/transcripts/')  # passed to NeMo as diarizer.out_dir
+os.makedirs(output_dir, exist_ok=True)  # executed at import time
+def diarization(file_path):
+    # Create a manifest for input with below format.
+    # {'audio_filepath': /path/to/audio_file, 'offset': 0, 'duration':None, 'label': 'infer', 'text': '-',
+    # 'num_speakers': None, 'rttm_filepath': /path/to/rttm/file, 'uem_filepath'='/path/to/uem/filepath'}
+    import json
+    meta = {
+        'audio_filepath': file_path,
+        'offset': 0,
+        'duration': None,
+        'label': 'infer',
+        'text': '-',
+        'num_speakers': 2,
+        'rttm_filepath': None,
+        'uem_filepath': None
+    }
+    with open(os.path.join(data_dir, 'manifests/', 'input_manifest.json'), 'w') as fp:
+        json.dump(meta, fp)
+        fp.write('\n')
+    # Make a manifest with an external VAD
+    pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection")
+    output = pipeline(file_path)
+    initial_json = output.for_json()
+    keys = ("audio_filepath", "offset", "duration", "label")
+    output_json = []
+    for segment in initial_json["content"]:
+        vad_json = dict.fromkeys(keys)
+        vad_json["audio_filepath"] = file_path
+        vad_json["offset"] = segment["segment"]["start"]
+        vad_json["duration"] = segment["segment"]["end"] - segment["segment"]["start"]
+        vad_json["label"] = "SPEECH"
+        vad_json["uniq_id"] = initial_json["uri"]
+        output_json.append(vad_json)
+    with open(os.path.join(data_dir, 'manifests/', 'external_vad_manifest.json'), 'w') as f:
+        for item in output_json:
+            f.write(str(item).replace("'", '"') + '\n')
+    config2 = OmegaConf.load(MODEL_CONFIG)
+    config2.diarizer.asr.model_path = 'QuartzNet15x5Base-En'
+    config2.diarizer.manifest_filepath = \
+        os.path.join(data_dir, 'manifests/', 'input_manifest.json')
+    config2.diarizer.speaker_embeddings.model_path = 'titanet_large'
+    config2.diarizer.vad.external_vad_manifest = \
+        os.path.join(data_dir, 'manifests/', 'external_vad_manifest.json')
+    config2.diarizer.out_dir = output_dir
+    config2.num_workers = 0
+    asr_ts_decoder = ASR_TIMESTAMPS(**config2.diarizer)
+    asr_model = asr_ts_decoder.set_asr_model()
+    word_hyp, word_ts_hyp = asr_ts_decoder.run_ASR(asr_model)
+    print(word_hyp)
+    print(word_ts_hyp)
+    asr_diar_offline = ASR_DIAR_OFFLINE(**config2.diarizer)
+    asr_diar_offline.word_ts_anchor_offset = asr_ts_decoder.word_ts_anchor_offset
+    diar_hyp, diar_score = asr_diar_offline.run_diarization(config2, word_ts_hyp)
+    print("Diarization hypothesis output: \n", diar_hyp)
+    result = asr_diar_offline.get_transcript_with_speaker_labels(diar_hyp, word_hyp, word_ts_hyp)
+    file_to_show = os.path.join(data_dir, 'transcripts/pred_rttms/', file_path.split('/')[-1].split(".")[0], '.txt')
+    print(file_to_show)
+    print(diar_hyp)
+    return result

packages.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+libsndfile1
+ffmpeg
+python3-pip
+python3-dev

requirements.txt ADDED Viewed

	@@ -0,0 +1,42 @@

+astroid
+braceexpand==0.1.7
+editdistance==0.6.0
+einops==0.3.2
+h5py==3.7.0
+hydra-core==1.1.2
+ijson==3.1.4
+inflect==5.6.0
+ipadic==1.0.0
+ipython==8.4.0
+jieba==0.42.1
+kenlm @ https://github.com/kpu/kenlm/archive/master.zip
+librosa==0.9.2
+mecab-python3==1.0.5
+nemo-toolkit @ git+https://github.com/NVIDIA/NeMo.git@6442e339a47d30a106d869d1ef29cc1294753b75
+omegaconf==2.1.2
+OpenCC==1.1.1
+pangu==4.0.6.1
+praat-parselmouth==0.4.1
+protobuf==3.19.4
+psutil==5.9.1
+pyannote.audio @ https://github.com/pyannote/pyannote-audio/archive/develop.zip
+pyannote.core==4.4
+pyannote.database==4.1.3
+pyannote.metrics==3.2
+pyannote.pipeline==2.3
+pyctcdecode==0.3.0
+pydub==0.25.1
+pytorch-lightning==1.6.5
+sacrebleu==2.1.0
+sacremoses==0.0.53
+sentencepiece==0.1.96
+SoundFile==0.10.3.post1
+spacy==3.4.0
+speechbrain @ git+https://github.com/speechbrain/speechbrain.git
+streamlit==1.10.0
+torch==1.12.0
+torchaudio==0.12.0
+transformers==4.20.0
+webdataset==0.1.62
+Cython==0.29.14
+youtokentome