Spaces:
Build error
Build error
import glob | |
import random | |
import os | |
import numpy as np | |
import soundfile as sf | |
import streamlit as st | |
from pydub import AudioSegment | |
from datasets import load_dataset | |
from scipy.io.wavfile import write | |
from modules.diarization.nemo_diarization import diarization | |
# Folder that receives the WAV files handed to the diarization pipeline.
FOLDER_WAV = "data/user_data"
# Sample rate (Hz) used when database samples are written out as WAV files.
SAMPLE_RATE = 16000

# Validation split of the demo dataset; the "file" and "data" columns are
# read by the random-sample button further down.
dataset = load_dataset("pustozerov/crema_d_diarization", split='validation')

st.title('Call Transcription demo')
# The demo works on audio, so the text talks about recordings (the previous
# wording mentioned "a set of images", which this app never handles).
st.subheader('This simple demo shows the possibilities of the ASR and NLP in the task of '
             'automatic speech recognition and diarization. It works with mp3, ogg and wav files. You can randomly '
             'pick a recording from the built-in database or try uploading your own files.')
# Random-sample flow: pick one recording from the dataset, write it as a
# 16-bit WAV, play it back, run diarization and offer the transcript.
if st.button('Try a random sample from the database'):
    # The output folder may not exist yet on a fresh checkout.
    os.makedirs(FOLDER_WAV, exist_ok=True)

    shuffled_dataset = dataset.shuffle(seed=random.randint(0, 100))
    file_name = str(shuffled_dataset["file"][0]).split(".")[0]
    wav_path = os.path.join(FOLDER_WAV, file_name + '.wav')

    # Peak-normalise to the int16 range. A silent (all-zero) or empty signal
    # has no usable peak, so it is written as zeros instead of dividing by 0.
    audio_samples = np.array(shuffled_dataset["data"][0])
    peak = np.max(np.abs(audio_samples)) if audio_samples.size else 0
    if peak > 0:
        audio_scaled = np.int16(audio_samples / peak * 32767)
    else:
        audio_scaled = np.zeros_like(audio_samples, dtype=np.int16)
    write(wav_path, rate=SAMPLE_RATE, data=audio_scaled)

    # Read the duration info and release the handle immediately.
    with sf.SoundFile(wav_path) as sound_file:
        frames = sound_file.frames
        sample_rate = sound_file.samplerate

    with open(wav_path, 'rb') as audio_file:
        st.audio(audio_file.read())

    st.write("Starting transcription. Estimated processing time: %0.1f seconds" % (frames / (sample_rate * 5)))
    # `result` is indexed by the extension-less file name below.
    result = diarization(wav_path)

    # Presumably written by the diarization step -- confirm against the module.
    with open("info/transcripts/pred_rttms/" + file_name + ".txt") as transcript_file:
        transcript = transcript_file.read()

    st.write("Transcription completed.")
    st.write("Number of speakers: %s" % result[file_name]["speaker_count"])
    st.write("Sentences: %s" % len(result[file_name]["sentences"]))
    st.write("Words: %s" % len(result[file_name]["words"]))
    st.download_button(
        label="Download audio transcript",
        data=transcript,
        file_name='transcript.txt',
        mime='text/csv',
    )
# Upload flow: convert the user's recording to mono WAV, play it back, run
# diarization and offer the transcript for download.
uploaded_file = st.file_uploader("Choose your recording with a speech",
                                 accept_multiple_files=False, type=["mp3", "wav", "ogg"])
if uploaded_file is not None:
    os.makedirs(FOLDER_WAV, exist_ok=True)
    # Clear files left over from earlier runs; skip anything that is not a
    # regular file, because os.remove raises on directories.
    for stale_path in glob.glob(os.path.join(FOLDER_WAV, '*')):
        if os.path.isfile(stale_path):
            os.remove(stale_path)

    # Decide the decoder from the uploaded file's *name*. Testing
    # `".mp3" in uploaded_file` checks the file object's contents, never
    # matches, and sent every upload through the WAV decoder.
    upload_name = os.path.basename(uploaded_file.name)
    extension = os.path.splitext(upload_name)[1].lower()
    if extension == ".mp3":
        sound = AudioSegment.from_mp3(uploaded_file)
    elif extension == ".ogg":
        sound = AudioSegment.from_ogg(uploaded_file)
    else:
        sound = AudioSegment.from_wav(uploaded_file)

    # The export is always WAV, so store it with a .wav extension rather
    # than under the original .mp3/.ogg name.
    file_name = upload_name.split(".")[0]
    save_path = os.path.join(FOLDER_WAV, file_name + ".wav")
    sound.export(save_path, format="wav", parameters=["-ac", "1"])

    with open(save_path, 'rb') as audio_file:
        st.audio(audio_file.read())

    # Read the duration info and release the handle immediately.
    with sf.SoundFile(save_path) as sound_file:
        estimated_seconds = sound_file.frames / (sound_file.samplerate * 3)
    minutes, seconds = divmod(estimated_seconds, 60)
    st.write("Starting transcription. Estimated processing time: %0.0f minutes and %02.0f seconds"
             % (minutes, seconds))

    # `result` is indexed by the extension-less file name below.
    result = diarization(save_path)

    # Presumably written by the diarization step -- confirm against the module.
    with open("info/transcripts/pred_rttms/" + file_name + ".txt") as transcript_file:
        transcript = transcript_file.read()

    st.write("Transcription completed.")
    st.write("Number of speakers: %s" % result[file_name]["speaker_count"])
    st.write("Sentences: %s" % len(result[file_name]["sentences"]))
    st.write("Words: %s" % len(result[file_name]["words"]))
    st.download_button(
        label="Download audio transcript",
        data=transcript,
        file_name='transcript.txt',
        mime='text/csv',
    )