Spaces:
Build error
Build error
File size: 3,623 Bytes
02dca0a 8f18caf 02dca0a 8f18caf 02dca0a 8f18caf 02dca0a 8f18caf 02dca0a 8f18caf 02dca0a 8f18caf 02dca0a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
import glob
import random
import os
import numpy as np
import soundfile as sf
import streamlit as st
from pydub import AudioSegment
from datasets import load_dataset
from scipy.io.wavfile import write
from modules.diarization.nemo_diarization import diarization
FOLDER_WAV = "data/user_data"
SAMPLE_RATE = 16000
dataset = load_dataset("pustozerov/crema_d_diarization", split='validation')
st.title('Call Transcription demo')
st.subheader('This simple demo shows the possibilities of the ASR and NLP in the task of '
'automatic speech recognition and diarization. It works with mp3, ogg and wav files. You can randomly '
'pickup a set of images from the built-in database or try uploading your own files.')
if st.button('Try a random sample from the database'):
shuffled_dataset = dataset.shuffle(seed=random.randint(0, 100))
file_name = str(shuffled_dataset["file"][0]).split(".")[0]
audio_bytes = np.array(shuffled_dataset["data"][0])
audio_bytes_scaled = np.int16(audio_bytes / np.max(np.abs(audio_bytes)) * 32767)
write(os.path.join(FOLDER_WAV, file_name + '.wav'), rate=SAMPLE_RATE, data=audio_bytes_scaled)
f = sf.SoundFile(os.path.join(FOLDER_WAV, file_name + '.wav'))
audio_file = open(os.path.join(FOLDER_WAV, file_name + '.wav'), 'rb')
st.audio(audio_file.read())
st.write("Starting transcription. Estimated processing time: %0.1f seconds" % (f.frames / (f.samplerate * 5)))
result = diarization(os.path.join(FOLDER_WAV, file_name + '.wav'))
with open("info/transcripts/pred_rttms/" + file_name + ".txt") as f:
transcript = f.read()
st.write("Transcription completed.")
st.write("Number of speakers: %s" % result[file_name]["speaker_count"])
st.write("Sentences: %s" % len(result[file_name]["sentences"]))
st.write("Words: %s" % len(result[file_name]["words"]))
st.download_button(
label="Download audio transcript",
data=transcript,
file_name='transcript.txt',
mime='text/csv',
)
uploaded_file = st.file_uploader("Choose your recording with a speech",
accept_multiple_files=False, type=["mp3", "wav", "ogg"])
if uploaded_file is not None:
folder = "data/user_data/"
os.makedirs(folder, exist_ok=True)
for f in glob.glob(folder + '*'):
os.remove(f)
save_path = folder + uploaded_file.name
if ".mp3" in uploaded_file:
sound = AudioSegment.from_mp3(uploaded_file)
elif ".ogg" in uploaded_file:
sound = AudioSegment.from_ogg(uploaded_file)
else:
sound = AudioSegment.from_wav(uploaded_file)
sound.export(save_path, format="wav", parameters=["-ac", "1"])
file_name = os.path.basename(save_path).split(".")[0]
audio_file = open(save_path, 'rb')
audio_bytes = audio_file.read()
st.audio(audio_bytes)
f = sf.SoundFile(save_path)
st.write("Starting transcription. Estimated processing time: %0.0f minutes and %02.0f seconds"
% ((f.frames / (f.samplerate * 3) // 60), (f.frames / (f.samplerate * 3) % 60)))
result = diarization(save_path)
with open("info/transcripts/pred_rttms/" + file_name + ".txt") as f:
transcript = f.read()
st.write("Transcription completed.")
st.write("Number of speakers: %s" % result[file_name]["speaker_count"])
st.write("Sentences: %s" % len(result[file_name]["sentences"]))
st.write("Words: %s" % len(result[file_name]["words"]))
st.download_button(
label="Download audio transcript",
data=transcript,
file_name='transcript.txt',
mime='text/csv',
)
|