import glob
import random
import os
import numpy as np
import soundfile as sf
import streamlit as st
from pydub import AudioSegment
from datasets import load_dataset
from scipy.io.wavfile import write
from modules.diarization.nemo_diarization import diarization
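
# Local folder for intermediate WAV files and the sample rate of the demo recordings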
FOLDER_WAV = "data/user_data"
SAMPLE_RATE = 16000
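
# Make sure the folder for intermediate WAV files exists (defensive; the repo may already ship it)
os.makedirs(FOLDER_WAV, exist_ok=True)

# Demo recordings packaged as a dataset on the Hugging Face Hub (validation split only)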
dataset = load_dataset("pustozerov/crema_d_diarization", split='validation')
st.title('Call Transcription demo')
st.subheader('This simple demo shows the possibilities of ASR and NLP in the task of automatic '
             'speech recognition and speaker diarization. It works with mp3, ogg and wav files. You can '
             'randomly pick a recording from the built-in database or try uploading your own files.')
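
# Branch 1: pick a random recording from the bundled dataset and transcribe it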
if st.button('Try a random sample from the database'):
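    # Convert the sample's float waveform to 16-bit PCM and write it as a WAV file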
    shuffled_dataset = dataset.shuffle(seed=random.randint(0, 100))
    file_name = str(shuffled_dataset["file"][0]).split(".")[0]
    audio_bytes = np.array(shuffled_dataset["data"][0])
    audio_bytes_scaled = np.int16(audio_bytes / np.max(np.abs(audio_bytes)) * 32767)
    write(os.path.join(FOLDER_WAV, file_name + '.wav'), rate=SAMPLE_RATE, data=audio_bytes_scaled)
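    # Play the sample in the app and show a rough processing-time estimate before diarization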
    f = sf.SoundFile(os.path.join(FOLDER_WAV, file_name + '.wav'))
    audio_file = open(os.path.join(FOLDER_WAV, file_name + '.wav'), 'rb')
    st.audio(audio_file.read())
    st.write("Starting transcription. Estimated processing time: %0.1f seconds" % (f.frames / (f.samplerate * 5)))
    result = diarization(os.path.join(FOLDER_WAV, file_name + '.wav'))
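    # Read the transcript produced by the diarization step and report basic statistics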
with open("info/transcripts/pred_rttms/" + file_name + ".txt") as f:
transcript = f.read()
st.write("Transcription completed.")
st.write("Number of speakers: %s" % result[file_name]["speaker_count"])
st.write("Sentences: %s" % len(result[file_name]["sentences"]))
st.write("Words: %s" % len(result[file_name]["words"]))
st.download_button(
label="Download audio transcript",
data=transcript,
file_name='transcript.txt',
mime='text/csv',
)
uploaded_file = st.file_uploader("Choose a recording with speech",
                                 accept_multiple_files=False, type=["mp3", "wav", "ogg"])
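
# Branch 2: transcribe a recording uploaded by the user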
if uploaded_file is not None:
folder = "data/user_data/"
os.makedirs(folder, exist_ok=True)
for f in glob.glob(folder + '*'):
os.remove(f)
save_path = folder + uploaded_file.name
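    # Decode the upload with pydub according to its extension and export it as mono WAV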
if ".mp3" in uploaded_file:
sound = AudioSegment.from_mp3(uploaded_file)
elif ".ogg" in uploaded_file:
sound = AudioSegment.from_ogg(uploaded_file)
else:
sound = AudioSegment.from_wav(uploaded_file)
sound.export(save_path, format="wav", parameters=["-ac", "1"])
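    # Play back the converted file and estimate processing time from its duration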
    file_name = os.path.basename(save_path).split(".")[0]
    audio_file = open(save_path, 'rb')
    audio_bytes = audio_file.read()
    st.audio(audio_bytes)
    f = sf.SoundFile(save_path)
    st.write("Starting transcription. Estimated processing time: %0.0f minutes and %02.0f seconds"
             % ((f.frames / (f.samplerate * 3) // 60), (f.frames / (f.samplerate * 3) % 60)))
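    # Run diarization, read the generated transcript and report basic statistics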
    result = diarization(save_path)
    with open("info/transcripts/pred_rttms/" + file_name + ".txt") as transcript_file:
        transcript = transcript_file.read()
    st.write("Transcription completed.")
    st.write("Number of speakers: %s" % result[file_name]["speaker_count"])
    st.write("Sentences: %s" % len(result[file_name]["sentences"]))
    st.write("Words: %s" % len(result[file_name]["words"]))
    st.download_button(
        label="Download audio transcript",
        data=transcript,
        file_name='transcript.txt',
        mime='text/plain',
    )