import streamlit as st
import librosa
import torch
import spacy
import spacy_streamlit
import nltk
from nltk import tokenize
from datasets import load_dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    HubertForCTC,
    Wav2Vec2Processor,
    pipeline,
)

nltk.download('punkt')
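
# Shared speech-to-text step, factored out of the repeated per-task blocks
# below (each task first transcribes the upload, then runs its own NLP
# pipeline on the transcript). The @st.cache decorator mirrors the pattern
# this app already uses for the T5 model, so the ASR models are downloaded
# and loaded only once per session; the helper names are new to this refactor.
@st.cache(allow_output_mutation=True)
def load_asr():
    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
    model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
    return processor, model


def transcribe(audio_file):
    """Resample the upload to 16 kHz and decode it with HuBERT via CTC."""
    processor, model = load_asr()
    speech, rate = librosa.load(audio_file, sr=16000)
    input_values = processor(speech, return_tensors="pt", padding="longest", sampling_rate=rate).input_values
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    text = processor.batch_decode(predicted_ids)
    return ' '.join(str(sentence) for sentence in text)
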
st.title('Audio-to-Text')
audio_file = st.file_uploader('Upload Audio', type=['wav', 'mp3', 'm4a'])
st.subheader('Please select any of the NLP tasks')
# Task 1: plain speech-to-text.
if st.button('Audio Transcription'):
    if audio_file is not None:
        st.markdown(transcribe(audio_file))
    else:
        st.error('Please upload an audio file.')
# Task 2: summarize the transcript with BART.
if st.button('Summarize'):
    if audio_file is not None:
        result = transcribe(audio_file)
        summarizer = pipeline("summarization", model='facebook/bart-large-cnn')
        st.markdown(summarizer(result)[0]['summary_text'])
    else:
        st.error('Please upload an audio file.')
# Task 3: sentiment analysis on the transcript (uses the pipeline's default model).
if st.button('Sentiment Analysis'):
    if audio_file is not None:
        result = transcribe(audio_file)
        nlp_sa = pipeline("sentiment-analysis")
        st.markdown(nlp_sa(result))
    else:
        st.error('Please upload an audio file.')
# Task 4: emotion recognition. Note that this classifies a sample from the
# superb_demo dataset, not the uploaded file.
if st.button('Audio Classification'):
    dataset = load_dataset("anton-l/superb_demo", "er", split="session1")
    classifier = pipeline("audio-classification", model="superb/wav2vec2-base-superb-er")
    labels = classifier(dataset[0]["file"], top_k=5)
    st.markdown(labels)
# Task 5: named entity recognition on the transcript, rendered with spaCy.
if st.button('Named Entity Recognition'):
    if audio_file is not None:
        result = transcribe(audio_file)
        nlp = spacy.load('en_core_web_sm')
        doc = nlp(result)
        spacy_streamlit.visualize_ner(doc, labels=nlp.get_pipe("ner").labels, title="List of Entities")
    else:
        st.error('Please upload an audio file.')
# T5 model for translation, cached so it is loaded only once per session.
tokenizer = AutoTokenizer.from_pretrained("t5-base")


@st.cache(allow_output_mutation=True)
def load_model():
    model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
    return model


model1 = load_model()
st.subheader('Select your source and target languages below.')
source_lang = st.selectbox("Source language", ['English'])
target_lang = st.selectbox("Target language", ['German', 'French'])
# Task 6: sentence-by-sentence translation with T5. T5 expects a task prefix
# such as "translate English to German: " in front of each input sentence.
if st.button('Translate'):
    if audio_file is not None:
        result = transcribe(audio_file)
        prefix = 'translate ' + str(source_lang) + ' to ' + str(target_lang) + ': '
        sentences = tokenize.sent_tokenize(result)
        output = tokenizer([prefix + sentence for sentence in sentences], padding=True, return_tensors="pt")
        translated_ids = model1.generate(output["input_ids"], attention_mask=output['attention_mask'], max_length=10000)
        translated_text = tokenizer.batch_decode(translated_ids, skip_special_tokens=True)
        st.subheader('Translated Text')
        st.write(' '.join(translated_text))
    else:
        st.error('Please upload an audio file.')
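
# Usage note (assuming Streamlit and the model dependencies are installed):
# launch the app locally with `streamlit run app.py`.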