import nltk
import librosa
import torch
import gradio as gr
import scipy.io.wavfile
import soundfile as sf
from transformers import WhisperProcessor, WhisperForConditionalGeneration, WhisperTokenizer
from transformers import pipeline
from huggingface_hub import HfApi, CommitOperationAdd, CommitOperationDelete

# Download the NLTK sentence tokenizer data
nltk.download("punkt")

model_name = "Shubham09/whisper31filescheck"
processor = WhisperProcessor.from_pretrained(model_name, task="transcribe")
# tokenizer = WhisperTokenizer.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)
def load_data(input_file):
    # Read the audio file (librosa returns a mono float32 signal by default)
    speech, sample_rate = librosa.load(input_file)
    # Fallback: downmix to mono by averaging the channels
    if len(speech.shape) > 1:
        speech = speech.mean(axis=0)
    # Resample the audio to 16 kHz, the rate the Whisper feature extractor expects
    if sample_rate != 16000:
        speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)
    return speech
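# Example usage (Actuator.wav is one of the sample files shipped with the app):
# speech = load_data("Actuator.wav")  # 1-D float32 waveform at 16 kHz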
# Earlier, commented-out experiments with saving the recording and uploading
# it to the Hub:
# def write_to_file(input_file):
#     fs = 16000
#     sf.write("my_Audio_file.flac", input_file, fs)
#     api = HfApi()
#     operations = [
#         CommitOperationAdd(path_in_repo="my_Audio_file.flac", path_or_fileobj="Shubham09/whisper31filescheck/repo/my_Audio_file.flac"),
#         # CommitOperationAdd(path_in_repo="weights.h5", path_or_fileobj="~/repo/weights-final.h5"),
#         # CommitOperationDelete(path_in_repo="old-weights.h5"),
#         # CommitOperationDelete(path_in_repo="logs/"),
#     ]
#     # scipy.io.wavfile.write("microphone-result.wav")
#     with open("microphone-results.wav", "wb") as f:
#         f.write(input_file.get_wav_data())
#     import base64
#     wav_file = open("temp.wav", "wb")
#     decode_string = base64.b64decode(input_file)
#     wav_file.write(decode_string)
pipe = pipeline(model="Shubham09/whisper31filescheck") # change to "your-username/the-name-you-picked"
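# Note: for recordings longer than Whisper's 30-second window, the ASR
# pipeline accepts a chunk_length_s argument (a standard transformers
# pipeline option, not used in the original app), e.g.:
# pipe = pipeline("automatic-speech-recognition",
#                 model="Shubham09/whisper31filescheck", chunk_length_s=30)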
def asr_transcript(input_file):
    # Transcribe the recorded audio file via the ASR pipeline
    # audio = "Shubham09/whisper31filescheck/repo/my_Audio_file.flac"
    text = pipe(input_file)["text"]
    return text
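# Example usage with one of the sample files bundled with the app:
# print(asr_transcript("Actuator.wav"))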
# Earlier, commented-out attempt at running the model directly instead of
# through the pipeline:
#     speech = load_data(input_file)
#     # Tokenize
#     input_features = processor(speech).input_features  # , padding="longest", return_tensors="pt"
#     # input_values = tokenizer(speech, return_tensors="pt").input_values
#     # Take logits
#     logits = model(input_features).logits
#     # Take argmax
#     predicted_ids = torch.argmax(logits, dim=-1)
#     # Get the words from predicted word ids
#     transcription = processor.batch_decode(predicted_ids)
#     # Correct the letter casing
#     # transcription = correct_casing(transcription.lower())
#     return transcription
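# A sketch (not part of the original app) of that direct-model path done
# correctly: Whisper is a sequence-to-sequence model, so decoding uses
# model.generate() rather than an argmax over a single forward pass.
# The helper name transcribe_with_model is illustrative.
def transcribe_with_model(input_file):
    speech = load_data(input_file)
    # The feature extractor expects 16 kHz audio, which load_data guarantees
    input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features
    predicted_ids = model.generate(input_features)
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]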
# Note: gr.inputs / gr.outputs are the legacy Gradio (pre-3.x) component API
gr.Interface(
    fn=asr_transcript,
    inputs=gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Speaker"),
    outputs=gr.outputs.Textbox(label="Output Text"),
    title="ASR using Whisper",
    description="This application displays transcribed text for a given audio input",
    examples=[["Actuator.wav"], ["anomalies.wav"]],
    theme="grass",
).launch(share=True)