import librosa
import joblib
import numpy as np
import gradio as gr
from scipy.interpolate import interp1d
from pyAudioAnalysis import ShortTermFeatures
from pydub import AudioSegment
from pydub.silence import detect_nonsilent


def smart_resize(arr, target_size):
    """Resize a (1, n) feature array to (1, target_size) by linear interpolation."""
    current_size = arr.shape[1]
    current_idx = np.linspace(0, current_size - 1, current_size)
    target_idx = np.linspace(0, current_size - 1, target_size)
    # Interpolate (and extrapolate if target_size exceeds current_size)
    interp_func = interp1d(current_idx, arr.squeeze(), kind='linear',
                           fill_value="extrapolate")
    resized_arr = interp_func(target_idx)
    return resized_arr.reshape(1, target_size)


def remove_silence(wav_file):
    """Trim leading/trailing silence and write the result to 'audio.wav'."""
    aud_seg = AudioSegment.from_wav(wav_file)
    non_silence_ranges = detect_nonsilent(aud_seg, min_silence_len=5,
                                          silence_thresh=-30)
    if not non_silence_ranges:
        # Nothing above the silence threshold: keep the original audio
        sound = aud_seg
    else:
        start = non_silence_ranges[0][0]
        end = non_silence_ranges[-1][1]
        sound = aud_seg[start:end]
    sound.export('audio.wav', format="wav")


def transform_data(audio):
    """Extract short-term features from the recording and resize them to 20 values."""
    remove_silence(audio)
    x, sr = librosa.load('audio.wav')
    # 50 ms analysis windows with a 25 ms step
    result, f_names = ShortTermFeatures.feature_extraction(
        x, sr, 0.050 * sr, 0.025 * sr)
    resized_features = smart_resize(result.reshape(1, -1), 20)
    return resized_features


def predict(newdf, loaded_model):
    return loaded_model.predict(newdf)


def get_label(newpred):
    # The model encodes 'No' as 0; any other class maps to 'Si'
    if newpred[0] == 0:
        return 'No'
    else:
        return 'Si'


def load_model():
    return joblib.load('models/sgd_90.pkl')


def main(audio):
    newdf = transform_data(audio)
    loaded_model = load_model()
    newpred = predict(newdf, loaded_model)
    return get_label(newpred)


demo = gr.Interface(
    title="Autoagent | YES or NO Classification - Layer7",
    description=(
        "This model classifies whether the user says 'Si' or 'No'. 🎙️\n\n"
        "Record your voice:"
    ),
    allow_flagging="never",
    fn=main,
    inputs=gr.Audio(
        sources=["microphone"],
        type="filepath",
    ),
    outputs=gr.Textbox(label="Classification"),
)

if __name__ == "__main__":
    demo.launch(show_api=False)
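
# A minimal local sanity check (a sketch, not part of the app): run the full
# pipeline on a saved recording without launching the Gradio UI. The path
# 'test.wav' below is a hypothetical example; the trained model must exist at
# 'models/sgd_90.pkl' for this to work.
#
#     print(main('test.wav'))  # expected output: 'Si' or 'No'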