import librosa
import joblib
import numpy as np
import gradio as gr
from scipy.interpolate import interp1d
from pyAudioAnalysis import ShortTermFeatures
from pydub.silence import detect_nonsilent
from pydub import AudioSegment
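
# Pipeline overview: record audio in Gradio -> trim leading/trailing silence
# with pydub -> extract pyAudioAnalysis short-term features -> resize the
# flattened feature vector to the 20 values the pickled model expects ->
# classify the utterance as 'Si' or 'No'.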


def smart_resize(arr, target_size):
    """Resize a (1, N) feature array to (1, target_size) via linear interpolation."""
    current_size = arr.shape[1]

    # Map the original sample positions onto target_size evenly spaced points.
    current_idx = np.linspace(0, current_size - 1, current_size)
    target_idx = np.linspace(0, current_size - 1, target_size)

    interp_func = interp1d(current_idx, arr.squeeze(), kind='linear',
                           fill_value="extrapolate")
    resized_arr = interp_func(target_idx)

    return resized_arr.reshape(1, target_size)
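
# Usage sketch (illustrative values, not from the original code):
#   smart_resize(np.arange(8.0).reshape(1, 8), 4)  # -> array of shape (1, 4)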


def remove_silence(wav_file):
    audSeg = AudioSegment.from_wav(wav_file)
    # detect_nonsilent returns [start_ms, end_ms] ranges of audible audio.
    non_silence_ranges = detect_nonsilent(audSeg, min_silence_len=5, silence_thresh=-30)

    if not non_silence_ranges:
        # Nothing above the threshold: keep the recording as-is.
        sound = audSeg
    else:
        # Trim everything before the first and after the last non-silent range.
        start = non_silence_ranges[0][0]
        end = non_silence_ranges[-1][1]
        sound = audSeg[start:end]

    # Side effect: the trimmed clip is written to a fixed path in the working
    # directory, which transform_data reads back.
    sound.export('audio.wav', format="wav")


def transform_data(audio):
    remove_silence(audio)
    x, sr = librosa.load('audio.wav')

    # Short-term features over 50 ms windows with a 25 ms step; cast the
    # window and step sizes to integer sample counts.
    result, f_names = ShortTermFeatures.feature_extraction(
        x, sr, int(0.050 * sr), int(0.025 * sr))

    # result is (num_features, num_frames); flatten it to a single row and
    # interpolate down to the 20 inputs the classifier was trained on.
    resize_features = smart_resize(result.reshape(1, -1), 20)

    return resize_features


def predict(newdf, loaded_model):
    # Single-row input, so return the first (only) prediction as a scalar
    # along with its class probabilities.
    prediction = loaded_model.predict(newdf)
    proba = loaded_model.predict_proba(newdf)

    return prediction[0], proba[0]


def get_label(newpred):
    # Map the binary class index to its label: 0 -> 'No', 1 -> 'Si'.
    if newpred == 0:
        return 'No'
    else:
        return 'Si'


def load_model():
    # Load the pickled scikit-learn classifier (an SGD model, per the filename).
    model = joblib.load('models/sgd_90.pkl')

    return model
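
# Note: predict_proba requires a probabilistic classifier. If sgd_90.pkl is a
# scikit-learn SGDClassifier, it must have been trained with loss='log_loss'
# or 'modified_huber' (an assumption; the training code is not shown here),
# otherwise the predict() helper above raises AttributeError.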


def main(audio):
    newdf = transform_data(audio)
    loaded_model = load_model()
    newpred, proba = predict(newdf, loaded_model)
    final = get_label(newpred)

    return final, {'Si probability': proba[1],
                   'No probability': proba[0]}


demo = gr.Interface(
    title="Autoagent | YES or NO Classification - Layer7",
    description="""<h3>This model classifies whether the user says 'Si' or 'No'. 🎙️</h3>
    <img src="https://huggingface.co/spaces/Adrian8as/imagen/resolve/main/output.png" width="350" height="350"/> <br>
    <b>Record your voice:</b>""",
    allow_flagging="never",
    fn=main,
    inputs=gr.Audio(
        sources=["microphone"],
        type="filepath",
    ),
    outputs=[gr.Textbox(label="Classification"), "label"],
)


if __name__ == "__main__":
    demo.launch(show_api=False)