santiviquez committed
Commit 29457c0 • 1 Parent(s): 3d8b507

add everything

Files changed (2)
  1. app.py +113 -0
  2. cnn.py +67 -0
app.py ADDED
@@ -0,0 +1,113 @@
+ import streamlit as st
+ from huggingface_hub import hf_hub_url, cached_download
+ import torch
+ import torchaudio.transforms as transforms
+ from miniaudio import SampleFormat, decode
+ from librosa.util import fix_length
+ import numpy as np
+ from audio_recorder_streamlit import audio_recorder
+
+ from cnn import CNN
+
+ # Streamlit app title
+ st.markdown("## Noisy Human")
+ st.markdown("")
+ st.markdown(
+     "Non-speech human sounds classification. This model can identify the "
+     "following 10 classes with up to 78% accuracy:"
+ )
+
+ col1, col2 = st.columns(2)
+ with st.container():
+     with col1:
+         st.markdown(
+             """
+             * Clapping 👏
+             * Footsteps 🦶
+             * Brushing Teeth 🪥
+             * Drinking Sipping 🧃
+             * Laughing 😂
+             """
+         )
+
+     with col2:
+         st.markdown(
+             """
+             * Breathing 🌬️
+             * Crying Baby 😭
+             * Coughing 🤧
+             * Snoring 😴
+             * Sneezing 🤧
+             """
+         )
+
+ REPO_ID = "santiviquez/noisy_human_cnn"
+ FILENAME = "CNN_MelSpec_Deltas_fold_4_.pth"
+ RATE = 22050
+
+
+ @st.cache
+ def download_model():
+     # Fetch the trained weights from the Hugging Face Hub and load them on CPU
+     model_weights = torch.load(
+         cached_download(hf_hub_url(REPO_ID, FILENAME)), map_location=torch.device("cpu")
+     )
+     return model_weights
+
+
+ model_weights = download_model()
+ model = CNN(input_channels=2)
+ model.load_state_dict(model_weights)
+ model.eval()
+
+ uploaded_file = st.file_uploader(
+     "Choose an audio (.wav) file", accept_multiple_files=False
+ )
+ st.caption("OR")
+ recorded_bytes = audio_recorder()
+ # Prefer the uploaded file; otherwise fall back to the microphone recording
+ audio_bytes = uploaded_file.read() if uploaded_file is not None else recorded_bytes
+
+ if audio_bytes:
+     st.audio(audio_bytes, format="audio/ogg")
+     # Decode to a mono, signed 32-bit signal at the model's sample rate
+     decoded_audio = decode(
+         audio_bytes, nchannels=1, sample_rate=RATE, output_format=SampleFormat.SIGNED32
+     )
+
+     # Pad or trim the clip to exactly 5 seconds
+     waveform = np.array(decoded_audio.samples)
+     waveform = fix_length(waveform, size=5 * RATE)
+     waveform = torch.FloatTensor(waveform)
+
+     # Stack the mel spectrogram and its deltas as two input channels
+     x_mel = transforms.MelSpectrogram(sample_rate=RATE, n_fft=1024, n_mels=60)(waveform)
+     x_deltas = transforms.ComputeDeltas()(x_mel)
+     x = torch.cat((x_mel, x_deltas)).view(1, 2, 60, 216)
+
+     y_pred = model(x)
+     y_pred_softmax = torch.log_softmax(y_pred, dim=1)
+     _, y_pred_tags = torch.max(y_pred_softmax, dim=1)
+
+     category_map = {
+         0: "Clapping 👏",
+         1: "Footsteps 🦶",
+         2: "Brushing Teeth 🪥",
+         3: "Drinking Sipping 🧃",
+         4: "Laughing 😂",
+         5: "Breathing 🌬️",
+         6: "Crying Baby 😭",
+         7: "Coughing 🤧",
+         8: "Snoring 😴",
+         9: "Sneezing 🤧",
+     }
+
+     st.write("**Predicted class:**", category_map[y_pred_tags.item()])
+
+ st.text("")
+ st.text("")
+ st.text("")
+ st.markdown(
+     """`Created by` [Santiago Viquez](https://twitter.com/santiviquez)
+     and [Ivan Padezhki](https://github.com/ivanpadezhki)
+     | `Code:` [GitHub](https://github.com/santiviquez/noisy-human-recognition)"""
+ )
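
A note on the hard-coded `view(1, 2, 60, 216)` above: with torchaudio's defaults (`hop_length = n_fft // 2 = 512`, `center=True`), a 5-second clip at 22050 Hz yields `110250 // 512 + 1 = 216` spectrogram frames, and `n_mels=60` gives the 60 mel bins; the mel spectrogram and its deltas supply the two channels. A minimal sketch checking those shapes (not part of the commit; it assumes the torchaudio defaults just named):

import torch
import torchaudio.transforms as transforms

RATE = 22050
waveform = torch.randn(5 * RATE)  # stand-in for a decoded 5 s clip
x_mel = transforms.MelSpectrogram(sample_rate=RATE, n_fft=1024, n_mels=60)(waveform)
x_deltas = transforms.ComputeDeltas()(x_mel)
print(x_mel.shape)  # torch.Size([60, 216])
x = torch.cat((x_mel, x_deltas)).view(1, 2, 60, 216)  # (batch, channels, mels, frames)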
cnn.py ADDED
@@ -0,0 +1,67 @@
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+
+ class CNN(nn.Module):
+     def __init__(self, input_channels):
+         super(CNN, self).__init__()
+         self.input_channels = input_channels
+         # Three conv blocks, each followed by batch norm and max pooling
+         self.conv1 = nn.Conv2d(self.input_channels, 32, kernel_size=(3, 3))
+         self.batchnorm1 = nn.BatchNorm2d(32)
+         self.pool1 = nn.MaxPool2d(kernel_size=(3, 3))
+         self.dropout1 = nn.Dropout(0.3)
+         self.conv2 = nn.Conv2d(32, 64, kernel_size=(3, 3))
+         self.batchnorm2 = nn.BatchNorm2d(64)
+         self.pool2 = nn.MaxPool2d(kernel_size=(1, 3))
+         self.conv3 = nn.Conv2d(64, 128, kernel_size=(3, 3))
+         self.batchnorm3 = nn.BatchNorm2d(128)
+         self.pool3 = nn.MaxPool2d(kernel_size=2)
+         self.dropout2 = nn.Dropout(0.3)
+         # Global average pooling, then a three-layer classifier head
+         self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
+         self.fc1 = nn.Linear(128, 256)
+         self.fc2 = nn.Linear(256, 512)
+         self.dropout3 = nn.Dropout(0.5)
+         self.fc3 = nn.Linear(512, 10)
+         self.apply(self._init_weights)
+
+     def _init_weights(self, module):
+         # Xavier init for conv layers, constant init for batch norm,
+         # and uniform(-1/sqrt(n), 1/sqrt(n)) for linear layers
+         if isinstance(module, nn.Conv2d):
+             nn.init.xavier_normal_(module.weight.data)
+             if module.bias is not None:
+                 nn.init.constant_(module.bias.data, 0)
+
+         elif isinstance(module, nn.BatchNorm2d):
+             nn.init.constant_(module.weight.data, 1)
+             nn.init.constant_(module.bias.data, 0)
+
+         elif isinstance(module, nn.Linear):
+             n = module.in_features
+             y = 1.0 / n ** 0.5
+             module.weight.data.uniform_(-y, y)
+             module.bias.data.fill_(0)
+
+     def forward(self, x):
+         x = self.conv1(x)
+         x = F.relu(x)
+         x = self.batchnorm1(x)
+         x = self.pool1(x)
+         x = self.dropout1(x)
+         x = self.conv2(x)
+         x = F.relu(x)
+         x = self.batchnorm2(x)
+         x = self.pool2(x)
+         x = self.conv3(x)
+         x = F.relu(x)
+         x = self.batchnorm3(x)
+         x = self.pool3(x)
+         x = self.dropout2(x)
+         x = self.avgpool(x)
+         x = x.view(x.size(0), -1)  # flatten to (batch, 128)
+         x = self.fc1(x)
+         x = F.relu(x)
+         x = self.dropout3(x)
+         x = self.fc2(x)
+         x = F.relu(x)
+         x = self.dropout3(x)
+         x = self.fc3(x)
+         return x
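
A quick way to confirm the network accepts the feature shape that `app.py` builds is a forward pass on random input. This smoke test is not part of the commit; it only assumes the `CNN` class above and the `(1, 2, 60, 216)` input shape used in the app:

import torch
from cnn import CNN

model = CNN(input_channels=2)
model.eval()
with torch.no_grad():
    logits = model(torch.randn(1, 2, 60, 216))  # same shape app.py feeds the model
print(logits.shape)  # torch.Size([1, 10]), one logit per sound class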