initial commit
- .gitattributes +1 -0
- app.py +168 -0
- examples/car_horn.wav +3 -0
- examples/children_playing.wav +3 -0
- examples/dog_bark.wav +3 -0
- examples/siren.wav +3 -0
- examples/street_music.wav +3 -0
- requirements.txt +2 -0
.gitattributes
CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+examples/*.wav filter=lfs diff=lfs merge=lfs -text
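(The new rule routes the example WAV files through Git LFS, which is why each examples/*.wav entry below appears as a three-line LFS pointer stub, giving the spec version, object hash, and size, rather than raw audio.)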
app.py
ADDED
@@ -0,0 +1,168 @@
+# AUTOGENERATED! DO NOT EDIT! File to edit: app.ipynb.
+
+# %% auto 0
+__all__ = ['data', 'audios', 'metadata', 'to_consider', 'processed_metadata', 'repo_id', 'learner', 'categories', 'title',
+           'description', 'mic', 'label', 'examples', 'intf', 'process_audio_exists', 'load_x', 'load_label_tfm',
+           'classify_audio']
+
+# %% app.ipynb 1
+import torch
+import gradio as gr
+from gradio import CSVLogger
+from fastai.vision.all import *
+import torchaudio
+import torchaudio.transforms as T
+import warnings
+from huggingface_hub import from_pretrained_fastai
+
+# %% app.ipynb 2
+warnings.filterwarnings("ignore")
+
+# %% app.ipynb 3
+def process_audio_exists(audio):
+    slice_name = audio.name
+
+    # check whether the slice name exists in the processed metadata file
+    row = processed_metadata.loc[processed_metadata['slice_file_name'] == slice_name].index.any()
+
+    return row
+
+# %% app.ipynb 4
+data = Path('examples')
+audios = get_files(data, extensions='.wav')
+
+metadata = pd.read_csv('UrbanSound8K.csv')
+to_consider = ['siren', 'street_music', 'children_playing', 'dog_bark', 'car_horn']
+processed_metadata = metadata.loc[metadata['class'].isin(to_consider)]
+processed_metadata.loc[processed_metadata['class'] == 'siren', 'classID'] = 4
+processed_metadata.loc[processed_metadata['class'] == 'street_music', 'classID'] = 0
+
+# %% app.ipynb 5
+class load_x(Transform):
+    def __init__(self):
+        self.sr = 44100
+        self.max_ms = 4000
+        self.channels = 2
+
+    def rechannel(self, waveform, sr):
+        if waveform.shape[0] == self.channels:
+            # no rechanneling needed
+            return waveform, sr
+
+        if self.channels == 1:
+            # convert stereo to mono
+            # by selecting the first channel
+            new_waveform = waveform[:1, :]
+
+        elif self.channels == 2:
+            # convert mono to stereo
+            # by duplicating the first channel
+            new_waveform = torch.cat([waveform, waveform])
+        return new_waveform, sr
+
+    def resample(self, waveform, sr):
+        if sr == self.sr:
+            # no resampling needed
+            return waveform, sr
+
+        num_channels = waveform.shape[0]
+
+        # resample the first channel
+        new_waveform = torchaudio.transforms.Resample(sr, self.sr)(waveform[:1, :])
+        if num_channels > 1:
+            # resample the second channel and merge the two
+            re_two = torchaudio.transforms.Resample(sr, self.sr)(waveform[1:, :])
+            new_waveform = torch.cat([new_waveform, re_two])
+
+        return new_waveform, self.sr
+
+    def pad_trunc(self, waveform, sr):
+        num_channels, num_frames = waveform.shape
+        max_len = sr // 1000 * self.max_ms
+
+        if num_frames > max_len:
+            # truncate the signal to the target length
+            waveform = waveform[:, :max_len]
+
+        else:
+            # get padding lengths for the beginning and the end
+            begin_ln = random.randint(0, max_len - num_frames)
+            end_ln = max_len - num_frames - begin_ln
+
+            # pad the audio with zeros
+            pad_begin = torch.zeros((num_channels, begin_ln))
+            pad_end = torch.zeros((num_channels, end_ln))
+
+            waveform = torch.cat((pad_begin, waveform, pad_end), 1)
+
+        return waveform, sr
+
+    def mel_specgram(self, waveform, sr):
+        mel_tfm = T.MelSpectrogram(
+            sample_rate=sr,
+            n_fft=1024,
+            win_length=None,
+            hop_length=512,
+            center=True,
+            pad_mode="reflect",
+            power=2.0,
+            norm="slaney",
+            onesided=True,
+            n_mels=128,
+            mel_scale="htk")
+
+        spec = mel_tfm(waveform)
+        # convert the power spectrogram to decibels
+        waveform = torchaudio.transforms.AmplitudeToDB(top_db=80)(spec)
+
+        return waveform, sr
+
+
+    def encodes(self, x):
+        waveform, sr = torchaudio.load(x)
+        waveform, sr = self.resample(waveform, sr)
+        waveform, sr = self.pad_trunc(waveform, sr)
+        waveform, sr = self.rechannel(waveform, sr)
+        waveform, sr = self.mel_specgram(waveform, sr)
+        return waveform
+
+
+class load_label_tfm(Transform):
+    def __init__(self, metadata=processed_metadata): self.metadata = metadata
+    def encodes(self, x):
+        return self.metadata.loc[self.metadata['slice_file_name'] == x.name]['class'].item()
+
+# %% app.ipynb 6
+repo_id = "Jimmie/urban8k"
+
+learner = from_pretrained_fastai(repo_id)
+
+# %% app.ipynb 14
+categories = tuple(learner.dls.vocab)
+
+def classify_audio(audio):
+    # wrap the uploaded filepath in a Path before prediction
+    audio_path = Path(audio)
+    pred, idx, probs = learner.predict(audio_path)
+    return dict(zip(categories, map(float, probs)))
+
+# %% app.ipynb 16
+title = "Environmental Sound Classification"
+
+description = """
+This demo showcases how AI can be used to recognize environmental sounds. It focuses specifically on 5 classes: car_horn, children_playing, dog_bark, siren and street_music.
+
+
+When uploading audio, make sure it is in .wav format and is less than 4 seconds long.
+
+Enjoy!
+"""
+mic = gr.Audio(source='upload', type="filepath", label='Upload Audio File here')
+label = gr.outputs.Label()
+examples = list(data.ls())
+
+intf = gr.Interface(fn=classify_audio, inputs=mic, outputs=label, examples=examples,
+                    title=title, description=description, cache_examples=False,
+                    auto_submit_duration=5)
+
+intf.launch(inline=False)
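The heart of app.py is the load_x transform chain: resample to 44.1 kHz, pad or truncate to 4 seconds, force two channels, then take a 128-bin log-mel spectrogram. Below is a minimal standalone sketch of that chain, assuming torchaudio <= 0.13 and a hypothetical local file clip.wav; unlike the app, which pads randomly on both sides, the sketch pads only at the end for brevity.

import torch
import torchaudio
import torchaudio.transforms as T

waveform, sr = torchaudio.load("clip.wav")            # (channels, frames)
waveform = T.Resample(sr, 44100)(waveform)            # resample to 44.1 kHz
frames = 44100 // 1000 * 4000                         # 4 s worth of frames
if waveform.shape[1] > frames:
    waveform = waveform[:, :frames]                   # truncate long clips
else:
    pad = frames - waveform.shape[1]
    waveform = torch.nn.functional.pad(waveform, (0, pad))  # zero-pad short clips
if waveform.shape[0] == 1:
    waveform = torch.cat([waveform, waveform])        # duplicate mono to stereo
spec = T.MelSpectrogram(sample_rate=44100, n_fft=1024, hop_length=512,
                        n_mels=128)(waveform)         # power mel spectrogram
spec_db = T.AmplitudeToDB(top_db=80)(spec)            # convert to decibels
print(spec_db.shape)                                  # expected: torch.Size([2, 128, 345])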
examples/car_horn.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:390a545a53dfe52f47a92876691eb40e64d1240c8885be7f72df3654b8fe70f8
+size 705644
examples/children_playing.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a01b7b6f6e9d51a57a7abf1128518c68631f3c7095736f0364479c813e07ab8
+size 768044
examples/dog_bark.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fefae5223783da73b535df8815dea61a285f444f0770228c9d9ec8ea5a2e65c7
+size 705644
examples/siren.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c7b54c8d4a92dbd21fdbe5ba3027a289fe2c4f636d14bacf7205b07543e26f78
+size 768044
examples/street_music.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d976eeb884ede8c4c731bf616e197a40a7a9ecef47b9005e2a1f6acaec8888c3
+size 1152080
requirements.txt
ADDED
@@ -0,0 +1,2 @@
+fastai<=2.7.11
+torchaudio<=0.13.1
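Since the example files live in LFS, cloning the repo without git-lfs installed leaves them as the pointer stubs shown above, which torchaudio.load cannot read. A quick check for that, as a sketch assuming the clone's examples/ directory is the working directory's:

from pathlib import Path

for wav in Path("examples").glob("*.wav"):
    # un-fetched LFS stubs begin with the ASCII spec line; real WAVs begin with b"RIFF"
    kind = "LFS pointer stub" if wav.read_bytes().startswith(b"version https") else "audio"
    print(f"{wav.name}: {kind}")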