Update Readme

README.md
extra_gated_fields:
  Company/university: text
  Website: text
---

ConvNeXt-Tiny-AT is an audio tagging CNN model, trained on AudioSet (balanced + unbalanced subsets). It reached 0.471 mAP on the AudioSet test set.

The model expects 10-second audio clips sampled at 32 kHz as input.
It outputs logits and probabilities for the 527 audio event tags of AudioSet (see http://research.google.com/audioset/index.html).
Two methods are also provided to get scene embeddings (a single vector per file) and frame-level embeddings; see below.
The scene embedding is obtained from the frame-level embeddings: mean pooling is applied over the frequency dimension, followed by mean pooling combined with max pooling over the time dimension (see the sketch after the frame-level example below).

# Install

This code is based on our repo: https://github.com/topel/audioset-convnext-inf

Note that the checkpoint is also available on Zenodo: https://zenodo.org/record/8020843/files/convnext_tiny_471mAP.pth?download=1

```bash
pip install git+https://github.com/topel/audioset-convnext-inf@pip-install
```
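
The Zenodo checkpoint above can also be fetched programmatically. This is a hedged sketch, not from the repo: whether the `.pth` file holds a raw state dict or a wrapped checkpoint is not verified here, and `from_pretrained` below remains the simpler path.

```python
import torch

# Download the Zenodo checkpoint into the local torch hub cache (sketch only;
# the layout of the loaded object is an assumption).
CKPT_URL = "https://zenodo.org/record/8020843/files/convnext_tiny_471mAP.pth?download=1"
checkpoint = torch.hub.load_state_dict_from_url(CKPT_URL, map_location="cpu")
```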

# Usage

Below is an example of how to instantiate our model `convnext_tiny_471mAP.pth`.

```python
# 1. visit hf.co/topel/ConvNeXt-Tiny-AT and accept user conditions
# 2. visit hf.co/settings/tokens to create an access token
# 3. instantiate pretrained model

import os

import numpy as np
import torch
import torchaudio

from audioset_convnext_inf.pytorch.convnext import ConvNeXt

model = ConvNeXt.from_pretrained(
    "topel/ConvNeXt-Tiny-AT",
    map_location="cpu",
    use_auth_token="ACCESS_TOKEN_GOES_HERE",
)

print(
    "# params:",
    sum(param.numel() for param in model.parameters() if param.requires_grad),
)

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

if "cuda" in str(device):
    model = model.to(device)
```

Output:
```
# params: 28222767
```

## Inference: get logits and probabilities

```python
sample_rate = 32000
audio_target_length = 10 * sample_rate  # 10 s

AUDIO_FNAME = "f62-S-v2swA_200000_210000.wav"
AUDIO_FPATH = os.path.join("/path/to/audio", AUDIO_FNAME)

waveform, sample_rate_ = torchaudio.load(AUDIO_FPATH)
if sample_rate_ != sample_rate:
    print("ERROR: the sampling rate is not 32 kHz:", sample_rate_)

waveform = waveform.to(device)

print("\nInference on " + AUDIO_FNAME + "\n")

with torch.no_grad():
    model.eval()
    output = model(waveform)

logits = output["clipwise_logits"]
print("logits size:", logits.size())

probs = output["clipwise_output"]
# Equivalent: probs = torch.sigmoid(logits)
print("probs size:", probs.size())

threshold = 0.25
sample_labels = np.where(probs[0].clone().detach().cpu() > threshold)[0]
print("Predicted labels using activity threshold 0.25:\n")
print(sample_labels)
```

Output:
```
logits size: torch.Size([1, 527])
probs size: torch.Size([1, 527])
Predicted labels using activity threshold 0.25:

[ 0 137 138 139 151 506]
```
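
To map these indices to human-readable tags, one option is Google's official AudioSet label CSV. The snippet below is a hypothetical helper, not part of the repo; it assumes `pandas` and the public label map URL.

```python
import pandas as pd

# Hypothetical helper: look up display names for the predicted class indices
# in Google's official AudioSet label map.
LABELS_URL = "http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/class_labels_indices.csv"
class_map = pd.read_csv(LABELS_URL)

# `sample_labels` comes from the inference snippet above.
print(class_map.loc[sample_labels, "display_name"].tolist())
```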

## Get audio scene embeddings

```python
with torch.no_grad():
    model.eval()
    output = model.forward_scene_embeddings(waveform)

print("\nScene embedding, shape:", output.size())
```

Output:
```
Scene embedding, shape: torch.Size([1, 768])
```
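
Scene embeddings can serve as fixed clip-level features. Below is a hypothetical usage sketch; `other_waveform` stands for a second 10-second clip loaded the same way as above.

```python
import torch.nn.functional as F

# Hypothetical sketch: compare two clips via their scene embeddings.
with torch.no_grad():
    emb_a = model.forward_scene_embeddings(waveform)
    emb_b = model.forward_scene_embeddings(other_waveform)  # second clip (assumed)

similarity = F.cosine_similarity(emb_a, emb_b)  # shape (1,)
print("cosine similarity:", similarity.item())
```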

## Get frame-level embeddings

```python
with torch.no_grad():
    model.eval()
    output = model.forward_frame_embeddings(waveform)

print("\nFrame-level embeddings, shape:", output.size())
```

Output:
```
Frame-level embeddings, shape: torch.Size([1, 768, 31, 7])
```
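
As described in the introduction, the scene embedding is derived from these frame-level embeddings. The following is a minimal sketch of that pooling, assuming the layout above is `(batch, channels, time, frequency)`; summing the mean- and max-pooled time vectors is an assumption consistent with the 768-dimensional scene embedding.

```python
import torch

# Minimal sketch of the pooling described in the introduction (assumed layout
# and summation; `forward_scene_embeddings` is the supported way to get this).
frame_embs = torch.randn(1, 768, 31, 7)  # stand-in for forward_frame_embeddings output

x = frame_embs.mean(dim=3)             # mean pooling over the frequency dim -> (1, 768, 31)
scene = x.mean(dim=2) + x.amax(dim=2)  # mean + max pooling over the time dim -> (1, 768)
print(scene.shape)  # torch.Size([1, 768])
```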

## Citation

```bibtex
@inproceedings{Pellegrini2023,
  Title = {{Adapting a ConvNeXt model to audio classification on AudioSet}},
  Author = {{Pellegrini}, Thomas and {Khalfaoui-Hassani}, Ismail and {Labb\'e}, Etienne and {Masquelier}, Timoth\'ee},
  Booktitle = {Proc. Interspeech 2023},
  Address = {Dublin},
  Month = {August},
  Year = {2023}
}
```