Update README.md
Browse files
README.md
CHANGED
@@ -21,6 +21,7 @@ A Vision Transformer (ViT) for audio. Pretrained on AudioSet-2M with Self-Superv
|
|
21 |
|
22 |
## Model Usage
|
23 |
### Audio Classification and Embeddings
|
|
|
24 |
```python
|
25 |
from urllib.request import urlopen
|
26 |
import timm
|
@@ -32,14 +33,14 @@ img = Image.open(urlopen(
|
|
32 |
'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'
|
33 |
))
|
34 |
|
35 |
-
# NOTE: `global_pool='avg'`
|
36 |
# if only embeddings are needed, pass `num_classes=0`
|
37 |
-
model = timm.create_model("hf_hub:gaunernst/vit_base_patch16_1024_128.audiomae_as2m_ft_as20k", pretrained=True
|
38 |
model = model.eval()
|
39 |
|
40 |
-
# TODO:
|
41 |
audio = torch.randn(1, 10 * 16_000)
|
42 |
-
melspec = fbank(
|
43 |
audio,
|
44 |
htk_compat=True,
|
45 |
sample_frequency=16_000,
|
|
|
21 |
|
22 |
## Model Usage
|
23 |
### Audio Classification and Embeddings
|
24 |
+
|
25 |
```python
|
26 |
from urllib.request import urlopen
|
27 |
import timm
|
|
|
33 |
'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'
|
34 |
))
|
35 |
|
36 |
+
# NOTE: for timm<0.9.11, you also need to pass `global_pool='avg'`
|
37 |
# if only embeddings are needed, pass `num_classes=0`
|
38 |
+
model = timm.create_model("hf_hub:gaunernst/vit_base_patch16_1024_128.audiomae_as2m_ft_as20k", pretrained=True)
|
39 |
model = model.eval()
|
40 |
|
41 |
+
# TODO: HF preprocessor (AST)
|
42 |
audio = torch.randn(1, 10 * 16_000)
|
43 |
+
melspec = kaldi.fbank(
|
44 |
audio,
|
45 |
htk_compat=True,
|
46 |
sample_frequency=16_000,
|