Commit ad5370b by gaunernst (1 parent: 3058f72)

Update README.md

Files changed (1):
  1. README.md +10 -20
README.md CHANGED
@@ -23,34 +23,24 @@ A Vision Transformer (ViT) for audio. Pretrained on AudioSet-2M with Self-Superv
 ### Audio Classification and Embeddings
 
 ```python
-from urllib.request import urlopen
 import timm
-from torchaudio.compliance import kaldi
 import torch
-
-# TODO: change this to audio
-img = Image.open(urlopen(
-    'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'
-))
+import torch.nn.functional as F
+from torchaudio.compliance import kaldi
 
 # NOTE: for timm<0.9.11, you also need to pass `global_pool='avg'`
 # if only embeddings are needed, pass `num_classes=0`
 model = timm.create_model("hf_hub:gaunernst/vit_base_patch16_1024_128.audiomae_as2m_ft_as20k", pretrained=True)
 model = model.eval()
 
-# TODO: HF preprocessor (AST)
-audio = torch.randn(1, 10 * 16_000)
-melspec = kaldi.fbank(
-    audio,
-    htk_compat=True,
-    sample_frequency=16_000,
-    use_energy=False,
-    window_type='hanning',
-    num_mel_bins=128,
-    dither=0.0,
-    frame_shift=10,
-) # shape (n_frames, 128)
-melspec = melspec[:1024] # AudioMAE only accepts 1024-frame input
+audio = torch.randn(1, 10 * 16_000) # make sure input is 16kHz
+melspec = kaldi.fbank(audio, htk_compat=True, window_type="hanning", num_mel_bins=128) # shape (n_frames, 128)
+
+# AudioMAE only accepts 1024-frame input
+if melspec.shape[0] < 1024:
+    melspec = F.pad(melspec, (0, 0, 0, 1024 - melspec.shape[0]))
+else:
+    melspec = melspec[:1024]
 
 melspec = melspec.view(1, 1, 1024, 128) # add batch dim and channel dim
 output = model(melspec)
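For reference, below is a minimal end-to-end sketch (not part of the commit) showing the two things the updated snippet's comments only hint at: loading a real recording instead of `torch.randn` noise, and passing `num_classes=0` to get embeddings rather than AudioSet logits. `audio.wav` is a hypothetical path, and resampling is assumed to be done with `torchaudio.functional.resample`.

```python
import timm
import torch
import torch.nn.functional as F
import torchaudio
from torchaudio.compliance import kaldi

# Load a real recording ("audio.wav" is a placeholder path), downmix to mono,
# and resample to the 16kHz the model was trained on.
audio, sr = torchaudio.load("audio.wav")  # shape (n_channels, n_samples)
audio = audio.mean(0, keepdim=True)       # downmix to mono
audio = torchaudio.functional.resample(audio, sr, 16_000)

# num_classes=0 removes the classifier head, so the forward pass
# returns the pooled embedding instead of logits.
model = timm.create_model(
    "hf_hub:gaunernst/vit_base_patch16_1024_128.audiomae_as2m_ft_as20k",
    pretrained=True,
    num_classes=0,
).eval()

melspec = kaldi.fbank(audio, htk_compat=True, window_type="hanning", num_mel_bins=128)
if melspec.shape[0] < 1024:  # pad or truncate to the 1024 frames AudioMAE expects
    melspec = F.pad(melspec, (0, 0, 0, 1024 - melspec.shape[0]))
else:
    melspec = melspec[:1024]

with torch.inference_mode():
    emb = model(melspec.view(1, 1, 1024, 128))  # shape (1, 768) for this ViT-B
```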