gaunernst
/

vit_base_patch16_1024_128.audiomae_as2m_ft_as20k

Audio Classification

Model card Files Files and versions Community

gaunernst committed on Nov 24, 2023

Commit

81e2a6f

•

1 Parent(s): 8333b77

Update README.md

Files changed (1) hide show

README.md +4 -0

README.md CHANGED Viewed

@@ -33,6 +33,9 @@ from torchaudio.compliance import kaldi
 model = timm.create_model("hf_hub:gaunernst/vit_base_patch16_1024_128.audiomae_as2m_ft_as20k", pretrained=True)
 model = model.eval()
 audio = torch.randn(1, 10 * 16_000)  # make sure input is 16kHz
 melspec = kaldi.fbank(audio, htk_compat=True, window_type="hanning", num_mel_bins=128)  # shape (n_frames, 128)
@@ -41,6 +44,7 @@ if melspec.shape[0] < 1024:
     melspec = F.pad(melspec, (0, 0, 0, 1024 - melspec.shape[0]))
 else:
     melspec = melspec[:1024]
 melspec = melspec.view(1, 1, 1024, 128)  # add batch dim and channel dim
 output = model(melspec)

 model = timm.create_model("hf_hub:gaunernst/vit_base_patch16_1024_128.audiomae_as2m_ft_as20k", pretrained=True)
 model = model.eval()
+MEAN = -4.2677393
+STD = 4.5689974
 audio = torch.randn(1, 10 * 16_000)  # make sure input is 16kHz
 melspec = kaldi.fbank(audio, htk_compat=True, window_type="hanning", num_mel_bins=128)  # shape (n_frames, 128)
     melspec = F.pad(melspec, (0, 0, 0, 1024 - melspec.shape[0]))
 else:
     melspec = melspec[:1024]
+melspec = (melspec - MEAN) / (STD * 2)
 melspec = melspec.view(1, 1, 1024, 128)  # add batch dim and channel dim
 output = model(melspec)