gaunernst committed on
Commit
3058f72
1 Parent(s): 6746e7c

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +5 -4
README.md CHANGED
@@ -21,6 +21,7 @@ A Vision Transformer (ViT) for audio. Pretrained on AudioSet-2M with Self-Superv
21
 
22
  ## Model Usage
23
  ### Audio Classification and Embeddings
 
24
  ```python
25
  from urllib.request import urlopen
26
  import timm
@@ -32,14 +33,14 @@ img = Image.open(urlopen(
32
  'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'
33
  ))
34
 
35
- # NOTE: `global_pool='avg'` is required
36
  # if only embeddings are needed, pass `num_classes=0`
37
- model = timm.create_model("hf_hub:gaunernst/vit_base_patch16_1024_128.audiomae_as2m_ft_as20k", pretrained=True, global_pool='avg')
38
  model = model.eval()
39
 
40
- # TODO: torchaudio.compliance.kaldi.fbank
41
  audio = torch.randn(1, 10 * 16_000)
42
- melspec = fbank(
43
  audio,
44
  htk_compat=True,
45
  sample_frequency=16_000,
 
21
 
22
  ## Model Usage
23
  ### Audio Classification and Embeddings
24
+
25
  ```python
26
  from urllib.request import urlopen
27
  import timm
 
33
  'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'
34
  ))
35
 
36
+ # NOTE: for timm<0.9.11, you also need to pass `global_pool='avg'`
37
  # if only embeddings are needed, pass `num_classes=0`
38
+ model = timm.create_model("hf_hub:gaunernst/vit_base_patch16_1024_128.audiomae_as2m_ft_as20k", pretrained=True)
39
  model = model.eval()
40
 
41
+ # TODO: HF preprocessor (AST)
42
  audio = torch.randn(1, 10 * 16_000)
43
+ melspec = kaldi.fbank(
44
  audio,
45
  htk_compat=True,
46
  sample_frequency=16_000,