lijialudew
/

wav2vec_LittleBeats_LENA

Audio Classification

Fairseq

English

Model card Files Files and versions Community

lijialudew committed on Jan 21, 2023

Commit

5490fe0

•

1 Parent(s): 6333913

Update README.md

Browse files

Files changed (1) hide show

README.md +85 -343

README.md CHANGED Viewed

@@ -39,350 +39,92 @@ We develop fine-tuning recipe using SpeechBrain toolkit available at
 ## Quick Start [optional]
 <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
-If you wish to use fairseq framework, the following code snippet can be used to load our pretrained model
-'''
-"""This lobe enables the integration of fairseq pretrained wav2vec models.
-Reference: https://arxiv.org/abs/2006.11477
-Reference: https://arxiv.org/abs/1904.05862
-FairSeq >= 1.0.0 needs to be installed: https://fairseq.readthedocs.io/en/latest/
-Authors
- * Titouan Parcollet 2021
- * Salima Mdhaffar 2021
-"""
-import torch
-import torch.nn.functional as F
-from torch import nn
-from speechbrain.utils.data_utils import download_file
-import pdb
-# We check if fairseq is installed.
-try:
-    import fairseq
-except ImportError:
-    MSG = "Please install Fairseq to use pretrained wav2vec\n"
-    MSG += "E.G. run: pip install fairseq"
-    raise ImportError(MSG)
-class FairseqWav2Vec2(nn.Module):
-    """This lobe enables the integration of fairseq pretrained wav2vec2.0 models.
-    Source paper: https://arxiv.org/abs/2006.11477
-    FairSeq >= 1.0.0 needs to be installed:
-    https://fairseq.readthedocs.io/en/latest/
-    The model can be used as a fixed features extractor or can be finetuned. It
-    will download automatically the model if a url is given (e.g FairSeq
-    repository from GitHub).
-    Arguments
-    ---------
-    pretrained_path : str
-        Path of the pretrained wav2vec2 model. It can be a url or a local path.
-    save_path : str
-        Path and filename of the downloaded model.
-    input_norm : bool (default: None)
-        If True, a layer_norm (affine) will be applied to the input waveform.
-        By default, it is extracted from the checkpoint of the downloaded model
-        in order to match the pretraining conditions. However, if this information
-        is not given in the checkpoint, it has to be given manually.
-    output_norm : bool (default: True)
-        If True, a layer_norm (affine) will be applied to the output obtained
-        from the wav2vec model.
-    freeze : bool (default: True)
-        If True, the model is frozen. If False, the model will be trained
-        alongside with the rest of the pipeline.
-    pretrain : bool (default: True)
-        If True, the model is pretrained with the specified source.
-        If False, the randomly-initialized model is instantiated.
-    dropout : float (default: None)
-        If different from None (0.0 to 1.0), it will override the given fairseq
-        dropout rates. This is useful if the wav2vec2 model has been trained
-        without dropout and one wants to reactivate it for downstream task
-        fine-tuning (better performance observed).
-    Example
-    -------
-    >>> inputs = torch.rand([10, 600])
-    >>> model_url = "https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_small.pt"
-    >>> save_path = "models_checkpoints/wav2vec2.pt"
-    >>> model = FairseqWav2Vec2(model_url, save_path)
-    >>> outputs = model(inputs)
-    >>> outputs.shape
-    torch.Size([10, 100,  768])
-    """
-    def __init__(
-        self,
-        pretrained_path,
-        save_path,
-        input_norm=None,
-        output_norm=True,
-        freeze=True,
-        pretrain=True,
-        dropout=None,
-        encoder_dropout = None,
-        output_all_hiddens=False,
-        tgt_layer=None,
-        include_CNN_layer=True,
-    ):
-        super().__init__()
-        # Download the pretrained wav2vec2 model. It can be local or online.
-        download_file(pretrained_path, save_path)
-        # During pretraining dropout might be set to 0. However, we might want
-        # to apply dropout when fine-tuning on a downstream task. Hence we need
-        # to modify the fairseq cfg to activate dropout (if requested).
-        overrides={}
-        if encoder_dropout is not None:
-            overrides = {
-                "model": {
-                    "encoder_layerdrop": encoder_dropout,
-                }
-            }
-        if not freeze:
-            if dropout is not None and encoder_dropout is not None:
-                overrides = {
-                    "model": {
-                        "dropout": dropout,
-                        "encoder_layerdrop": encoder_dropout,
-                        "dropout_input": dropout,
-                        "attention_dropout": dropout,
-                    }
-                }
-            elif dropout is not None:
-                overrides = {
-                    "model": {
-                        "dropout": dropout,
-                        "dropout_input": dropout,
-                        "attention_dropout": dropout,
-                    }
-                }
-        (
-            model,
-            cfg,
-            task,
-        ) = fairseq.checkpoint_utils.load_model_ensemble_and_task(
-            [save_path], arg_overrides=overrides
-        )
-        # wav2vec pretrained models may need the input waveform to be normalized
-        # Hence, we check if the model has be trained with or without it.
-        # If the information isn't contained in the checkpoint IT HAS TO BE GIVEN
-        # BY THE USER.
-        if input_norm is None:
-            if hasattr(cfg["task"], "normalize"):
-                self.normalize = cfg["task"].normalize
-            elif hasattr(cfg, "normalize"):
-                self.normalize = cfg.normalize
-            else:
-                self.normalize = False
-        else:
-            self.normalize = input_norm
-        model = model[0]
-        self.model = model
-        self.freeze = freeze
-        self.output_norm = output_norm
-        if self.freeze:
-            self.model.eval()
-            # Freeze parameters
-            for param in model.parameters():
-                param.requires_grad = False
-        else:
-            self.model.train()
-            for param in model.parameters():
-                param.requires_grad = True
-        # Randomly initialized layers if pretrain is False
-        if not (pretrain):
-            self.reset_layer(self.model)
-        # Following the fairseq implementation of downstream training,
-        # we remove some modules that are unnecessary.
-        self.remove_pretraining_modules()
-        self.output_all_hiddens = output_all_hiddens
-        self.tgt_layer = tgt_layer
-        self.include_CNN_layer = include_CNN_layer
-    def forward(self, wav):
-        """Takes an input waveform and return its corresponding wav2vec encoding.
-        Arguments
-        ---------
-        wav : torch.Tensor (signal)
-            A batch of audio signals to transform to features.
-        """
-        # If we freeze, we simply remove all grads and features from the graph.
-        if self.freeze:
-            with torch.no_grad():
-                return self.extract_features(wav).detach()
-        return self.extract_features(wav)
-    def extract_features(self, wav):
-        """Extracts the wav2vect embeddings"""
-        # We normalize the input signal if needed.
-        if self.normalize:
-            wav = F.layer_norm(wav, wav.shape)
-        # Extract wav2vec output
-        if self.tgt_layer=="CNN": #initial embeddings from conv
-            out = self.model.extract_features(wav, padding_mask=None, mask=False)
-            out = self.model.post_extract_proj(out['features'])
-        elif isinstance(self.tgt_layer, int):
-            out = self.model.extract_features(wav, padding_mask=None, mask=False, layer=self.tgt_layer)['x']
-        else: #
-            out = self.model.extract_features(wav, padding_mask=None, mask=False, layer=self.tgt_layer)
-            if self.output_all_hiddens or isinstance(self.tgt_layer, list):
-                out = self.aggregate_features(out, include_CNN_layer=self.include_CNN_layer) # 13, B, T, D
-                if isinstance(self.tgt_layer, list):
-                    out = out[self.tgt_layer]
-            else:
-                out = out['x']
-        # We normalize the output if required
-        if self.output_norm:
-            out = F.layer_norm(out, out.shape)
-        return out
-    def aggregate_features(self, out, include_CNN_layer=True):
-        features = []
-        if include_CNN_layer:
-            features = [self.model.post_extract_proj(out['features'])]
-        self.model.layerdrop = 0
-        for i in range(len(out['layer_results'])):
-            curr_feature = out['layer_results'][i][0].transpose(0,1)
-            features.append(curr_feature)
-        features = torch.stack(features)
-        return features
-    def reset_layer(self, model):
-        """Reinitializes the parameters of the network"""
-        if hasattr(model, "reset_parameters"):
-            model.reset_parameters()
-        for child_layer in model.children():
-            if model != child_layer:
-                self.reset_layer(child_layer)
-    def remove_pretraining_modules(self):
-        """ Remove uneeded modules. Inspired by the same fairseq function."""
-        self.model.quantizer = None
-        self.model.project_q = None
-        self.model.target_glu = None
-        self.model.final_proj = None
-class FairseqWav2Vec1(nn.Module):
-    """This lobes enables the integration of fairseq pretrained wav2vec1.0 models.
-    Arguments
-    ---------
-    pretrained_path : str
-        Path of the pretrained wav2vec1 model. It can be a url or a local path.
-    save_path : str
-        Path and filename of the downloaded model.
-    output_norm : bool (default: True)
-        If True, a layer_norm (affine) will be applied to the output obtained
-        from the wav2vec model.
-    freeze : bool (default: True)
-        If True, the model is frozen. If False, the model will be trained
-        alongside with the rest of the pipeline.
-    pretrain : bool (default: True)
-        If True, the model is pretrained with the specified source.
-        If False, the randomly-initialized model is instantiated.
-    Example
-    -------
-    >>> inputs = torch.rand([10, 600])
-    >>> model_url = ""
-    >>> save_path = "models_checkpoints/wav2vec.pt"
-    >>> model = FairseqWav2Vec1(model_url, save_path)
-    >>> outputs = model(inputs)
-    >>> outputs.shape
-    torch.Size([10, 100, 512])
-    """
-    def __init__(
-        self,
-        pretrained_path,
-        save_path,
-        output_norm=True,
-        freeze=True,
-        pretrain=True,
-    ):
-        super().__init__()
-        self.freeze = freeze
-        self.output_norm = output_norm
-        # Download the pretrained wav2vec1 model. It can be local or online.
-        download_file(pretrained_path, save_path)
-        (
-            model,
-            cfg,
-            task,
-        ) = fairseq.checkpoint_utils.load_model_ensemble_and_task(
-            [pretrained_path]
-        )
-        self.model = model
-        self.model = self.model[0]
-        if self.freeze:
-            model.eval()
-        # Randomly initialized layers if pretrain is False
-        if not (pretrain):
-            self.reset_layer(self.model)
-    def forward(self, wav):
-        """Takes an input waveform and return its corresponding wav2vec encoding.
-        Arguments
-        ---------
-        wav : torch.Tensor (signal)
-            A batch of audio signals to transform to features.
-        """
-        # If we freeze, we simply remove all grads and features from the graph.
-        if self.freeze:
-            with torch.no_grad():
-                return self.extract_features(wav).detach()
-        return self.extract_features(wav)
-    def extract_features(self, wav):
-        """Extracts the wav2vect embeddings"""
-        out = self.model.feature_extractor(wav)
-        out = self.model.feature_aggregator(out).squeeze(0)
-        out = out.transpose(2, 1)
-        # We normalize the output if required
-        if self.output_norm:
-            out = F.layer_norm(out, out.shape)
-        return out
-    def reset_layer(self, model):
-        """Reinitializes the parameters of the network"""
-        if hasattr(model, "reset_parameters"):
-            model.reset_parameters()
-        for child_layer in model.children():
-            if model != child_layer:
-                self.reset_layer(child_layer)
-'''
 # Evaluation
 <!-- This section describes the evaluation protocols and provides the results. -->

 ## Quick Start [optional]
 <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+If you wish to use the fairseq framework, the following code snippet can be used to load our pretrained model and extract features from it.
+  <pre><code>
+  import torch
+  import torch.nn.functional as F
+  from torch import nn
+  import fairseq
+  import torchaudio
+  def load_model(model_path, freeze=True):
+      '''
+      This function loads pretrained model using fairseq framework.
+      Arguments
+      ---------
+      model_path : str
+          Path and filename of the pretrained model
+      freeze : bool (default: True)
+          If True, the model is frozen with no parameter updates through training.
+      '''
+      model,_,_ = fairseq.checkpoint_utils.load_model_ensemble_and_task([model_path])
+      model = model[0]
+      if freeze:
+          model.eval()
+          # Freeze parameters
+          for param in model.parameters():
+              param.requires_grad = False
+      else:
+          model.train()
+          for param in model.parameters():
+              param.requires_grad = True
+      #remove unnecessary components
+      model.quantizer = None
+      model.project_q = None
+      model.target_glu = None
+      model.final_proj = None
+      return model
+  def extract_features(model, wav, input_norm=None, output_norm=True, tgt_layer=None, output_all_hiddens=False):
+      '''
+      This function extracts features from w2v2 model. The function extracts the last transformer layer
+      feature by default. It allows for extracting features from certain layer, or features from all layers
+      Arguments
+      ---------
+      model : fairseq wav2vec
+      wav : tensor
+          audio wav for feature extraction
+      input_norm : bool (default: None)
+          If True, a layer_norm (affine) will be applied to the input waveform.
+      output_norm : bool (default: True)
+          If True, a layer_norm (affine) will be applied to the output obtained
+          from the wav2vec model.
+      tgt_layer : int (default: None)
+          Target transformer layer features, 0-indexed.
+      output_all_hiddens : bool (default: False)
+          Whether to extract features from all layers. Need to set tgt_layer as None
+      '''
+      if input_norm:
+          wav = F.layer_norm(wav, wav.shape)
+      # Extract wav2vec output
+      out = model.extract_features(wav, padding_mask=None, mask=False)['x']
+      if isinstance(tgt_layer, int):
+          out = model.extract_features(wav, padding_mask=None, mask=False, layer=tgt_layer)['x']
+      elif output_all_hiddens:
+          features = []
+          model.layerdrop = 0
+          for i in range(len(out['layer_results'])):
+              curr_feature = out['layer_results'][i][0].transpose(0,1)
+              features.append(curr_feature)
+          out = torch.stack(features)
+      if output_norm:
+          out = F.layer_norm(out, out.shape)
+      return out
+  model=load_model("your/path/to/LL_4300/checkpoint_best.pt")
+  audio, fs = torchaudio.load("sample.wav")
+  audio = audio.transpose(0,1).squeeze(1)
+  features = extract_features(model, audio)
+  </code></pre>
 # Evaluation
 <!-- This section describes the evaluation protocols and provides the results. -->