Spaces:
Running
Running
""" | |
MERT model configuration | |
""" | |
import functools | |
import operator | |
# from ...configuration_utils import PretrainedConfig | |
# from ...utils import logging | |
from transformers.configuration_utils import PretrainedConfig | |
from transformers.utils import logging | |
logger = logging.get_logger(__name__) | |
# TODO: use this MAP while uploading to Huggingface | |
# HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { | |
# "facebook/hubert-base-ls960": "https://huggingface.co/facebook/hubert-base-ls960/resolve/main/config.json", | |
# # See all Hubert models at https://huggingface.co/models?filter=hubert | |
# } | |
class MERTConfig(PretrainedConfig): | |
r""" | |
""" | |
model_type = "mert_model" | |
def __init__( | |
self, | |
vocab_size=32, | |
hidden_size=768, | |
num_hidden_layers=12, | |
num_attention_heads=12, | |
intermediate_size=3072, | |
hidden_act="gelu", | |
hidden_dropout=0.1, | |
activation_dropout=0.1, | |
attention_dropout=0.1, | |
feat_proj_layer_norm=True, | |
feat_proj_dropout=0.0, | |
final_dropout=0.1, | |
layerdrop=0.1, | |
initializer_range=0.02, | |
layer_norm_eps=1e-5, | |
feat_extract_norm="group", | |
feat_extract_activation="gelu", | |
conv_dim=(512, 512, 512, 512, 512, 512, 512), | |
conv_stride=(5, 2, 2, 2, 2, 2, 2), | |
conv_kernel=(10, 3, 3, 3, 3, 2, 2), | |
conv_bias=False, | |
num_conv_pos_embeddings=128, | |
num_conv_pos_embedding_groups=16, | |
do_stable_layer_norm=False, | |
apply_spec_augment=True, | |
mask_time_prob=0.05, | |
mask_time_length=10, | |
mask_time_min_masks=2, | |
mask_feature_prob=0.0, | |
mask_feature_length=10, | |
mask_feature_min_masks=0, | |
ctc_loss_reduction="sum", | |
ctc_zero_infinity=False, | |
use_weighted_layer_sum=False, | |
classifier_proj_size=256, | |
pad_token_id=0, | |
bos_token_id=1, | |
eos_token_id=2, | |
feature_extractor_cqt=False, | |
feature_extractor_cqt_bins=336, | |
deepnorm=False, | |
attention_relax=-1.0, | |
**kwargs | |
): | |
super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) | |
self.hidden_size = hidden_size | |
self.feat_extract_norm = feat_extract_norm | |
self.feat_extract_activation = feat_extract_activation | |
self.conv_dim = list(conv_dim) | |
self.conv_stride = list(conv_stride) | |
self.conv_kernel = list(conv_kernel) | |
self.conv_bias = conv_bias | |
self.num_conv_pos_embeddings = num_conv_pos_embeddings | |
self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups | |
self.num_feat_extract_layers = len(self.conv_dim) | |
self.num_hidden_layers = num_hidden_layers | |
self.intermediate_size = intermediate_size | |
self.hidden_act = hidden_act | |
self.num_attention_heads = num_attention_heads | |
self.hidden_dropout = hidden_dropout | |
self.attention_dropout = attention_dropout | |
self.activation_dropout = activation_dropout | |
self.feat_proj_layer_norm = feat_proj_layer_norm | |
self.feat_proj_dropout = feat_proj_dropout | |
self.final_dropout = final_dropout | |
self.layerdrop = layerdrop | |
self.layer_norm_eps = layer_norm_eps | |
self.initializer_range = initializer_range | |
self.vocab_size = vocab_size | |
self.do_stable_layer_norm = do_stable_layer_norm | |
self.use_weighted_layer_sum = use_weighted_layer_sum | |
self.classifier_proj_size = classifier_proj_size | |
if ( | |
(len(self.conv_stride) != self.num_feat_extract_layers) | |
or (len(self.conv_kernel) != self.num_feat_extract_layers) | |
or (len(self.conv_dim) != self.num_feat_extract_layers) | |
): | |
raise ValueError( | |
"Configuration for convolutional layers is incorrect. It is required that `len(config.conv_dim)` ==" | |
" `len(config.conv_stride)` == `len(config.conv_kernel)`, but is `len(config.conv_dim) =" | |
f" {len(self.conv_dim)}`, `len(config.conv_stride) = {len(self.conv_stride)}`," | |
f" `len(config.conv_kernel) = {len(self.conv_kernel)}`." | |
) | |
# fine-tuning config parameters for SpecAugment: https://arxiv.org/abs/1904.08779 | |
self.apply_spec_augment = apply_spec_augment | |
self.mask_time_prob = mask_time_prob | |
self.mask_time_length = mask_time_length | |
self.mask_time_min_masks = mask_time_min_masks | |
self.mask_feature_prob = mask_feature_prob | |
self.mask_feature_length = mask_feature_length | |
self.mask_feature_min_masks = mask_feature_min_masks | |
# ctc loss | |
self.ctc_loss_reduction = ctc_loss_reduction | |
self.ctc_zero_infinity = ctc_zero_infinity | |
# cqt feature extractor | |
self.feature_extractor_cqt = feature_extractor_cqt | |
self.feature_extractor_cqt_bins = feature_extractor_cqt_bins | |
# deepnorm: up-scale weighted residual conection + down-scale initial value transformer encoder | |
self.deepnorm = deepnorm | |
self.attention_relax = attention_relax | |
def inputs_to_logits_ratio(self): | |
return functools.reduce(operator.mul, self.conv_stride, 1) | |