# NOTE: the three lines below are residue from the hosting site's file-page UI,
# not part of the configuration; kept as comments so the YAML parses.
# Salama1429's picture
# Update model_config.yaml
# 564c4b7 verified
# ISO 639-1 codes of the languages this model supports.
languages:
  - "af"
  - "am"
  - "ar"
  - "bg"
  - "bn"
  - "de"
  - "el"
  - "en"
  - "es"
  - "et"
  - "fa"
  - "fi"
  - "fr"
  - "gu"
  - "hi"
  - "hr"
  - "hu"
  - "id"
  - "is"
  - "it"
  - "ja"
  - "kk"
  - "kn"
  - "ko"
  - "ky"
  - "lt"
  - "lv"
  - "mk"
  - "ml"
  - "mr"
  - "nl"
  - "or"
  - "pa"
  - "pl"
  - "ps"
  - "pt"
  - "ro"
  - "ru"
  - "rw"
  - "so"
  - "sr"
  - "sw"
  - "ta"
  - "te"
  - "tr"
  - "uk"
  - "zh"

# Maximum token sequence length the model accepts.
max_length: 256
# Labels that may be inserted BEFORE a token.
# Currently only the null label and the inverted question mark "¿".
pre_labels:
  - "<NULL>"
  - "¿"
# Punctuation and capitalization labels predicted AFTER a token.
# NOTE(review): the original list contained "?" twice and "،" twice; duplicate
# class labels are meaningless, and the upstream model config uses the
# full-width "？" (U+FF1F) and full-width "，" (U+FF0C) for CJK text at those
# positions — restored here so all 17 labels are distinct and line up with the
# 17 ids in punct_label_ids.
post_labels:
  - "<NULL>"
  - "<ACRONYM>"
  - "."
  - ","
  - "?"
  - "？"  # full-width question mark (CJK)
  - "，"  # full-width comma (CJK)
  - "。"  # ideographic full stop
  - "、"  # ideographic comma
  - "・"  # katakana middle dot
  - "।"  # devanagari danda
  - "؟"  # arabic question mark
  - "،"  # arabic comma
  - ";"
  - "።"  # ethiopic full stop
  - "፣"  # ethiopic comma
  - "፧"  # ethiopic question mark
# Combined list of class labels (union of pre_labels and post_labels).
# NOTE(review): same duplicate-label fix as post_labels — the second "?" and
# the first "،" were almost certainly mojibake for the full-width "？" (U+FF1F)
# and "，" (U+FF0C); restored so every label is distinct.
class_labels:
  - "<NULL>"
  - "¿"
  - "<ACRONYM>"
  - "."
  - ","
  - "?"
  - "？"  # full-width question mark (CJK)
  - "，"  # full-width comma (CJK)
  - "。"  # ideographic full stop
  - "、"  # ideographic comma
  - "・"  # katakana middle dot
  - "।"  # devanagari danda
  - "؟"  # arabic question mark
  - "،"  # arabic comma
  - ";"
  - "።"  # ethiopic full stop
  - "፣"  # ethiopic comma
  - "፧"  # ethiopic question mark
# Dataset configuration: these keys are required by NeMo's legacy conversion.
# NOTE(review): the child keys were flush-left in the original, which makes
# `dataset:` parse as null and turns pad_label/ignore_* into unrelated
# top-level keys — they must be indented under `dataset:`.
dataset:
  pad_label: "<NULL>"          # Label used for padding
  ignore_extra_tokens: false   # Whether to ignore tokens beyond the expected sequence length
  ignore_start_end: false      # Whether to ignore special start/end tokens

# Label ID mappings: here we simply assign sequential integers,
# one id per label (17 labels -> ids 0..16).
punct_label_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
capit_label_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
# Configuration for the punctuation prediction head.
# NOTE(review): child keys were flush-left in the original, leaving
# `punct_head:` null — they must be indented under it.
punct_head:
  punct_num_fc_layers: 1        # number of fully-connected layers in the head
  fc_dropout: 0.1               # dropout probability inside the head
  activation: "relu"
  use_transformer_init: false
# Configuration for the capitalization prediction head.
# NOTE(review): child keys were flush-left in the original, leaving
# `capit_head:` null — they must be indented under it.
capit_head:
  capit_num_fc_layers: 1        # number of fully-connected layers in the head
  fc_dropout: 0.1               # dropout probability inside the head
  activation: "relu"
  use_transformer_init: false
# Tokenizer: specify the pretrained tokenizer name.
tokenizer: "xlm-roberta-base"

# Language model configuration.
# Instead of a raw string, we provide a mapping that matches the expected
# LanguageModelConfig.
# NOTE(review): `pretrained_model_name` was flush-left in the original,
# leaving `language_model:` null — it must be indented under it.
language_model:
  pretrained_model_name: "xlm-roberta-base"
  # You can add more parameters here if your setup requires them, e.g.:
  # model_config: {}  # optional extra configuration for the language model

# Optimizer configuration (dummy values; adjust if needed).
# NOTE(review): same indentation fix — `name`/`lr` belong under `optim:`.
optim:
  name: "adam"
  lr: 0.001