# Supported languages
languages: [
  "af", "am", "ar", "bg", "bn", "de", "el", "en", "es", "et",
  "fa", "fi", "fr", "gu", "hi", "hr", "hu", "id", "is", "it",
  "ja", "kk", "kn", "ko", "ky", "lt", "lv", "mk", "ml", "mr",
  "nl", "or", "pa", "pl", "ps", "pt", "ro", "ru", "rw", "so",
  "sr", "sw", "ta", "te", "tr", "uk", "zh"
]
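# (Two-letter ISO 639-1 codes.)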
# Maximum sequence length for the model
max_length: 256
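# (Measured in subword tokens after tokenization, not characters; longer
# inputs are typically split into chunks before inference.)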
# Punctuation labels applied before a token, if any
pre_labels: [
  "<NULL>",
  "¿"
]
# Punctuation and capitalization labels (post-token)
post_labels: [
  "<NULL>",
  "<ACRONYM>",
  ".",
  ",",
  "?",
  "？",  # fullwidth question mark
  "，",  # fullwidth comma
  "。",  # ideographic full stop
  "、",  # ideographic comma
  "・",  # Katakana middle dot
  "।",   # Devanagari danda
  "؟",   # Arabic question mark
  "،",   # Arabic comma
  ";",
  "።",   # Ethiopic full stop
  "፣",   # Ethiopic comma
  "፧"    # Ethiopic question mark
]
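# Illustrative example (not part of the config): for the input "como estas",
# the punctuation heads would emit pre-label "¿" / post-label "<NULL>" on
# "como" and pre-label "<NULL>" / post-label "?" on "estas", producing
# "¿como estas?" (the capitalization head then handles casing).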
# Combined class labels: the union of pre_labels and post_labels ("<NULL>" listed once)
class_labels: [
  "<NULL>",
  "¿",   # inverted question mark (pre-label)
  "<ACRONYM>",
  ".",
  ",",
  "?",
  "？",  # fullwidth question mark
  "，",  # fullwidth comma
  "。",  # ideographic full stop
  "、",  # ideographic comma
  "・",  # Katakana middle dot
  "।",   # Devanagari danda
  "؟",   # Arabic question mark
  "،",   # Arabic comma
  ";",
  "።",   # Ethiopic full stop
  "፣",   # Ethiopic comma
  "፧"    # Ethiopic question mark
]
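# Sanity check on the sizes above: len(class_labels) == len(pre_labels)
# + len(post_labels) - 1, i.e. 18 == 2 + 17 - 1, since "<NULL>" is shared.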
# Dataset configuration: these keys are required by NeMo's legacy conversion.
dataset:
  pad_label: "<NULL>"          # Label used for padding
  ignore_extra_tokens: false   # Whether to ignore tokens beyond the expected sequence length
  ignore_start_end: false      # Whether to ignore special start/end tokens
# Label ID mappings: here we simply assign sequential integers.
punct_label_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
capit_label_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
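# With sequential ids, id i simply points at the i-th entry of the label list;
# e.g. for punct_label_ids, 0 -> "<NULL>", 1 -> "<ACRONYM>", 2 -> ".",
# matching the 17 post_labels above.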
# Configuration for the punctuation prediction head.
punct_head:
  punct_num_fc_layers: 1
  fc_dropout: 0.1
  activation: "relu"
  use_transformer_init: false
# Configuration for the capitalization prediction head.
capit_head:
  capit_num_fc_layers: 1
  fc_dropout: 0.1
  activation: "relu"
  use_transformer_init: false
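# With one fc layer each, these heads reduce to (roughly) dropout followed by
# a single linear projection from the encoder's hidden size to the label set.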
# Tokenizer: specify the pretrained tokenizer name.
tokenizer: "xlm-roberta-base"
# Language model configuration.
# Instead of a raw string, we provide a dictionary that matches the expected LanguageModelConfig.
language_model:
  pretrained_model_name: "xlm-roberta-base"
  # You can add more parameters here if your setup requires them. For example:
  # model_config: {}  # Optional: any additional configuration for the language model.
# Optimizer configuration (dummy values; adjust if needed).
optim:
  name: "adam"
  lr: 0.001
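# Quick parse check for this file (illustrative; assumes omegaconf is
# installed and the file is saved as config.yaml):
#   from omegaconf import OmegaConf
#   cfg = OmegaConf.load("config.yaml")
#   assert len(cfg.punct_label_ids) == len(cfg.post_labels)  # 17 == 17
#   assert cfg.language_model.pretrained_model_name == cfg.tokenizer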