Update model_config.yaml
Browse files- model_config.yaml +22 -16
model_config.yaml
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
#
|
2 |
languages: [
|
3 |
"af", "am", "ar", "bg", "bn", "de", "el", "en", "es", "et",
|
4 |
"fa", "fi", "fr", "gu", "hi", "hr", "hu", "id", "is", "it",
|
@@ -7,16 +7,16 @@ languages: [
|
|
7 |
"sr", "sw", "ta", "te", "tr", "uk", "zh"
|
8 |
]
|
9 |
|
10 |
-
# Maximum sequence length
|
11 |
max_length: 256
|
12 |
|
13 |
-
# Labels that
|
14 |
pre_labels: [
|
15 |
"<NULL>",
|
16 |
"¿"
|
17 |
]
|
18 |
|
19 |
-
#
|
20 |
post_labels: [
|
21 |
"<NULL>",
|
22 |
"<ACRONYM>",
|
@@ -37,7 +37,7 @@ post_labels: [
|
|
37 |
"፧"
|
38 |
]
|
39 |
|
40 |
-
#
|
41 |
class_labels: [
|
42 |
"<NULL>",
|
43 |
"¿",
|
@@ -59,36 +59,42 @@ class_labels: [
|
|
59 |
"፧"
|
60 |
]
|
61 |
|
62 |
-
# Dataset configuration
|
63 |
dataset:
|
64 |
-
pad_label: "<NULL>"
|
65 |
-
ignore_extra_tokens: false
|
66 |
-
ignore_start_end: false
|
67 |
|
68 |
-
# Label ID mappings
|
69 |
-
# Here, we simply map each class in order (0, 1, 2, …).
|
70 |
punct_label_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
|
71 |
capit_label_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
|
72 |
|
73 |
-
#
|
74 |
punct_head:
|
75 |
punct_num_fc_layers: 1
|
76 |
fc_dropout: 0.1
|
77 |
activation: "relu"
|
78 |
use_transformer_init: false
|
79 |
|
80 |
-
#
|
81 |
capit_head:
|
82 |
capit_num_fc_layers: 1
|
83 |
fc_dropout: 0.1
|
84 |
activation: "relu"
|
85 |
use_transformer_init: false
|
86 |
|
87 |
-
# Tokenizer
|
88 |
tokenizer: "xlm-roberta-base"
|
89 |
-
language_model: "xlm-roberta-base"
|
90 |
|
91 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
optim:
|
93 |
name: "adam"
|
94 |
lr: 0.001
|
|
|
1 |
+
# Supported languages
|
2 |
languages: [
|
3 |
"af", "am", "ar", "bg", "bn", "de", "el", "en", "es", "et",
|
4 |
"fa", "fi", "fr", "gu", "hi", "hr", "hu", "id", "is", "it",
|
|
|
7 |
"sr", "sw", "ta", "te", "tr", "uk", "zh"
|
8 |
]
|
9 |
|
10 |
+
# Maximum sequence length for the model
|
11 |
max_length: 256
|
12 |
|
13 |
+
# Punctuation labels that may appear before a token (e.g. the inverted question mark '¿' in Spanish)
|
14 |
pre_labels: [
|
15 |
"<NULL>",
|
16 |
"¿"
|
17 |
]
|
18 |
|
19 |
+
# Punctuation and capitalization labels (post-token)
|
20 |
post_labels: [
|
21 |
"<NULL>",
|
22 |
"<ACRONYM>",
|
|
|
37 |
"፧"
|
38 |
]
|
39 |
|
40 |
+
# Combined list of class labels (typically similar to pre/post labels)
|
41 |
class_labels: [
|
42 |
"<NULL>",
|
43 |
"¿",
|
|
|
59 |
"፧"
|
60 |
]
|
61 |
|
62 |
+
# Dataset configuration: these keys are required by NeMo's legacy conversion.
|
63 |
dataset:
|
64 |
+
pad_label: "<NULL>" # Label used for padding
|
65 |
+
ignore_extra_tokens: false  # Whether to mask out extra (non-first) subword tokens in the loss — confirm against NeMo dataset docs
|
66 |
+
ignore_start_end: false # Whether to ignore special start/end tokens
|
67 |
|
68 |
+
# Label ID mappings: here we simply assign sequential integers.
|
|
|
69 |
punct_label_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
|
70 |
capit_label_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
|
71 |
|
72 |
+
# Configuration for the punctuation prediction head.
|
73 |
punct_head:
|
74 |
punct_num_fc_layers: 1
|
75 |
fc_dropout: 0.1
|
76 |
activation: "relu"
|
77 |
use_transformer_init: false
|
78 |
|
79 |
+
# Configuration for the capitalization prediction head.
|
80 |
capit_head:
|
81 |
capit_num_fc_layers: 1
|
82 |
fc_dropout: 0.1
|
83 |
activation: "relu"
|
84 |
use_transformer_init: false
|
85 |
|
86 |
+
# Tokenizer: specify the pretrained tokenizer name.
|
87 |
tokenizer: "xlm-roberta-base"
|
|
|
88 |
|
89 |
+
# Language model configuration.
|
90 |
+
# Instead of a raw string, we provide a dictionary that matches the expected LanguageModelConfig.
|
91 |
+
language_model:
|
92 |
+
pretrained_model_name: "xlm-roberta-base"
|
93 |
+
# You can add more parameters here if your setup requires them.
|
94 |
+
# For example:
|
95 |
+
# model_config: {} # Optional: any additional configuration for the language model.
|
96 |
+
|
97 |
+
# Optimizer configuration (placeholder values; adjust to your training setup).
|
98 |
optim:
|
99 |
name: "adam"
|
100 |
lr: 0.001
|