Salama1429 committed (verified)

Commit 564c4b7 · 1 Parent(s): 9d5ee77

Update model_config.yaml

Files changed (1)
  1. model_config.yaml +22 -16
model_config.yaml CHANGED
@@ -1,4 +1,4 @@
-# List of supported languages
+# Supported languages
 languages: [
   "af", "am", "ar", "bg", "bn", "de", "el", "en", "es", "et",
   "fa", "fi", "fr", "gu", "hi", "hr", "hu", "id", "is", "it",
@@ -7,16 +7,16 @@ languages: [
   "sr", "sw", "ta", "te", "tr", "uk", "zh"
 ]

-# Maximum sequence length
+# Maximum sequence length for the model
 max_length: 256

-# Labels that come before punctuation (if any)
+# Labels that might appear before punctuation (if any)
 pre_labels: [
   "<NULL>",
   "¿"
 ]

-# Labels for punctuation and capitalization (post-token labels)
+# Punctuation and capitalization labels (post-token)
 post_labels: [
   "<NULL>",
   "<ACRONYM>",
@@ -37,7 +37,7 @@ post_labels: [
   "፧"
 ]

-# A combined list of class labels.
+# Combined list of class labels (typically similar to pre/post labels)
 class_labels: [
   "<NULL>",
   "¿",
@@ -59,36 +59,42 @@ class_labels: [
   "፧"
 ]

-# Dataset configuration (dummy/default values)
+# Dataset configuration: these keys are required by NeMo's legacy conversion.
 dataset:
-  pad_label: "<NULL>"         # The label used for padding
-  ignore_extra_tokens: false  # Whether to ignore extra tokens
-  ignore_start_end: false     # Whether to ignore start/end tokens
+  pad_label: "<NULL>"         # Label used for padding
+  ignore_extra_tokens: false  # Whether to ignore tokens beyond the expected sequence length
+  ignore_start_end: false     # Whether to ignore special start/end tokens

-# Label ID mappings for punctuation and capitalization.
-# Here, we simply map each class in order (0, 1, 2, …).
+# Label ID mappings: here we simply assign sequential integers.
 punct_label_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
 capit_label_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]

-# Head configuration for punctuation prediction.
+# Configuration for the punctuation prediction head.
 punct_head:
   punct_num_fc_layers: 1
   fc_dropout: 0.1
   activation: "relu"
   use_transformer_init: false

-# Head configuration for capitalization prediction.
+# Configuration for the capitalization prediction head.
 capit_head:
   capit_num_fc_layers: 1
   fc_dropout: 0.1
   activation: "relu"
   use_transformer_init: false

-# Tokenizer and language model to be used.
+# Tokenizer: specify the pretrained tokenizer name.
 tokenizer: "xlm-roberta-base"
-language_model: "xlm-roberta-base"

-# Optimizer configuration (dummy/default values).
+# Language model configuration.
+# Instead of a raw string, we provide a dictionary that matches the expected LanguageModelConfig.
+language_model:
+  pretrained_model_name: "xlm-roberta-base"
+  # You can add more parameters here if your setup requires them.
+  # For example:
+  # model_config: {}  # Optional: any additional configuration for the language model.
+
+# Optimizer configuration (dummy values; adjust if needed).
 optim:
   name: "adam"
   lr: 0.001
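The substantive change in this commit is the `language_model` field: it moves from a bare string to a nested mapping, so that loaders expecting a `LanguageModelConfig`-style structure can read `pretrained_model_name` from it. Below is a minimal sketch of how a consumer might normalize both shapes; the `load_model_config` helper, the default path, and the length check are illustrative assumptions, not code from this repository.

```python
# Illustrative sketch (not part of this repo): load model_config.yaml and
# accept both the old (string) and new (mapping) shapes of `language_model`.
# Requires PyYAML (`pip install pyyaml`).
import yaml

def load_model_config(path: str = "model_config.yaml") -> dict:
    with open(path, encoding="utf-8") as f:
        cfg = yaml.safe_load(f)

    lm = cfg.get("language_model")
    if isinstance(lm, str):
        # Old shape: language_model: "xlm-roberta-base"
        cfg["language_model"] = {"pretrained_model_name": lm}
    # New shape already matches: {pretrained_model_name: "xlm-roberta-base", ...}

    # Illustrative consistency check: the 17 post-token labels should line
    # up with the 17 sequential IDs in punct_label_ids.
    if len(cfg["post_labels"]) != len(cfg["punct_label_ids"]):
        raise ValueError("post_labels and punct_label_ids lengths differ")
    return cfg

if __name__ == "__main__":
    cfg = load_model_config()
    print(cfg["language_model"]["pretrained_model_name"])  # xlm-roberta-base
```

In a NeMo-style pipeline this dictionary would typically be wrapped in an OmegaConf `DictConfig` before being handed to the model, but plain `yaml.safe_load` is enough to show why the nested shape parses into the structure the config comment describes.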