Update model_config.yaml
Browse files- model_config.yaml +22 -16
model_config.yaml
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
#
|
2 |
languages: [
|
3 |
"af", "am", "ar", "bg", "bn", "de", "el", "en", "es", "et",
|
4 |
"fa", "fi", "fr", "gu", "hi", "hr", "hu", "id", "is", "it",
|
@@ -7,16 +7,16 @@ languages: [
|
|
7 |
"sr", "sw", "ta", "te", "tr", "uk", "zh"
|
8 |
]
|
9 |
|
10 |
-
# Maximum sequence length
|
11 |
max_length: 256
|
12 |
|
13 |
-
# Labels that
|
14 |
pre_labels: [
|
15 |
"<NULL>",
|
16 |
"¿"
|
17 |
]
|
18 |
|
19 |
-
#
|
20 |
post_labels: [
|
21 |
"<NULL>",
|
22 |
"<ACRONYM>",
|
@@ -37,7 +37,7 @@ post_labels: [
|
|
37 |
"፧"
|
38 |
]
|
39 |
|
40 |
-
#
|
41 |
class_labels: [
|
42 |
"<NULL>",
|
43 |
"¿",
|
@@ -59,36 +59,42 @@ class_labels: [
|
|
59 |
"፧"
|
60 |
]
|
61 |
|
62 |
-
# Dataset configuration
|
63 |
dataset:
|
64 |
-
pad_label: "<NULL>"
|
65 |
-
ignore_extra_tokens: false
|
66 |
-
ignore_start_end: false
|
67 |
|
68 |
-
# Label ID mappings
|
69 |
-
# Here, we simply map each class in order (0, 1, 2, …).
|
70 |
punct_label_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
|
71 |
capit_label_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
|
72 |
|
73 |
-
#
|
74 |
punct_head:
|
75 |
punct_num_fc_layers: 1
|
76 |
fc_dropout: 0.1
|
77 |
activation: "relu"
|
78 |
use_transformer_init: false
|
79 |
|
80 |
-
#
|
81 |
capit_head:
|
82 |
capit_num_fc_layers: 1
|
83 |
fc_dropout: 0.1
|
84 |
activation: "relu"
|
85 |
use_transformer_init: false
|
86 |
|
87 |
-
# Tokenizer
|
88 |
tokenizer: "xlm-roberta-base"
|
89 |
-
language_model: "xlm-roberta-base"
|
90 |
|
91 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
optim:
|
93 |
name: "adam"
|
94 |
lr: 0.001
|
|
|
1 |
+
# Supported languages
|
2 |
languages: [
|
3 |
"af", "am", "ar", "bg", "bn", "de", "el", "en", "es", "et",
|
4 |
"fa", "fi", "fr", "gu", "hi", "hr", "hu", "id", "is", "it",
|
|
|
7 |
"sr", "sw", "ta", "te", "tr", "uk", "zh"
|
8 |
]
|
9 |
|
10 |
+
# Maximum sequence length for the model
|
11 |
max_length: 256
|
12 |
|
13 |
+
# Punctuation labels that may appear before a token (e.g. the inverted question mark '¿' in Spanish)
|
14 |
pre_labels: [
|
15 |
"<NULL>",
|
16 |
"¿"
|
17 |
]
|
18 |
|
19 |
+
# Punctuation and capitalization labels (post-token)
|
20 |
post_labels: [
|
21 |
"<NULL>",
|
22 |
"<ACRONYM>",
|
|
|
37 |
"፧"
|
38 |
]
|
39 |
|
40 |
+
# Combined list of class labels (typically similar to pre/post labels)
|
41 |
class_labels: [
|
42 |
"<NULL>",
|
43 |
"¿",
|
|
|
59 |
"፧"
|
60 |
]
|
61 |
|
62 |
+
# Dataset configuration: these keys are required by NeMo's legacy conversion.
|
63 |
dataset:
|
64 |
+
pad_label: "<NULL>" # Label used for padding
|
65 |
+
ignore_extra_tokens: false  # Whether to mask out extra (non-first) subword tokens in the loss — confirm against NeMo dataset docs
|
66 |
+
ignore_start_end: false # Whether to ignore special start/end tokens
|
67 |
|
68 |
+
# Label ID mappings: here we simply assign sequential integers.
|
|
|
69 |
punct_label_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
|
70 |
capit_label_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
|
71 |
|
72 |
+
# Configuration for the punctuation prediction head.
|
73 |
punct_head:
|
74 |
punct_num_fc_layers: 1
|
75 |
fc_dropout: 0.1
|
76 |
activation: "relu"
|
77 |
use_transformer_init: false
|
78 |
|
79 |
+
# Configuration for the capitalization prediction head.
|
80 |
capit_head:
|
81 |
capit_num_fc_layers: 1
|
82 |
fc_dropout: 0.1
|
83 |
activation: "relu"
|
84 |
use_transformer_init: false
|
85 |
|
86 |
+
# Tokenizer: specify the pretrained tokenizer name.
|
87 |
tokenizer: "xlm-roberta-base"
|
|
|
88 |
|
89 |
+
# Language model configuration.
|
90 |
+
# Instead of a raw string, we provide a dictionary that matches the expected LanguageModelConfig.
|
91 |
+
language_model:
|
92 |
+
pretrained_model_name: "xlm-roberta-base"
|
93 |
+
# You can add more parameters here if your setup requires them.
|
94 |
+
# For example:
|
95 |
+
# model_config: {} # Optional: any additional configuration for the language model.
|
96 |
+
|
97 |
+
# Optimizer configuration (placeholder values; adjust to your training setup).
|
98 |
optim:
|
99 |
name: "adam"
|
100 |
lr: 0.001
|