File size: 2,339 Bytes
564c4b7
9d5ee77
5a9878b
 
 
 
 
 
 
564c4b7
5a9878b
 
564c4b7
5a9878b
 
9d5ee77
5a9878b
 
564c4b7
5a9878b
 
 
 
 
 
 
9d5ee77
5a9878b
 
 
 
 
 
 
 
 
9d5ee77
ddf501b
 
564c4b7
1e77852
 
 
 
 
 
 
 
9d5ee77
1e77852
 
 
 
 
 
 
 
 
 
 
 
564c4b7
9d5ee77
564c4b7
 
 
9d5ee77
564c4b7
9d5ee77
 
 
564c4b7
9d5ee77
 
 
 
 
 
564c4b7
9d5ee77
 
 
 
 
 
564c4b7
9d5ee77
1e77852
564c4b7
 
 
 
 
 
 
 
 
9d5ee77
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# Supported languages (ISO 639-1 codes).
languages:
  - "af"
  - "am"
  - "ar"
  - "bg"
  - "bn"
  - "de"
  - "el"
  - "en"
  - "es"
  - "et"
  - "fa"
  - "fi"
  - "fr"
  - "gu"
  - "hi"
  - "hr"
  - "hu"
  - "id"
  - "is"
  - "it"
  - "ja"
  - "kk"
  - "kn"
  - "ko"
  - "ky"
  - "lt"
  - "lv"
  - "mk"
  - "ml"
  - "mr"
  - "nl"
  - "or"
  - "pa"
  - "pl"
  - "ps"
  - "pt"
  - "ro"
  - "ru"
  - "rw"
  - "so"
  - "sr"
  - "sw"
  - "ta"
  - "te"
  - "tr"
  - "uk"
  - "zh"

# Maximum sequence length (in subword tokens) accepted by the model.
max_length: 256

# Labels that may be inserted before a token (e.g. Spanish inverted
# punctuation); "<NULL>" means no pre-token punctuation.
pre_labels:
  - "<NULL>"
  - "¿"

# Punctuation labels predicted after each token; "<NULL>" means no
# punctuation, "<ACRONYM>" marks acronym-internal periods.
# NOTE(review): the original list contained "?" twice and "،" twice —
# duplicate labels make the label -> id mapping ambiguous. The second "?"
# and the first "،" are replaced with the fullwidth forms "？" (U+FF1F)
# and "，" (U+FF0C), matching the multilingual label sets this config
# appears to target — confirm against the trained checkpoint's label order.
post_labels:
  - "<NULL>"
  - "<ACRONYM>"
  - "."
  - ","
  - "?"
  - "？"  # fullwidth question mark (U+FF1F); was a duplicate "?"
  - "，"  # fullwidth comma (U+FF0C); was a duplicate "،"
  - "。"  # ideographic full stop
  - "、"  # ideographic comma
  - "・"  # katakana middle dot
  - "।"  # devanagari danda
  - "؟"  # arabic question mark
  - "،"  # arabic comma
  - ";"
  - "።"  # ethiopic full stop
  - "፣"  # ethiopic comma
  - "፧"  # ethiopic question mark

# Combined list of class labels (union of pre- and post-token labels).
# NOTE(review): the original list contained "?" twice and "،" twice —
# duplicate labels make any label -> id mapping ambiguous. The second "?"
# and the first "،" are replaced with the fullwidth forms "？" (U+FF1F)
# and "，" (U+FF0C) — confirm against the trained checkpoint's label order.
class_labels:
  - "<NULL>"
  - "¿"
  - "<ACRONYM>"
  - "."
  - ","
  - "?"
  - "？"  # fullwidth question mark (U+FF1F); was a duplicate "?"
  - "，"  # fullwidth comma (U+FF0C); was a duplicate "،"
  - "。"  # ideographic full stop
  - "、"  # ideographic comma
  - "・"  # katakana middle dot
  - "।"  # devanagari danda
  - "؟"  # arabic question mark
  - "،"  # arabic comma
  - ";"
  - "።"  # ethiopic full stop
  - "፣"  # ethiopic comma
  - "፧"  # ethiopic question mark

# Dataset configuration: these keys are required by NeMo's legacy conversion.
dataset:
  # Label used for padding positions.
  pad_label: "<NULL>"
  # Whether to ignore tokens beyond the expected sequence length.
  ignore_extra_tokens: false
  # Whether to ignore special start/end tokens.
  ignore_start_end: false

# Label ID mappings: here we simply assign sequential integers.
# The 17 ids (0-16) line up positionally with the 17 post_labels entries.
# NOTE(review): capit_label_ids reuses the identical 17-entry list —
# presumably a placeholder, since capitalization typically needs far fewer
# classes; verify against whatever consumes this config.
punct_label_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
capit_label_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]

# Configuration for the punctuation prediction head.
punct_head:
  punct_num_fc_layers: 1       # number of fully-connected layers in the head
  fc_dropout: 0.1              # dropout rate applied inside the head
  activation: "relu"           # activation between FC layers
  use_transformer_init: false  # keep default (non-transformer) weight init

# Configuration for the capitalization prediction head.
# Mirrors punct_head so both heads are trained with the same hyperparameters.
capit_head:
  capit_num_fc_layers: 1       # number of fully-connected layers in the head
  fc_dropout: 0.1              # dropout rate applied inside the head
  activation: "relu"           # activation between FC layers
  use_transformer_init: false  # keep default (non-transformer) weight init

# Tokenizer: name of the pretrained tokenizer (HuggingFace identifier);
# kept in sync with language_model.pretrained_model_name below.
tokenizer: "xlm-roberta-base"

# Language model configuration.
# A mapping (not a raw string) so it matches the expected LanguageModelConfig.
language_model:
  pretrained_model_name: "xlm-roberta-base"
  # Additional parameters can be added here if your setup requires them.
  # For example:
  # model_config: {}   # Optional: extra configuration for the language model.

# Optimizer configuration (dummy values; adjust if needed).
# Only referenced if this config is used for (re-)training, not inference.
optim:
  name: "adam"  # optimizer identifier
  lr: 0.001     # learning rate