---
# Model configuration, restored to one key per line. In the collapsed form the
# whole file sat behind a leading `#`, so a YAML parser saw no keys at all.

# Supported languages (two-letter language codes).
languages:
  [
    "af", "am", "ar", "bg", "bn", "de", "el", "en", "es", "et", "fa", "fi",
    "fr", "gu", "hi", "hr", "hu", "id", "is", "it", "ja", "kk", "kn", "ko",
    "ky", "lt", "lv", "mk", "ml", "mr", "nl", "or", "pa", "pl", "ps", "pt",
    "ro", "ru", "rw", "so", "sr", "sw", "ta", "te", "tr", "uk", "zh"
  ]

# Maximum sequence length for the model.
max_length: 256

# Labels that might appear before punctuation (if any).
pre_labels:
  - ""
  - "¿"

# Punctuation and capitalization labels (post-token).
# NOTE(review): the first two entries are both "", and "?" and "،" each appear
# to be listed twice. Duplicate labels cannot be told apart by value; confirm
# against the label set the model was trained with before relying on them.
post_labels:
  - ""
  - ""
  - "."
  - ","
  - "?"
  - "?"
  - "،"
  - "。"
  - "、"
  - "・"
  - "।"
  - "؟"
  - "،"
  - ";"
  - "።"
  - "፣"
  - "፧"

# Combined list of class labels (typically similar to pre/post labels).
# As written this is the two pre_labels followed by post_labels minus its
# first entry, so it carries the same apparent duplicates noted above.
class_labels:
  - ""
  - "¿"
  - ""
  - "."
  - ","
  - "?"
  - "?"
  - "،"
  - "。"
  - "、"
  - "・"
  - "।"
  - "؟"
  - "،"
  - ";"
  - "።"
  - "፣"
  - "፧"

# Dataset configuration: these keys are required by NeMo's legacy conversion.
dataset:
  # Label used for padding.
  pad_label: ""
  # Whether to ignore tokens beyond the expected sequence length.
  ignore_extra_tokens: false
  # Whether to ignore special start/end tokens.
  ignore_start_end: false

# Label ID mappings: here we simply assign sequential integers.
# Both lists hold 17 IDs (0-16), the same count as post_labels.
# NOTE(review): the collapsed source did not show whether these two keys sit
# at the top level or under `dataset`; placed at the top level here - confirm
# against the consumer's schema.
punct_label_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
capit_label_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]

# Configuration for the punctuation prediction head.
punct_head:
  punct_num_fc_layers: 1
  fc_dropout: 0.1
  activation: "relu"
  use_transformer_init: false

# Configuration for the capitalization prediction head.
capit_head:
  capit_num_fc_layers: 1
  fc_dropout: 0.1
  activation: "relu"
  use_transformer_init: false

# Tokenizer: specify the pretrained tokenizer name.
tokenizer: "xlm-roberta-base"

# Language model configuration.
# Instead of a raw string, we provide a dictionary that matches the expected
# LanguageModelConfig.
language_model:
  pretrained_model_name: "xlm-roberta-base"
  # You can add more parameters here if your setup requires them.
# For example, an optional key nested under language_model:
#   model_config: {}
# (any additional configuration for the language model).

# Optimizer configuration (dummy values; adjust if needed).
# Written one key per line: in the collapsed form the leading `#` turned the
# whole `optim` mapping into comment text.
optim:
  name: "adam"
  lr: 0.001