File size: 2,339 Bytes
564c4b7
9d5ee77
5a9878b
 
 
 
 
 
 
564c4b7
5a9878b
 
564c4b7
5a9878b
 
9d5ee77
5a9878b
 
564c4b7
5a9878b
 
 
 
 
 
 
9d5ee77
5a9878b
 
 
 
 
 
 
 
 
9d5ee77
ddf501b
 
564c4b7
1e77852
 
 
 
 
 
 
 
9d5ee77
1e77852
 
 
 
 
 
 
 
 
 
 
 
564c4b7
9d5ee77
564c4b7
 
 
9d5ee77
564c4b7
9d5ee77
 
 
564c4b7
9d5ee77
 
 
 
 
 
564c4b7
9d5ee77
 
 
 
 
 
564c4b7
9d5ee77
1e77852
564c4b7
 
 
 
 
 
 
 
 
9d5ee77
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# Supported languages (ISO 639-1 codes).
languages:
  - "af"
  - "am"
  - "ar"
  - "bg"
  - "bn"
  - "de"
  - "el"
  - "en"
  - "es"
  - "et"
  - "fa"
  - "fi"
  - "fr"
  - "gu"
  - "hi"
  - "hr"
  - "hu"
  - "id"
  - "is"
  - "it"
  - "ja"
  - "kk"
  - "kn"
  - "ko"
  - "ky"
  - "lt"
  - "lv"
  - "mk"
  - "ml"
  - "mr"
  - "nl"
  - "or"
  - "pa"
  - "pl"
  - "ps"
  - "pt"
  - "ro"
  - "ru"
  - "rw"
  - "so"
  - "sr"
  - "sw"
  - "ta"
  - "te"
  - "tr"
  - "uk"
  - "zh"

# Maximum sequence length (in subword tokens) accepted by the model.
max_length: 256

# Labels that may be inserted before a token (e.g. Spanish inverted
# punctuation); "<NULL>" means no pre-token punctuation.
pre_labels:
  - "<NULL>"
  - "¿"

# Punctuation labels predicted after each token; "<NULL>" means no
# punctuation, "<ACRONYM>" marks acronym-internal periods.
# NOTE(review): the original list contained "?" twice and "،" twice —
# duplicate labels make the label -> id mapping ambiguous. The second "?"
# and the first "،" are replaced with the fullwidth forms "？" (U+FF1F)
# and "，" (U+FF0C), matching the multilingual label sets this config
# appears to target — confirm against the trained checkpoint's label order.
post_labels:
  - "<NULL>"
  - "<ACRONYM>"
  - "."
  - ","
  - "?"
  - "？"  # fullwidth question mark (U+FF1F); was a duplicate "?"
  - "，"  # fullwidth comma (U+FF0C); was a duplicate "،"
  - "。"  # ideographic full stop
  - "、"  # ideographic comma
  - "・"  # katakana middle dot
  - "।"  # devanagari danda
  - "؟"  # arabic question mark
  - "،"  # arabic comma
  - ";"
  - "።"  # ethiopic full stop
  - "፣"  # ethiopic comma
  - "፧"  # ethiopic question mark

# Combined list of class labels (union of pre- and post-token labels).
# NOTE(review): the original list contained "?" twice and "،" twice —
# duplicate labels make any label -> id mapping ambiguous. The second "?"
# and the first "،" are replaced with the fullwidth forms "？" (U+FF1F)
# and "，" (U+FF0C) — confirm against the trained checkpoint's label order.
class_labels:
  - "<NULL>"
  - "¿"
  - "<ACRONYM>"
  - "."
  - ","
  - "?"
  - "？"  # fullwidth question mark (U+FF1F); was a duplicate "?"
  - "，"  # fullwidth comma (U+FF0C); was a duplicate "،"
  - "。"  # ideographic full stop
  - "、"  # ideographic comma
  - "・"  # katakana middle dot
  - "।"  # devanagari danda
  - "؟"  # arabic question mark
  - "،"  # arabic comma
  - ";"
  - "።"  # ethiopic full stop
  - "፣"  # ethiopic comma
  - "፧"  # ethiopic question mark

# Dataset configuration: these keys are required by NeMo's legacy conversion.
dataset:
  # Label used for padding positions.
  pad_label: "<NULL>"
  # Whether to ignore tokens beyond the expected sequence length.
  ignore_extra_tokens: false
  # Whether to ignore special start/end tokens.
  ignore_start_end: false

# Label ID mappings: here we simply assign sequential integers.
# The 17 ids (0-16) line up positionally with the 17 post_labels entries.
# NOTE(review): capit_label_ids reuses the identical 17-entry list —
# presumably a placeholder, since capitalization typically needs far fewer
# classes; verify against whatever consumes this config.
punct_label_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
capit_label_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]

# Configuration for the punctuation prediction head.
punct_head:
  punct_num_fc_layers: 1       # number of fully-connected layers in the head
  fc_dropout: 0.1              # dropout rate applied inside the head
  activation: "relu"           # activation between FC layers
  use_transformer_init: false  # keep default (non-transformer) weight init

# Configuration for the capitalization prediction head.
# Mirrors punct_head so both heads are trained with the same hyperparameters.
capit_head:
  capit_num_fc_layers: 1       # number of fully-connected layers in the head
  fc_dropout: 0.1              # dropout rate applied inside the head
  activation: "relu"           # activation between FC layers
  use_transformer_init: false  # keep default (non-transformer) weight init

# Tokenizer: name of the pretrained tokenizer (HuggingFace identifier);
# kept in sync with language_model.pretrained_model_name below.
tokenizer: "xlm-roberta-base"

# Language model configuration.
# A mapping (not a raw string) so it matches the expected LanguageModelConfig.
language_model:
  pretrained_model_name: "xlm-roberta-base"
  # Additional parameters can be added here if your setup requires them.
  # For example:
  # model_config: {}   # Optional: extra configuration for the language model.

# Optimizer configuration (dummy values; adjust if needed).
# Only referenced if this config is used for (re-)training, not inference.
optim:
  name: "adam"  # optimizer identifier
  lr: 0.001     # learning rate