# Language codes covered by this configuration (47 two-letter codes).
# Kept as a wrapped flow list because the entries are short, atomic leaves.
languages: [
  "af", "am", "ar", "bg", "bn", "de", "el", "en", "es", "et",
  "fa", "fi", "fr", "gu", "hi", "hr", "hu", "id", "is", "it",
  "ja", "kk", "kn", "ko", "ky", "lt", "lv", "mk", "ml", "mr",
  "nl", "or", "pa", "pl", "ps", "pt", "ro", "ru", "rw", "so",
  "sr", "sw", "ta", "te", "tr", "uk", "zh"
]

# NOTE(review): presumably the maximum input sequence length; the unit
# (tokens vs. characters) is not visible here — confirm with the consumer.
max_length: 256

# Label set `pre_labels`: the "<NULL>" placeholder plus the inverted
# question mark.
# NOTE(review): presumably punctuation predicted *before* a token — confirm.
pre_labels: [
  "<NULL>",
  "¿"
]

# Label set `post_labels` (17 entries): "<NULL>", "<ACRONYM>", and
# punctuation marks from several scripts.
# NOTE(review): presumably punctuation predicted *after* a token — confirm.
# The order of entries is preserved exactly; do not reorder, since the
# position of a label may be significant to the consumer.
post_labels: [
  "<NULL>",
  "<ACRONYM>",
  ".",
  ",",
  "?",
  # NOTE(review): exact duplicate of the "?" entry above; presumably a
  # distinct variant (e.g. full-width "？") was normalized away — confirm
  # against the upstream config before relying on this list.
  "?",
  # NOTE(review): the same "،" appears again further down this list; one
  # of the two was presumably a different comma (e.g. full-width "，")
  # — confirm.
  "،",
  "。",
  "、",
  "・",
  "।",
  "؟",
  "،",
  ";",
  "።",
  "፣",
  "፧"
]

# Label set `class_labels` (18 entries): "<NULL>", then the non-null
# `pre_labels` entry ("¿"), then the `post_labels` entries from
# "<ACRONYM>" onward, in the same order.
# The order of entries is preserved exactly; do not reorder, since the
# position of a label may be significant to the consumer.
class_labels: [
  "<NULL>",
  "¿",
  "<ACRONYM>",
  ".",
  ",",
  "?",
  # NOTE(review): exact duplicate of the "?" entry above; presumably a
  # distinct variant (e.g. full-width "？") was normalized away — confirm
  # against the upstream config before relying on this list.
  "?",
  # NOTE(review): the same "،" appears again further down this list; one
  # of the two was presumably a different comma (e.g. full-width "，")
  # — confirm.
  "،",
  "。",
  "、",
  "・",
  "।",
  "؟",
  "،",
  ";",
  "።",
  "፣",
  "፧"
]

# Dataset options.
# Nesting restored on the assumption that the three keys below belong to
# `dataset` (a bare `dataset:` would otherwise parse as null).
dataset:
  # Same placeholder string that heads the label lists.
  pad_label: "<NULL>"
  ignore_extra_tokens: false
  ignore_start_end: false

# Label id lists; both enumerate 0..16 (17 ids).
# NOTE(review): 17 matches the number of `post_labels` entries, so the
# punctuation ids presumably index that list — confirm. It is unclear
# from this file why the capitalization ids span the same range; verify
# against the consumer.
punct_label_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
capit_label_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]

# Settings for the `punct_head` component.
# Nesting restored on the assumption that the four keys below belong to
# `punct_head` (same shape as `capit_head`).
punct_head:
  punct_num_fc_layers: 1
  fc_dropout: 0.1
  activation: "relu"
  use_transformer_init: false

# Settings for the `capit_head` component.
# Nesting restored on the assumption that the four keys below belong to
# `capit_head` (same shape as `punct_head`).
capit_head:
  capit_num_fc_layers: 1
  fc_dropout: 0.1
  activation: "relu"
  use_transformer_init: false

# Tokenizer identifier; same string as `language_model.pretrained_model_name`.
tokenizer: "xlm-roberta-base"

# Language model selection.
# Nesting restored on the assumption that `pretrained_model_name` belongs
# to `language_model` (a bare `language_model:` would parse as null).
language_model:
  pretrained_model_name: "xlm-roberta-base"

# Optimizer settings.
# Nesting restored on the assumption that `name` and `lr` belong to
# `optim`. NOTE(review): further optimizer keys may follow beyond this
# excerpt — confirm they are indented consistently.
optim:
  name: "adam"
  lr: 0.001
