# NOTE: the three lines below are residue from the hosting site's file-page UI,
# not part of the configuration; kept as comments so the YAML parses.
# Salama1429's picture
# Update model_config.yaml
# 564c4b7 verified
# ISO 639-1 codes of the languages this model supports.
languages:
  - "af"
  - "am"
  - "ar"
  - "bg"
  - "bn"
  - "de"
  - "el"
  - "en"
  - "es"
  - "et"
  - "fa"
  - "fi"
  - "fr"
  - "gu"
  - "hi"
  - "hr"
  - "hu"
  - "id"
  - "is"
  - "it"
  - "ja"
  - "kk"
  - "kn"
  - "ko"
  - "ky"
  - "lt"
  - "lv"
  - "mk"
  - "ml"
  - "mr"
  - "nl"
  - "or"
  - "pa"
  - "pl"
  - "ps"
  - "pt"
  - "ro"
  - "ru"
  - "rw"
  - "so"
  - "sr"
  - "sw"
  - "ta"
  - "te"
  - "tr"
  - "uk"
  - "zh"

# Maximum token sequence length the model accepts.
max_length: 256
# Labels that may be inserted BEFORE a token.
# Currently only the null label and the inverted question mark "¿".
pre_labels:
  - "<NULL>"
  - "¿"
# Punctuation and capitalization labels predicted AFTER a token.
# NOTE(review): the original list contained "?" twice and "،" twice; duplicate
# class labels are meaningless, and the upstream model config uses the
# full-width "？" (U+FF1F) and full-width "，" (U+FF0C) for CJK text at those
# positions — restored here so all 17 labels are distinct and line up with the
# 17 ids in punct_label_ids.
post_labels:
  - "<NULL>"
  - "<ACRONYM>"
  - "."
  - ","
  - "?"
  - "？"  # full-width question mark (CJK)
  - "，"  # full-width comma (CJK)
  - "。"  # ideographic full stop
  - "、"  # ideographic comma
  - "・"  # katakana middle dot
  - "।"  # devanagari danda
  - "؟"  # arabic question mark
  - "،"  # arabic comma
  - ";"
  - "።"  # ethiopic full stop
  - "፣"  # ethiopic comma
  - "፧"  # ethiopic question mark
# Combined list of class labels (union of pre_labels and post_labels).
# NOTE(review): same duplicate-label fix as post_labels — the second "?" and
# the first "،" were almost certainly mojibake for the full-width "？" (U+FF1F)
# and "，" (U+FF0C); restored so every label is distinct.
class_labels:
  - "<NULL>"
  - "¿"
  - "<ACRONYM>"
  - "."
  - ","
  - "?"
  - "？"  # full-width question mark (CJK)
  - "，"  # full-width comma (CJK)
  - "。"  # ideographic full stop
  - "、"  # ideographic comma
  - "・"  # katakana middle dot
  - "।"  # devanagari danda
  - "؟"  # arabic question mark
  - "،"  # arabic comma
  - ";"
  - "።"  # ethiopic full stop
  - "፣"  # ethiopic comma
  - "፧"  # ethiopic question mark
# Dataset configuration: these keys are required by NeMo's legacy conversion.
# NOTE(review): the child keys were flush-left in the original, which makes
# `dataset:` parse as null and turns pad_label/ignore_* into unrelated
# top-level keys — they must be indented under `dataset:`.
dataset:
  pad_label: "<NULL>"          # Label used for padding
  ignore_extra_tokens: false   # Whether to ignore tokens beyond the expected sequence length
  ignore_start_end: false      # Whether to ignore special start/end tokens

# Label ID mappings: here we simply assign sequential integers,
# one id per label (17 labels -> ids 0..16).
punct_label_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
capit_label_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
# Configuration for the punctuation prediction head.
# NOTE(review): child keys were flush-left in the original, leaving
# `punct_head:` null — they must be indented under it.
punct_head:
  punct_num_fc_layers: 1        # number of fully-connected layers in the head
  fc_dropout: 0.1               # dropout probability inside the head
  activation: "relu"
  use_transformer_init: false
# Configuration for the capitalization prediction head.
# NOTE(review): child keys were flush-left in the original, leaving
# `capit_head:` null — they must be indented under it.
capit_head:
  capit_num_fc_layers: 1        # number of fully-connected layers in the head
  fc_dropout: 0.1               # dropout probability inside the head
  activation: "relu"
  use_transformer_init: false
# Tokenizer: specify the pretrained tokenizer name.
tokenizer: "xlm-roberta-base"

# Language model configuration.
# Instead of a raw string, we provide a mapping that matches the expected
# LanguageModelConfig.
# NOTE(review): `pretrained_model_name` was flush-left in the original,
# leaving `language_model:` null — it must be indented under it.
language_model:
  pretrained_model_name: "xlm-roberta-base"
  # You can add more parameters here if your setup requires them, e.g.:
  # model_config: {}  # optional extra configuration for the language model

# Optimizer configuration (dummy values; adjust if needed).
# NOTE(review): same indentation fix — `name`/`lr` belong under `optim:`.
optim:
  name: "adam"
  lr: 0.001