v2

Browse files

Roberta-base-ca-v2 trained with the new version of the TeCla dataset (v2).

Files changed (5) hide show

README.md +11 -11
config.json +110 -40
pytorch_model.bin +2 -2
tokenizer.json +0 -0
tokenizer_config.json +1 -1

README.md CHANGED Viewed

@@ -34,7 +34,7 @@ model-index:
     metrics:
       - name: Accuracy
         type: accuracy
-        value: 0.7426
 widget:
@@ -107,7 +107,7 @@ At the time of submission, no measures have been taken to estimate the bias embe
 ## Training
 ### Training data
-We used the TC dataset in Catalan called [TeCla](https://huggingface.co/datasets/projecte-aina/tecla) for training and evaluation.
 ### Training procedure
 The model was trained with a batch size of 16 and a learning rate of 5e-5 for 5 epochs. We then selected the best checkpoint using the downstream task metric in the corresponding development set and then evaluated it on the test set.
@@ -116,17 +116,17 @@ The model was trained with a batch size of 16 and a learning rate of 5e-5 for 5
 ### Variable and metrics
-This model was finetuned maximizing accuracy.
 ## Evaluation results
-We evaluated the _roberta-base-ca-v2-cased-tc_ on the TeCla test set against standard multilingual and monolingual baselines:
-| Model        | TeCla (Accuracy) |
-| ------------|:-------------|
-| roberta-base-ca-v2-cased-tc | **74.26** |
-| roberta-base-ca-cased-tc | 73.65 |
-| mBERT       |  69.90 |
-| XLM-RoBERTa | 70.14 |
 For more details, check the fine-tuning and evaluation scripts in the official [GitHub repository](https://github.com/projecte-aina/club).

     metrics:
       - name: Accuracy
         type: accuracy
+        value: 0.8034
 widget:
 ## Training
 ### Training data
+We used the Catalan text classification (TC) dataset [TeCla](https://huggingface.co/datasets/projecte-aina/tecla) for training and evaluation. Although TeCla includes both a coarse-grained ('label1') and a fine-grained ('label2') categorization, only the latter, with 53 classes, was used for training.
 ### Training procedure
 The model was trained with a batch size of 16 and a learning rate of 5e-5 for 5 epochs. We then selected the best checkpoint using the downstream task metric in the corresponding development set and then evaluated it on the test set.
 ### Variable and metrics
+This model was fine-tuned to maximize the weighted F1 score.
 ## Evaluation results
+We evaluated the _roberta-base-ca-v2-cased-tc_ on the TeCla test set against standard multilingual and monolingual baselines. The results for 'label1' categories were obtained through a mapping from the fine-grained category ('label2') to the corresponding coarse-grained one ('label1').
+| Model        | TeCla - label1 (Accuracy) |  TeCla - label2 (Accuracy) |
+| ------------|:-------------|:-------------|
+| roberta-base-ca-v2 | 96.31 | 80.34 |
+| roberta-large-ca-v2 | **96.51** | **80.68** |
+| mBERT       |  95.72 | 78.47 |
+| XLM-RoBERTa | 95.66 | 78.01 |
 For more details, check the fine-tuning and evaluation scripts in the official [GitHub repository](https://github.com/projecte-aina/club).

config.json CHANGED Viewed

@@ -1,10 +1,11 @@
 {
-  "_name_or_path": "projecte-aina/roberta-base-ca-v2-cased-tc",
   "architectures": [
     "RobertaForSequenceClassification"
   ],
   "attention_probs_dropout_prob": 0.1,
   "bos_token_id": 0,
   "eos_token_id": 2,
   "finetuning_task": "tecla",
   "gradient_checkpointing": false,
@@ -12,48 +13,116 @@
   "hidden_dropout_prob": 0.1,
   "hidden_size": 768,
   "id2label": {
-      "0": "Medi ambient",
-      "1": "Societat",
-      "2": "Policial",
-      "3": "Judicial",
-      "4": "Empresa",
-      "5": "Partits",
-      "6": "Política",
-      "7": "Successos",
-      "8": "Salut",
-      "9": "Infraestructures",
-      "10": "Parlament",
-      "11": "Música",
-      "12": "Govern",
-      "13": "Unió Europea",
-      "14": "Economia",
-      "15": "Mobilitat",
-      "16": "Treball",
-      "17": "Cultura",
-      "18": "Educació"
   },
   "initializer_range": 0.02,
   "intermediate_size": 3072,
   "label2id": {
-        "Medi ambient": 0,
-        "Societat": 1,
-        "Policial": 2,
-        "Judicial": 3,
-        "Empresa": 4,
-        "Partits": 5,
-        "Política": 6,
-        "Successos": 7,
-        "Salut": 8,
-        "Infraestructures": 9,
-        "Parlament": 10,
-        "Música": 11,
-        "Govern": 12,
-        "Unió Europea": 13,
-        "Economia": 14,
-        "Mobilitat": 15,
-        "Treball": 16,
-        "Cultura": 17,
-        "Educació": 18
   },
   "layer_norm_eps": 1e-05,
   "max_position_embeddings": 514,
@@ -63,7 +132,8 @@
   "pad_token_id": 1,
   "position_embedding_type": "absolute",
   "problem_type": "single_label_classification",
-  "transformers_version": "4.6.1",
   "type_vocab_size": 1,
   "use_cache": true,
   "vocab_size": 50262

 {
+  "_name_or_path": "/gpfs/projects/bsc88/projects/catalan_evaluation/models/roberta-base-ca-v2",
   "architectures": [
     "RobertaForSequenceClassification"
   ],
   "attention_probs_dropout_prob": 0.1,
   "bos_token_id": 0,
+  "classifier_dropout": null,
   "eos_token_id": 2,
   "finetuning_task": "tecla",
   "gradient_checkpointing": false,
   "hidden_dropout_prob": 0.1,
   "hidden_size": 768,
   "id2label": {
+    "0": "Llengua",
+    "1": "Infraestructures",
+    "2": "Arts",
+    "3": "Parlament",
+    "4": "Noves tecnologies",
+    "5": "Castells",
+    "6": "Successos",
+    "7": "Empresa",
+    "8": "Mobilitat",
+    "9": "Teatre",
+    "10": "Treball",
+    "11": "Log\u00edstica",
+    "12": "Urbanisme",
+    "13": "Govern",
+    "14": "Entitats",
+    "15": "Finances",
+    "16": "Govern espanyol",
+    "17": "Tr\u00e0nsit",
+    "18": "Ind\u00fastria",
+    "19": "Esports",
+    "20": "Exteriors",
+    "21": "Medi ambient",
+    "22": "Habitatge",
+    "23": "Salut",
+    "24": "Equipaments i patrimoni",
+    "25": "Recerca",
+    "26": "Cooperaci\u00f3",
+    "27": "Innovaci\u00f3",
+    "28": "Agroalimentaci\u00f3",
+    "29": "Policial",
+    "30": "Serveis Socials",
+    "31": "Cinema",
+    "32": "Mem\u00f2ria hist\u00f2rica",
+    "33": "Turisme",
+    "34": "Pol\u00edtica municipal",
+    "35": "Comer\u00e7",
+    "36": "Universitats",
+    "37": "Hisenda",
+    "38": "Judicial",
+    "39": "Partits",
+    "40": "M\u00fasica",
+    "41": "Lletres",
+    "42": "Religi\u00f3",
+    "43": "Festa i cultura popular",
+    "44": "Uni\u00f3 Europea",
+    "45": "Moda",
+    "46": "Moviments socials",
+    "47": "Comptes p\u00fablics",
+    "48": "Immigraci\u00f3",
+    "49": "Educaci\u00f3",
+    "50": "Gastronomia",
+    "51": "Meteorologia",
+    "52": "Energia"
   },
   "initializer_range": 0.02,
   "intermediate_size": 3072,
   "label2id": {
+    "Agroalimentaci\u00f3": 28,
+    "Arts": 2,
+    "Castells": 5,
+    "Cinema": 31,
+    "Comer\u00e7": 35,
+    "Comptes p\u00fablics": 47,
+    "Cooperaci\u00f3": 26,
+    "Educaci\u00f3": 49,
+    "Empresa": 7,
+    "Energia": 52,
+    "Entitats": 14,
+    "Equipaments i patrimoni": 24,
+    "Esports": 19,
+    "Exteriors": 20,
+    "Festa i cultura popular": 43,
+    "Finances": 15,
+    "Gastronomia": 50,
+    "Govern": 13,
+    "Govern espanyol": 16,
+    "Habitatge": 22,
+    "Hisenda": 37,
+    "Immigraci\u00f3": 48,
+    "Ind\u00fastria": 18,
+    "Infraestructures": 1,
+    "Innovaci\u00f3": 27,
+    "Judicial": 38,
+    "Llengua": 0,
+    "Lletres": 41,
+    "Log\u00edstica": 11,
+    "Medi ambient": 21,
+    "Mem\u00f2ria hist\u00f2rica": 32,
+    "Meteorologia": 51,
+    "Mobilitat": 8,
+    "Moda": 45,
+    "Moviments socials": 46,
+    "M\u00fasica": 40,
+    "Noves tecnologies": 4,
+    "Parlament": 3,
+    "Partits": 39,
+    "Policial": 29,
+    "Pol\u00edtica municipal": 34,
+    "Recerca": 25,
+    "Religi\u00f3": 42,
+    "Salut": 23,
+    "Serveis Socials": 30,
+    "Successos": 6,
+    "Teatre": 9,
+    "Treball": 10,
+    "Tr\u00e0nsit": 17,
+    "Turisme": 33,
+    "Universitats": 36,
+    "Uni\u00f3 Europea": 44,
+    "Urbanisme": 12
   },
   "layer_norm_eps": 1e-05,
   "max_position_embeddings": 514,
   "pad_token_id": 1,
   "position_embedding_type": "absolute",
   "problem_type": "single_label_classification",
+  "torch_dtype": "float32",
+  "transformers_version": "4.17.0",
   "type_vocab_size": 1,
   "use_cache": true,
   "vocab_size": 50262

pytorch_model.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b5da19d73d770844b657a2e4489d742af5e843a480c8b5fc46cbb61d1a6698e2
-size 498717165

 version https://git-lfs.github.com/spec/v1
+oid sha256:c4e13e309d6f6b36be7736992f9164db10e421c8abadf61128f44759237fd686
+size 498822701

tokenizer.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json CHANGED Viewed

@@ -1 +1 @@

- {"unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": true, "errors": "replace", "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "max_len": 512, "special_tokens_map_file": null, "name_or_path": "/gpfs/projects/bsc88/BERTs/models/roberta_base_ca_jsc/transformed_lr0.0005"}

+ {"errors": "replace", "bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": true, "trim_offsets": true, "max_len": 512, "special_tokens_map_file": null, "name_or_path": "/gpfs/projects/bsc88/projects/catalan_evaluation/models/roberta-base-ca-v2", "tokenizer_class": "RobertaTokenizer"}