version: 1.1.0
config:
  REPO_ID: "eduagarcia/open_pt_llm_leaderboard"
  QUEUE_REPO: eduagarcia-temp/llm_pt_leaderboard_requests
  RESULTS_REPO: eduagarcia-temp/llm_pt_leaderboard_results
  RAW_RESULTS_REPO: eduagarcia-temp/llm_pt_leaderboard_raw_results
  DYNAMIC_INFO_REPO: "eduagarcia-temp/llm_pt_leaderboard_model_info"
  PATH_TO_COLLECTION: "eduagarcia/portuguese-llm-leaderboard-best-models-65c152c13ab3c67bc4f203a6"
  IS_PUBLIC: true
  LEADERBOARD_NAME: "Open Portuguese LLM Leaderboard"
  GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS: true
  TRUST_REMOTE_CODE: true
  SHOW_INCOMPLETE_EVALS: false
  REQUIRE_MODEL_CARD: true
  REQUIRE_MODEL_LICENSE: false
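# A minimal sketch (an assumption for illustration, not this Space's actual
# code) of how a config block like the one above could be loaded with PyYAML
# and consumed with huggingface_hub; the filename "tasks_config.yaml" is
# hypothetical:
#
#   import yaml
#   from huggingface_hub import snapshot_download
#
#   with open("tasks_config.yaml") as f:
#       cfg = yaml.safe_load(f)["config"]
#   # fetch a local copy of the evaluation-request queue named in QUEUE_REPO
#   snapshot_download(repo_id=cfg["QUEUE_REPO"], repo_type="dataset")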
readme:
  general_description: |
    📐 The 🚀 Open PT LLM Leaderboard aims to provide a benchmark for the evaluation of 
    Large Language Models (LLMs) in the Portuguese language across a variety of tasks 
    and datasets.     
  support_description: |
    This leaderboard is made possible by the support of the 
    [Center of Excellence in AI (CEIA)](https://ceia.ufg.br/) at the 
    [Federal University of Goiás (UFG)](https://international.ufg.br/).

    If you have any questions, suggestions, or would like to contribute to the leaderboard,
    please feel free to reach out at [@eduagarcia](https://linktr.ee/eduagarcia).
  about_description: |
    The 🚀 Open PT-LLM Leaderboard is a benchmark for the evaluation of 
    Large Language Models (LLMs) in the Portuguese language.  
    
    The leaderboard is open to submissions of models from the community and 
    is designed to be a resource for researchers, practitioners, and enthusiasts interested 
    in the development and evaluation of LLMs for the Portuguese language.  

    Supported by the [Center of Excellence in AI (CEIA)](https://ceia.ufg.br/) at the 
    [Federal University of Goiás (UFG)](https://international.ufg.br/), this leaderboard 
    operates on a backend of Nvidia A100-80G GPUs. These GPUs are shared rather than 
    exclusive to the leaderboard, so evaluations are subject to resource availability; 
    please be patient if your model is in the queue.

    This is a fork of the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" target="_blank">🤗 Open LLM Leaderboard</a> with 
    Portuguese benchmarks. 

    Add the results to your model card: [🧐 Open Portuguese LLM Leaderboard Results PR Opener](https://huggingface.co/spaces/eduagarcia-temp/portuguese-leaderboard-results-to-modelcard)
  citation: |
    @misc{open-pt-llm-leaderboard,
      author = {Garcia, Eduardo A. S.},
      title = {Open Portuguese LLM Leaderboard},
      year = {2024},
      publisher = {Hugging Face},
      howpublished = "\url{https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard}"
    }
tasks:
  enem_challenge:
    benchmark: enem_challenge
    col_name: ENEM
    task_list:
    - enem_challenge
    metric: acc
    few_shot: 3
    limit: null
    baseline: 20.0 #random baseline
    #https://www.sejalguem.com/enem
    #https://vestibular.brasilescola.uol.com.br/enem/confira-as-medias-e-notas-maximas-e-minimas-do-enem-2020/349732.html
    human_baseline: 35.0 # ~60 / 180 correct answers - score ~500
    expert_human_baseline: 70.0 # ~124 / 180 correct answers - score ~700
    description: "The Exame Nacional do Ensino Médio (ENEM) is an advanced High-School
      level exam widely applied every year by the Brazilian government to students that 
      wish to undertake a University degree. This dataset contains 1,430 questions that don't require
      image understanding of the exams from 2010 to 2018, 2022 and 2023."  
    link: https://www.ime.usp.br/~ddm/project/enem/ENEM-GuidingTest.pdf
    sources: ["https://huggingface.co/datasets/eduagarcia/enem_challenge", "https://www.ime.usp.br/~ddm/project/enem/", "https://github.com/piresramon/gpt-4-enem", "https://huggingface.co/datasets/maritaca-ai/enem"]
    baseline_sources: ["https://www.sejalguem.com/enem", "https://vestibular.brasilescola.uol.com.br/enem/confira-as-medias-e-notas-maximas-e-minimas-do-enem-2020/349732.html"]
    citation: |
      @InProceedings{ENEM-Challenge,
        author = {Silveira, Igor Cataneo and Mau\'a, Denis Deratani},
        booktitle = {Proceedings of the 6th Brazilian Conference on Intelligent Systems},
        series = {BRACIS},
        title = {University Entrance Exam as a Guiding Test for Artificial Intelligence},
        pages = {426--431},
        year = {2017}
      }
      @misc{nunes2023evaluating,
        title={Evaluating GPT-3.5 and GPT-4 Models on Brazilian University Admission Exams}, 
        author={Desnes Nunes and Ricardo Primi and Ramon Pires and Roberto Lotufo and Rodrigo Nogueira},
        year={2023},
        eprint={2303.17003},
        archivePrefix={arXiv},
        primaryClass={cs.CL}
      }
      @misc{pires2023evaluating,
        title={Evaluating GPT-4's Vision Capabilities on Brazilian University Admission Exams}, 
        author={Ramon Pires and Thales Sales Almeida and Hugo Abonizio and Rodrigo Nogueira},
        year={2023},
        eprint={2311.14169},
        archivePrefix={arXiv},
        primaryClass={cs.CL}
      }
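  # A hedged sketch of how a task entry like enem_challenge above could map
  # onto an lm-evaluation-harness run. Assumption: this Space forks the Open
  # LLM Leaderboard, whose backend follows lm-eval conventions; the model name
  # below is purely illustrative:
  #
  #   import lm_eval
  #
  #   results = lm_eval.simple_evaluate(
  #       model="hf",
  #       model_args="pretrained=some-org/some-model,trust_remote_code=True",
  #       tasks=["enem_challenge"],   # task_list above
  #       num_fewshot=3,              # few_shot above
  #       limit=None,                 # limit above (None = full dataset)
  #   )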
  bluex:
    benchmark: bluex
    col_name: BLUEX
    task_list:
    - bluex
    metric: acc
    few_shot: 3
    limit: null
    baseline: 22.5 #random baseline
    #https://www.comvest.unicamp.br/wp-content/uploads/2023/08/Relatorio_F1_2023.pdf 56% mean - 88% @ top-.99
    #https://acervo.fuvest.br/fuvest/2018/FUVEST_2018_indice_discriminacao_1_fase_ins.pdf 43.4% - ~77% @ top-.99
    human_baseline: 50.0
    expert_human_baseline: 82.5
    description: "BLUEX is a multimodal dataset consisting of the two leading 
    university entrance exams conducted in Brazil: Convest (Unicamp) and Fuvest (USP), 
    spanning from 2018 to 2024. The benchmark comprises of 724 questions that do not have accompanying images"   
    link: https://arxiv.org/abs/2307.05410
    sources: ["https://huggingface.co/datasets/eduagarcia-temp/BLUEX_without_images", "https://github.com/portuguese-benchmark-datasets/bluex", "https://huggingface.co/datasets/portuguese-benchmark-datasets/BLUEX"]
    baseline_sources: ["https://www.comvest.unicamp.br/wp-content/uploads/2023/08/Relatorio_F1_2023.pdf", "https://acervo.fuvest.br/fuvest/2018/FUVEST_2018_indice_discriminacao_1_fase_ins.pdf"]
    citation: |
      @misc{almeida2023bluex,
        title={BLUEX: A benchmark based on Brazilian Leading Universities Entrance eXams}, 
        author={Thales Sales Almeida and Thiago Laitz and Giovana K. Bonás and Rodrigo Nogueira},
        year={2023},
        eprint={2307.05410},
        archivePrefix={arXiv},
        primaryClass={cs.CL}
      }
  oab_exams:
    benchmark: oab_exams
    col_name: OAB Exams
    task_list:
    - oab_exams
    metric: acc
    few_shot: 3
    limit: null
    baseline: 25.0 #random baseline
    #https://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros # 46% mean score
    # http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol3
    # scored above 70%: 17,214 / 638,500 candidates = 97.5th percentile
    # gap from the mean at the 97.5th percentile -> 70.0 - 46 = 24
    # z-score at 97.5% ~ 1.96
    # estimated standard deviation -> 24 / 1.96 ~ 12.2
    # 99th percentile = 46 + 2.33 * 12.2 = ~75.0 (see the worked sketch below)
    human_baseline: 46.0
    expert_human_baseline: 75.0
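    # A worked sketch of the expert-baseline estimate in the comments above,
    # assuming scores are roughly normally distributed around the 46% mean:
    #
    #   from statistics import NormalDist
    #
    #   mean = 46.0                                            # mean OAB score
    #   sigma = (70.0 - mean) / NormalDist().inv_cdf(0.975)    # ~12.2
    #   expert = mean + NormalDist().inv_cdf(0.99) * sigma     # ~74.5 -> 75.0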
    description: OAB Exams is a dataset of more than 2,000 questions from the Brazilian Bar
      Association's exams, from 2010 to 2018.
    link: https://arxiv.org/abs/1712.05128
    sources: ["https://huggingface.co/datasets/eduagarcia/oab_exams", "https://github.com/legal-nlp/oab-exams"]
    baseline_sources: ["http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros", "http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol2", "http://fgvprojetos.fgv.br/publicacao/exame-de-ordem-em-numeros-vol3"]
    citation: |
      @inproceedings{d2017passing,
        title={Passing the Brazilian OAB Exam: Data Preparation and Some Experiments},
        author={Delfino, Pedro and Cuconato, Bruno and Haeusler, Edward Hermann and Rademaker, Alexandre},
        booktitle={Legal Knowledge and Information Systems: JURIX 2017: The Thirtieth Annual Conference},
        volume={302},
        pages={89},
        year={2017},
        organization={IOS Press}
      }
  assin2_rte:
    benchmark: assin2_rte
    col_name: ASSIN2 RTE
    task_list:
    - assin2_rte
    metric: f1_macro
    few_shot: 15
    limit: null
    baseline: 50.0 #random baseline
    human_baseline: null
    expert_human_baseline: null
    description: "ASSIN 2 (Avaliação de Similaridade Semântica e Inferência Textual - 
    Evaluating Semantic Similarity and Textual Entailment) is the second edition of ASSIN, 
    an evaluation shared task in the scope of the computational processing 
    of Portuguese. Recognising Textual Entailment (RTE), also called Natural Language 
    Inference (NLI), is the task of predicting whether a given text (the premise) entails
    (implies) another text (the hypothesis)."
    link: https://dl.acm.org/doi/abs/10.1007/978-3-030-41505-1_39
    sources: ["https://huggingface.co/datasets/eduagarcia/portuguese_benchmark", "https://sites.google.com/view/assin2/", "https://huggingface.co/datasets/assin2"]
    citation: |
      @inproceedings{real2020assin,
        title={The assin 2 shared task: a quick overview},
        author={Real, Livy and Fonseca, Erick and Oliveira, Hugo Goncalo},
        booktitle={International Conference on Computational Processing of the Portuguese Language},
        pages={406--412},
        year={2020},
        organization={Springer}
      }
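  # Macro-F1, the metric used above, averages per-class F1 scores with equal
  # weight, so the majority class cannot dominate the score. A toy sketch with
  # scikit-learn (illustrative labels, not ASSIN 2 data):
  #
  #   from sklearn.metrics import f1_score
  #
  #   gold = ["entailment", "none", "entailment", "none"]
  #   pred = ["entailment", "entailment", "entailment", "none"]
  #   f1_score(gold, pred, average="macro")   # mean of the two per-class F1s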
  assin2_sts:
    benchmark: assin2_sts
    col_name: ASSIN2 STS
    task_list:
    - assin2_sts
    metric: pearson
    few_shot: 15
    limit: null
    baseline: 0.0 #random baseline
    human_baseline: null
    expert_human_baseline: null
    description: "Same as dataset as above. Semantic Textual Similarity (STS) 
    ‘measures the degree of semantic equivalence between two sentences’."
    link: https://dl.acm.org/doi/abs/10.1007/978-3-030-41505-1_39
    sources: ["https://huggingface.co/datasets/eduagarcia/portuguese_benchmark", "https://sites.google.com/view/assin2/", "https://huggingface.co/datasets/assin2"]
  faquad_nli:
    benchmark: faquad_nli
    col_name: FAQUAD NLI
    task_list:
    - faquad_nli
    metric: f1_macro
    few_shot: 15
    limit: null
    baseline: 45.6 #random baseline
    human_baseline: null
    expert_human_baseline: null
    description: "FaQuAD is a Portuguese reading comprehension dataset that follows the format of the 
    Stanford Question Answering Dataset (SQuAD). The dataset aims to address the problem of 
    abundant questions sent by academics whose answers are found in available institutional 
    documents in the Brazilian higher education system. It consists of 900 questions about 
    249 reading passages taken from 18 official documents of a computer science college
    at a Brazilian federal university and 21 Wikipedia articles related to the 
    Brazilian higher education system. FaQuAD-NLI is a modified version of the 
    FaQuAD dataset that repurposes the question answering task as a textual 
    entailment task between a question and its possible answers."
    link: https://ieeexplore.ieee.org/abstract/document/8923668
    sources: ["https://github.com/liafacom/faquad/", "https://huggingface.co/datasets/ruanchaves/faquad-nli"]
    citation: |
      @inproceedings{8923668,
        author={Sayama, Hélio Fonseca and Araujo, Anderson Viçoso and Fernandes, Eraldo Rezende},
        booktitle={2019 8th Brazilian Conference on Intelligent Systems (BRACIS)}, 
        title={FaQuAD: Reading Comprehension Dataset in the Domain of Brazilian Higher Education}, 
        year={2019},
        volume={},
        number={},
        pages={443-448},
        keywords={Training;Context modeling;Encyclopedias;Electronic publishing;Internet;Natural Language Processing;Machine Reading Comprehension;Dataset},
        doi={10.1109/BRACIS.2019.00084}
      }
      @software{Chaves_Rodrigues_napolab_2023,
        author = {Chaves Rodrigues, Ruan and Tanti, Marc and Agerri, Rodrigo},
        doi = {10.5281/zenodo.7781848},
        month = {3},
        title = {{Natural Portuguese Language Benchmark (Napolab)}},
        url = {https://github.com/ruanchaves/napolab},
        version = {1.0.0},
        year = {2023}
      }
  hatebr_offensive:
    benchmark: hatebr_offensive
    col_name: HateBR
    task_list:
    - hatebr_offensive
    metric: f1_macro
    few_shot: 25
    limit: null
    baseline: 50.0
    human_baseline: null
    expert_human_baseline: null
    description: "HateBR is the first large-scale expert annotated dataset of Brazilian Instagram comments for abusive language detection 
    on the web and social media. The HateBR was collected from Brazilian Instagram comments of politicians and manually annotated 
    by specialists. It is composed of 7,000 documents annotated with a binary classification (offensive 
    versus non-offensive comments)."
    link: https://arxiv.org/abs/2103.14972
    sources: ["https://huggingface.co/datasets/eduagarcia/portuguese_benchmark", "https://github.com/franciellevargas/HateBR", "https://huggingface.co/datasets/ruanchaves/hatebr"]
    citation: |
      @inproceedings{vargas-etal-2022-hatebr,
        title = "{H}ate{BR}: A Large Expert Annotated Corpus of {B}razilian {I}nstagram Comments for Offensive Language and Hate Speech Detection",
        author = "Vargas, Francielle  and
          Carvalho, Isabelle  and
          Rodrigues de G{\'o}es, Fabiana  and
          Pardo, Thiago  and
          Benevenuto, Fabr{\'\i}cio",
        booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference",
        month = jun,
        year = "2022",
        address = "Marseille, France",
        publisher = "European Language Resources Association",
        url = "https://aclanthology.org/2022.lrec-1.777",
        pages = "7174--7183"
      }
  portuguese_hate_speech:
    benchmark: portuguese_hate_speech
    col_name: PT Hate Speech
    task_list:
    - portuguese_hate_speech
    metric: f1_macro
    few_shot: 25
    limit: null
    baseline: 47.9
    human_baseline: null
    expert_human_baseline: null
    description: "Portuguese dataset for hate speech detection composed of 5,668 tweets with binary annotations (i.e. 'hate' vs. 'no-hate')"
    link: https://aclanthology.org/W19-3510/
    sources: ["https://huggingface.co/datasets/eduagarcia/portuguese_benchmark", "https://github.com/paulafortuna/Portuguese-Hate-Speech-Dataset", "https://huggingface.co/datasets/hate_speech_portuguese"]
    citation: |
      @inproceedings{fortuna-etal-2019-hierarchically,
        title = "A Hierarchically-Labeled {P}ortuguese Hate Speech Dataset",
        author = "Fortuna, Paula  and
          Rocha da Silva, Jo{\~a}o  and
          Soler-Company, Juan  and
          Wanner, Leo  and
          Nunes, S{\'e}rgio",
        booktitle = "Proceedings of the 3rd Workshop on Abusive Language Online (ALW3)",
        year = "2019",
        publisher = "Association for Computational Linguistics",
        url = "https://aclanthology.org/W19-3510",
        doi = "10.18653/v1/W19-3510",
        pages = "94--104",
      }
  tweetsentbr:
    benchmark: tweetsentbr
    col_name: tweetSentBR
    task_list:
    - tweetsentbr
    metric: f1_macro
    few_shot: 25
    limit: null
    baseline: 32.8
    human_baseline: null
    expert_human_baseline: null
    description: "TweetSentBR is a corpus of Tweets in Brazilian Portuguese. 
    It was labeled by several annotators following steps stablished on the literature for 
    improving reliability on the task of Sentiment Analysis. Each Tweet was annotated 
    in one of the three following classes: Positive, Negative, Neutral."
    link: https://arxiv.org/abs/1712.08917
    sources: ["https://bitbucket.org/HBrum/tweetsentbr", "https://huggingface.co/datasets/eduagarcia/tweetsentbr_fewshot"]
    citation: |
      @InProceedings{BRUM18.389,
        author = {Henrico Brum and Maria das Gra\c{c}as Volpe Nunes},
        title = "{Building a Sentiment Corpus of Tweets in Brazilian Portuguese}",
        booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)},
        year = {2018},
        month = {May 7-12, 2018},
        address = {Miyazaki, Japan},
        editor = {Nicoletta Calzolari (Conference chair) and Khalid Choukri and Christopher Cieri and Thierry Declerck and Sara Goggi and Koiti Hasida and Hitoshi Isahara and Bente Maegaard and Joseph Mariani and Hélène Mazo and Asuncion Moreno and Jan Odijk and Stelios Piperidis and Takenobu Tokunaga},
        publisher = {European Language Resources Association (ELRA)},
        isbn = {979-10-95546-00-9},
        language = {english}
      }