OSainz committed on
Commit
b1e5746
•
2 Parent(s): c714a68 36aaa79

Merge branch 'main' of https://huggingface.co/spaces/CONDA-Workshop/Data-Contamination-Report into pr/13

Browse files
Files changed (2) hide show
  1. .gitignore +2 -1
  2. contamination_report.csv +42 -17
.gitignore CHANGED
@@ -1,2 +1,3 @@
1
  *.pyc
2
- *.json
 
 
1
  *.pyc
2
+ *.json
3
+ *.lock
contamination_report.csv CHANGED
@@ -1,5 +1,22 @@
1
  Evaluation Dataset;Subset;Contaminated Source;Model or corpus;Train Split;Development Split;Test Split;Approach;Reference;PR
2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  allenai/ai2_arc;;CommonCrawl;corpus;;;28.7;data-based;https://arxiv.org/abs/2310.17589;5
5
  tau/commonsense_qa;;CommonCrawl;corpus;;1.6;;data-based;https://arxiv.org/abs/2310.17589;5
@@ -436,32 +453,40 @@ zest;;EleutherAI/pile;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
436
  zest;;togethercomputer/RedPajama-Data-V2;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
437
 
438
 
439
- imdb;;GPT-4;model;100.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
440
- imdb;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
441
 
442
- ag_news;;GPT-4;model;100.0;;100.0;model-based;https://arxiv.org/pdf/2308.08493;3
443
- ag_news;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
444
 
445
- yelp_review_full;;GPT-4;model;0.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
446
- yelp_review_full;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
447
 
448
- nyu-mll/glue;rte;GPT-4;model;100.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
449
- nyu-mll/glue;rte;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
450
 
451
- nyu-mll/glue;wnli;GPT-4;model;100.0;;100.0;model-based;https://arxiv.org/pdf/2308.08493;3
452
- nyu-mll/glue;wnli;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
453
 
454
- samsum;;GPT-4;model;0.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
455
- samsum;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
456
 
457
- EdinburghNLP/xsum;;GPT-4;model;0.0;;100.0;model-based;https://arxiv.org/pdf/2308.08493;3
458
- EdinburghNLP/xsum;;GPT-3.5;model;0.0;;100.0;model-based;https://arxiv.org/pdf/2308.08493;3
459
 
460
- bigbio/mednli;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/pdf/2308.08493;8
461
- bigbio/mednli;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/pdf/2308.08493;8
462
 
463
  RadNLI;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/pdf/2308.08493;8
464
  RadNLI;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/pdf/2308.08493;8
 
 
 
 
 
 
 
 
465
 
466
  quac;;GPT-3;model;;99.0;;data-based;https://arxiv.org/abs/2005.14165;13
467
  rajpurkar/squad_v2;;GPT-3;model;;94.0;;data-based;https://arxiv.org/abs/2005.14165;13
@@ -571,4 +596,4 @@ ibragim-bad/arc_easy;;FLAN;model;;20.2;;data-based;https://arxiv.org/abs/2109.01
571
  ibragim-bad/arc_challenge;;FLAN;model;;15.6;;data-based;https://arxiv.org/abs/2109.01652;13
572
  facebook/anli;dev_r3;FLAN;model;;40.2;;data-based;https://arxiv.org/abs/2109.01652;13
573
  facebook/anli;dev_r2;FLAN;model;;97.9;;data-based;https://arxiv.org/abs/2109.01652;13
574
- facebook/anli;dev_r1;FLAN;model;;98.6;;data-based;https://arxiv.org/abs/2109.01652;13
 
1
  Evaluation Dataset;Subset;Contaminated Source;Model or corpus;Train Split;Development Split;Test Split;Approach;Reference;PR
2
 
3
+ gsm8k;;GPT-4;model;79.00;;;model-based;https://arxiv.org/abs/2311.06233;8
4
+ ucinlp/drop;;GPT-4;model;;44.00;;model-based;https://arxiv.org/abs/2311.06233;8
5
+ openai_humaneval;;GPT-4;model;;;56.71;model-based;https://arxiv.org/abs/2311.06233;8
6
+ imdb;;GPT-4;model;;;82.00;model-based;https://arxiv.org/abs/2311.06233;8
7
+ imdb;;GPT-3.5;model;;;55.00;model-based;https://arxiv.org/abs/2311.06233;8
8
+ ag_news;;GPT-4;model;;;91.00;model-based;https://arxiv.org/abs/2311.06233;8
9
+ ag_news;;GPT-3.5;model;;;82.00;model-based;https://arxiv.org/abs/2311.06233;8
10
+ yelp_review_full;;GPT-4;model;;;80.00;model-based;https://arxiv.org/abs/2311.06233;8
11
+ yelp_review_full;;GPT-3.5;model;;;13.00;model-based;https://arxiv.org/abs/2311.06233;8
12
+ nyu-mll/glue;rte;GPT-4;model;;60.00;;model-based;https://arxiv.org/abs/2311.06233;8
13
+ nyu-mll/glue;rte;GPT-3.5;model;;71.00;;model-based;https://arxiv.org/abs/2311.06233;8
14
+ nyu-mll/glue;wnli;GPT-4;model;;50.70;;model-based;https://arxiv.org/abs/2311.06233;8
15
+ nyu-mll/glue;wnli;GPT-3.5;model;;12.68;;model-based;https://arxiv.org/abs/2311.06233;8
16
+ samsum;;GPT-4;model;;;77.00;model-based;https://arxiv.org/abs/2311.06233;8
17
+ samsum;;GPT-3.5;model;;;74.00;model-based;https://arxiv.org/abs/2311.06233;8
18
+ EdinburghNLP/xsum;;GPT-4;model;;;95.00;model-based;https://arxiv.org/abs/2311.06233;8
19
+ EdinburghNLP/xsum;;GPT-3.5;model;;;79.00;model-based;https://arxiv.org/abs/2311.06233;8
20
 
21
  allenai/ai2_arc;;CommonCrawl;corpus;;;28.7;data-based;https://arxiv.org/abs/2310.17589;5
22
  tau/commonsense_qa;;CommonCrawl;corpus;;1.6;;data-based;https://arxiv.org/abs/2310.17589;5
 
453
  zest;;togethercomputer/RedPajama-Data-V2;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
454
 
455
 
456
+ imdb;;GPT-4;model;100.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
457
+ imdb;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
458
 
459
+ ag_news;;GPT-4;model;100.0;;100.0;model-based;https://arxiv.org/abs/2308.08493;3
460
+ ag_news;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
461
 
462
+ yelp_review_full;;GPT-4;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
463
+ yelp_review_full;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
464
 
465
+ nyu-mll/glue;rte;GPT-4;model;100.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
466
+ nyu-mll/glue;rte;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
467
 
468
+ nyu-mll/glue;wnli;GPT-4;model;100.0;;100.0;model-based;https://arxiv.org/abs/2308.08493;3
469
+ nyu-mll/glue;wnli;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
470
 
471
+ samsum;;GPT-4;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
472
+ samsum;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
473
 
474
+ EdinburghNLP/xsum;;GPT-4;model;0.0;;100.0;model-based;https://arxiv.org/abs/2308.08493;3
475
+ EdinburghNLP/xsum;;GPT-3.5;model;0.0;;100.0;model-based;https://arxiv.org/abs/2308.08493;3
476
 
477
+ bigbio/mednli;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
478
+ bigbio/mednli;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
479
 
480
  RadNLI;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/pdf/2308.08493;8
481
  RadNLI;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/pdf/2308.08493;8
482
+ RadNLI;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
483
+ RadNLI;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
484
+
485
+
486
+ openai_humaneval;;EleutherAI/pile;corpus;;;12.2;data-based;https://arxiv.org/abs/2403.04811;12
487
+ mbpp;;EleutherAI/pile;corpus;;;3.6;data-based;https://arxiv.org/abs/2403.04811;12
488
+ openai_humaneval;;bigcode/the-stack;corpus;;;18.9;data-based;https://arxiv.org/abs/2403.04811;12
489
+ mbpp;;bigcode/the-stack;corpus;;;20.8;data-based;https://arxiv.org/abs/2403.04811;12
490
 
491
  quac;;GPT-3;model;;99.0;;data-based;https://arxiv.org/abs/2005.14165;13
492
  rajpurkar/squad_v2;;GPT-3;model;;94.0;;data-based;https://arxiv.org/abs/2005.14165;13
 
596
  ibragim-bad/arc_challenge;;FLAN;model;;15.6;;data-based;https://arxiv.org/abs/2109.01652;13
597
  facebook/anli;dev_r3;FLAN;model;;40.2;;data-based;https://arxiv.org/abs/2109.01652;13
598
  facebook/anli;dev_r2;FLAN;model;;97.9;;data-based;https://arxiv.org/abs/2109.01652;13
599
+ facebook/anli;dev_r1;FLAN;model;;98.6;;data-based;https://arxiv.org/abs/2109.01652;13