Spaces:
Runtime error
Runtime error
gsm8k fix, queue, ref_model column
Browse files- app.py +22 -4
- data/code_eval_board.csv +8 -8
- data/queue.csv +17 -0
- detect-pretrain-code-contamination/src/__pycache__/analyze.cpython-311.pyc +0 -0
- detect-pretrain-code-contamination/src/__pycache__/eval.cpython-311.pyc +0 -0
- detect-pretrain-code-contamination/src/__pycache__/options.cpython-311.pyc +0 -0
- detect-pretrain-code-contamination/src/__pycache__/run.cpython-311.pyc +0 -0
- detect-pretrain-code-contamination/src/__pycache__/utils.cpython-311.pyc +0 -0
- detect-pretrain-code-contamination/src/eval.py +1 -2
- detect-pretrain-code-contamination/src/run.py +4 -1
- src/__pycache__/css_html.cpython-311.pyc +0 -0
- src/__pycache__/envs.cpython-311.pyc +0 -0
- src/__pycache__/text_content.cpython-311.pyc +0 -0
- src/__pycache__/utils.cpython-311.pyc +0 -0
- src/utils.py +13 -0
app.py
CHANGED
@@ -5,6 +5,7 @@ import sys
|
|
5 |
import time
|
6 |
import pandas as pd
|
7 |
from threading import Thread
|
|
|
8 |
|
9 |
# Add the path to the "src" directory of detect-pretrain-code-contamination to the sys.path
|
10 |
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "detect-pretrain-code-contamination"))
|
@@ -23,6 +24,8 @@ from src.utils import (
|
|
23 |
make_clickable_names,
|
24 |
styled_error,
|
25 |
styled_message,
|
|
|
|
|
26 |
)
|
27 |
|
28 |
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
|
@@ -33,7 +36,8 @@ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default a
|
|
33 |
# CONFIGURATION:
|
34 |
ref_model = "huggyllama/llama-7b"
|
35 |
test_datasets = ["truthful_qa","cais/mmlu","ai2_arc","gsm8k","Rowan/hellaswag","winogrande"]
|
36 |
-
modelQueue =
|
|
|
37 |
|
38 |
def restart_space(): #Most dumbest update function to ever exist, I'm sobbing in tears as I've tried to make gradio update the leaderboard literally any other way.
|
39 |
API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
|
@@ -84,7 +88,7 @@ def worker_thread():
|
|
84 |
global modelQueue, server
|
85 |
while True:
|
86 |
for submission in modelQueue:
|
87 |
-
#evaluate(submission[
|
88 |
#modelQueue.pop(modelQueue.index(submission))
|
89 |
|
90 |
# Uncomment those lines in order to begin testing, I test these models outside of this space and later commit the results back.
|
@@ -98,7 +102,12 @@ def worker_thread():
|
|
98 |
|
99 |
def queue(model,model_type):
|
100 |
global modelQueue
|
101 |
-
modelQueue.append([model
|
|
|
|
|
|
|
|
|
|
|
102 |
print(f"QUEUE:\n{modelQueue}")
|
103 |
|
104 |
|
@@ -269,6 +278,15 @@ with demo:
|
|
269 |
"## 🤗 Submit a model here:", elem_classes="markdown-text"
|
270 |
)
|
271 |
with gr.Column():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
272 |
with gr.Row():
|
273 |
model_name = gr.Textbox(label="Model name")
|
274 |
revision_name = gr.Textbox(
|
@@ -288,7 +306,7 @@ with demo:
|
|
288 |
interactive=True,
|
289 |
)
|
290 |
model_type = gr.Dropdown(
|
291 |
-
choices=["🟢 base", "🔶
|
292 |
label="Model type",
|
293 |
multiselect=False,
|
294 |
value=None,
|
|
|
5 |
import time
|
6 |
import pandas as pd
|
7 |
from threading import Thread
|
8 |
+
import numpy as np
|
9 |
|
10 |
# Add the path to the "src" directory of detect-pretrain-code-contamination to the sys.path
|
11 |
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "detect-pretrain-code-contamination"))
|
|
|
24 |
make_clickable_names,
|
25 |
styled_error,
|
26 |
styled_message,
|
27 |
+
EVAL_COLS,
|
28 |
+
EVAL_TYPES
|
29 |
)
|
30 |
|
31 |
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
|
|
|
36 |
# CONFIGURATION:
|
37 |
ref_model = "huggyllama/llama-7b"
|
38 |
test_datasets = ["truthful_qa","cais/mmlu","ai2_arc","gsm8k","Rowan/hellaswag","winogrande"]
|
39 |
+
modelQueue = (pd.read_csv('data/queue.csv')).values.tolist()
|
40 |
+
print(modelQueue)
|
41 |
|
42 |
def restart_space(): #Most dumbest update function to ever exist, I'm sobbing in tears as I've tried to make gradio update the leaderboard literally any other way.
|
43 |
API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
|
|
|
88 |
global modelQueue, server
|
89 |
while True:
|
90 |
for submission in modelQueue:
|
91 |
+
#evaluate(submission[1],submission[0].split(" ")[0])
|
92 |
#modelQueue.pop(modelQueue.index(submission))
|
93 |
|
94 |
# Uncomment those lines in order to begin testing, I test these models outside of this space and later commit the results back.
|
|
|
102 |
|
103 |
def queue(model,model_type):
|
104 |
global modelQueue
|
105 |
+
modelQueue.append([model_type,model])
|
106 |
+
|
107 |
+
file_path = "data/queue.csv"
|
108 |
+
with open(file_path, "a") as f:
|
109 |
+
f.write(f"\n{model_type},{model}")
|
110 |
+
f.close()
|
111 |
print(f"QUEUE:\n{modelQueue}")
|
112 |
|
113 |
|
|
|
278 |
"## 🤗 Submit a model here:", elem_classes="markdown-text"
|
279 |
)
|
280 |
with gr.Column():
|
281 |
+
with gr.Column():
|
282 |
+
with gr.Accordion(
|
283 |
+
f"⏳ Evaluation Queue ({len(modelQueue)})",
|
284 |
+
open=False,
|
285 |
+
):
|
286 |
+
with gr.Row():
|
287 |
+
finished_eval_table = gr.components.Dataframe(
|
288 |
+
value=pd.DataFrame(modelQueue, columns=['Type','Model']),
|
289 |
+
)
|
290 |
with gr.Row():
|
291 |
model_name = gr.Textbox(label="Model name")
|
292 |
revision_name = gr.Textbox(
|
|
|
306 |
interactive=True,
|
307 |
)
|
308 |
model_type = gr.Dropdown(
|
309 |
+
choices=["🟢 base", "🔶 finetuned"],
|
310 |
label="Model type",
|
311 |
multiselect=False,
|
312 |
value=None,
|
data/code_eval_board.csv
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
-
T,Models,ARC,HellaSwag,MMLU,TruthfulQA,Winogrande,GSM8K
|
2 |
-
🟢,roneneldan/TinyStories-3M,0.06,0.1,0.13,0.2,0.01,0
|
3 |
-
🟢,roneneldan/TinyStories-1M,0.05,0.11,0.09,0.17,0.01,0
|
4 |
-
🔶,Fredithefish/ReasonixPajama-3B-HF,0.15,0.24,0.21,0.94,0.01,0.44
|
5 |
-
🟢,mistralai/Mistral-7B-v0.1,0.54,0.51,0.46,0.75,0,0.91
|
6 |
-
🔶,rishiraj/meow,0.11,0.49,0.28,0.36,0.02,0.95
|
7 |
-
🔶,Q-bert/MetaMath-Cybertron-Starling,0.52,0.64,0.51,0.75,0.01,0.99
|
8 |
-
🔶,upstage/SOLAR-10.7B-Instruct-v1.0,0.11,0.49,0.28,0.36,0.01,0.96
|
|
|
1 |
+
T,Models,ARC,HellaSwag,MMLU,TruthfulQA,Winogrande,GSM8K,Reference Model
|
2 |
+
🟢,roneneldan/TinyStories-3M,0.06,0.1,0.13,0.2,0.01,0,huggyllama/llama-7b
|
3 |
+
🟢,roneneldan/TinyStories-1M,0.05,0.11,0.09,0.17,0.01,0,huggyllama/llama-7b
|
4 |
+
🔶,Fredithefish/ReasonixPajama-3B-HF,0.15,0.24,0.21,0.94,0.01,0.44,huggyllama/llama-7b
|
5 |
+
🟢,mistralai/Mistral-7B-v0.1,0.54,0.51,0.46,0.75,0,0.91,huggyllama/llama-7b
|
6 |
+
🔶,rishiraj/meow,0.11,0.49,0.28,0.36,0.02,0.95,huggyllama/llama-7b
|
7 |
+
🔶,Q-bert/MetaMath-Cybertron-Starling,0.52,0.64,0.51,0.75,0.01,0.99,huggyllama/llama-7b
|
8 |
+
🔶,upstage/SOLAR-10.7B-Instruct-v1.0,0.11,0.49,0.28,0.36,0.01,0.96,huggyllama/llama-7b
|
data/queue.csv
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Type,Model
|
2 |
+
🔶 finetuned, AIDC-ai-business/Marcoroni-7B-v3
|
3 |
+
🔶 finetuned, openchat/openchat_3.5
|
4 |
+
🔶 finetuned, teknium/OpenHermes-2.5-Mistral-7B
|
5 |
+
🔶 finetuned, WizardLM/WizardMath-7B-V1.1
|
6 |
+
🔶 finetuned, Intel/neural-chat-7b-v3-3
|
7 |
+
🔶 finetuned, mistralai/Mistral-7B-Instruct-v0.2
|
8 |
+
🔶 finetuned, ehartford/dolphin-2.1-mistral-7b
|
9 |
+
🔶 finetuned, HuggingFaceH4/zephyr-7b-beta
|
10 |
+
🔶 finetuned, berkeley-nest/Starling-LM-7B-alpha
|
11 |
+
🔶 finetuned, Open-Orca/Mistral-7B-OpenOrca
|
12 |
+
🔶 finetuned, amazon/MistralLite
|
13 |
+
🔶 finetuned, meta-math/MetaMath-Mistral-7B
|
14 |
+
🔶 finetuned, microsoft/Orca-2-7b
|
15 |
+
🔶 finetuned, 01-ai/Yi-6B-200K
|
16 |
+
🔶 finetuned, Yhyu13/LMCocktail-10.7B-v1
|
17 |
+
🔶 finetuned, openchat/openchat-3.5-1210
|
detect-pretrain-code-contamination/src/__pycache__/analyze.cpython-311.pyc
CHANGED
Binary files a/detect-pretrain-code-contamination/src/__pycache__/analyze.cpython-311.pyc and b/detect-pretrain-code-contamination/src/__pycache__/analyze.cpython-311.pyc differ
|
|
detect-pretrain-code-contamination/src/__pycache__/eval.cpython-311.pyc
CHANGED
Binary files a/detect-pretrain-code-contamination/src/__pycache__/eval.cpython-311.pyc and b/detect-pretrain-code-contamination/src/__pycache__/eval.cpython-311.pyc differ
|
|
detect-pretrain-code-contamination/src/__pycache__/options.cpython-311.pyc
CHANGED
Binary files a/detect-pretrain-code-contamination/src/__pycache__/options.cpython-311.pyc and b/detect-pretrain-code-contamination/src/__pycache__/options.cpython-311.pyc differ
|
|
detect-pretrain-code-contamination/src/__pycache__/run.cpython-311.pyc
CHANGED
Binary files a/detect-pretrain-code-contamination/src/__pycache__/run.cpython-311.pyc and b/detect-pretrain-code-contamination/src/__pycache__/run.cpython-311.pyc differ
|
|
detect-pretrain-code-contamination/src/__pycache__/utils.cpython-311.pyc
CHANGED
Binary files a/detect-pretrain-code-contamination/src/__pycache__/utils.cpython-311.pyc and b/detect-pretrain-code-contamination/src/__pycache__/utils.cpython-311.pyc differ
|
|
detect-pretrain-code-contamination/src/eval.py
CHANGED
@@ -147,8 +147,7 @@ def process_gsm8k(data):
|
|
147 |
new_data = []
|
148 |
for ex in data:
|
149 |
new_ex = {}
|
150 |
-
|
151 |
-
output = ex["answer"]
|
152 |
new_ex["output"] = output
|
153 |
new_ex["input"] = ex["question"] + " " + output
|
154 |
new_data.append(new_ex)
|
|
|
147 |
new_data = []
|
148 |
for ex in data:
|
149 |
new_ex = {}
|
150 |
+
output = ex["answer"].split('####')[0].strip()
|
|
|
151 |
new_ex["output"] = output
|
152 |
new_ex["input"] = ex["question"] + " " + output
|
153 |
new_data.append(new_ex)
|
detect-pretrain-code-contamination/src/run.py
CHANGED
@@ -44,7 +44,10 @@ def load_model(name1):
|
|
44 |
if name1 not in models:
|
45 |
model1 = AutoModelForCausalLM.from_pretrained(name1, return_dict=True, device_map='auto')
|
46 |
model1.eval()
|
47 |
-
|
|
|
|
|
|
|
48 |
|
49 |
tokenizer1.pad_token = tokenizer1.eos_token
|
50 |
models[name1] = model1
|
|
|
44 |
if name1 not in models:
|
45 |
model1 = AutoModelForCausalLM.from_pretrained(name1, return_dict=True, device_map='auto')
|
46 |
model1.eval()
|
47 |
+
if name1.contains('mistral') or name1.contains('Mistral'): #Loading default mistral tokenizers as some tokenizers don't work out of the box.
|
48 |
+
tokenizer1 = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
|
49 |
+
else:
|
50 |
+
tokenizer1 = AutoTokenizer.from_pretrained(name1)
|
51 |
|
52 |
tokenizer1.pad_token = tokenizer1.eos_token
|
53 |
models[name1] = model1
|
src/__pycache__/css_html.cpython-311.pyc
CHANGED
Binary files a/src/__pycache__/css_html.cpython-311.pyc and b/src/__pycache__/css_html.cpython-311.pyc differ
|
|
src/__pycache__/envs.cpython-311.pyc
CHANGED
Binary files a/src/__pycache__/envs.cpython-311.pyc and b/src/__pycache__/envs.cpython-311.pyc differ
|
|
src/__pycache__/text_content.cpython-311.pyc
CHANGED
Binary files a/src/__pycache__/text_content.cpython-311.pyc and b/src/__pycache__/text_content.cpython-311.pyc differ
|
|
src/__pycache__/utils.cpython-311.pyc
CHANGED
Binary files a/src/__pycache__/utils.cpython-311.pyc and b/src/__pycache__/utils.cpython-311.pyc differ
|
|
src/utils.py
CHANGED
@@ -31,6 +31,7 @@ class AutoEvalColumn: # Auto evals column
|
|
31 |
Winogrande = ColumnContent("Winogrande", "number", True)
|
32 |
GSM8K = ColumnContent("GSM8K", "number", True)
|
33 |
dummy = ColumnContent("Models", "str", True)
|
|
|
34 |
|
35 |
|
36 |
def model_hyperlink(link, model_name):
|
@@ -77,3 +78,15 @@ def is_model_on_hub(model_name: str, revision: str) -> bool:
|
|
77 |
except Exception as e:
|
78 |
print(f"Could not get the model config from the hub.: {e}")
|
79 |
return False, "was not found on hub!"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
Winogrande = ColumnContent("Winogrande", "number", True)
|
32 |
GSM8K = ColumnContent("GSM8K", "number", True)
|
33 |
dummy = ColumnContent("Models", "str", True)
|
34 |
+
ref_model = ColumnContent("Reference Model", "str", True)
|
35 |
|
36 |
|
37 |
def model_hyperlink(link, model_name):
|
|
|
78 |
except Exception as e:
|
79 |
print(f"Could not get the model config from the hub.: {e}")
|
80 |
return False, "was not found on hub!"
|
81 |
+
|
82 |
+
@dataclass(frozen=True)
|
83 |
+
class EvalQueueColumn: # Queue column
|
84 |
+
model = ColumnContent("model", "markdown", True)
|
85 |
+
revision = ColumnContent("revision", "str", True)
|
86 |
+
private = ColumnContent("private", "bool", True)
|
87 |
+
precision = ColumnContent("precision", "str", True)
|
88 |
+
weight_type = ColumnContent("weight_type", "str", "Original")
|
89 |
+
status = ColumnContent("status", "str", True)
|
90 |
+
|
91 |
+
EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
|
92 |
+
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|