add evaluation script
Browse files- app.py +9 -1
- src/about.py +226 -0
app.py
CHANGED
@@ -10,6 +10,7 @@ from src.about import (
|
|
10 |
EVALUATION_QUEUE_TEXT,
|
11 |
INTRODUCTION_TEXT,
|
12 |
LLM_BENCHMARKS_TEXT,
|
|
|
13 |
TITLE,
|
14 |
)
|
15 |
from src.display.css_html_js import custom_css
|
@@ -102,7 +103,14 @@ with demo:
|
|
102 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
103 |
|
104 |
with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
|
105 |
-
gr.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
106 |
with gr.Column():
|
107 |
with gr.Row():
|
108 |
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
|
|
10 |
EVALUATION_QUEUE_TEXT,
|
11 |
INTRODUCTION_TEXT,
|
12 |
LLM_BENCHMARKS_TEXT,
|
13 |
+
EVALUATION_SCRIPT,
|
14 |
TITLE,
|
15 |
)
|
16 |
from src.display.css_html_js import custom_css
|
|
|
103 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
104 |
|
105 |
with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
|
106 |
+
with gr.Accordion(
|
107 |
+
"Evaluation script",
|
108 |
+
open=False,
|
109 |
+
):
|
110 |
+
gr.Markdown(
|
111 |
+
EVALUATION_SCRIPT,
|
112 |
+
elem_classes="markdown-text",
|
113 |
+
)
|
114 |
with gr.Column():
|
115 |
with gr.Row():
|
116 |
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
src/about.py
CHANGED
@@ -68,7 +68,233 @@ If your model is displayed in the `FAILED` category, its execution stopped.
|
|
68 |
Make sure you have followed the above steps first.
|
69 |
If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
|
70 |
"""
|
|
|
|
|
71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
73 |
CITATION_BUTTON_TEXT = r"""
|
74 |
"""
|
|
|
68 |
Make sure you have followed the above steps first.
|
69 |
If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
|
70 |
"""
|
71 |
+
# Markdown rendered inside the "Evaluation script" accordion of the submit tab.
# The literal is delimited with ''' because the embedded script uses """ for its
# judge prompts, and the main script is fenced with FOUR backticks because the
# Anthropic prompt itself contains ``` fences (a three-backtick fence would be
# closed early by them).
EVALUATION_SCRIPT = '''
To evaluate the model you can access the colab notebook at [this link](https://colab.research.google.com/drive/145KAGvgdAb8BrkObUrxAVWBd9EGDqy8N?usp=sharing).

First install the necessary libraries
```
pip install accelerate openai anthropic datasets
```
Set up your:
* OPENAI_API_KEY
* ANTHROPIC_API_KEY
* HF_TOKEN

Select a model
```python
MODEL_ID = "<org>/<model-name>"  # model_id_here
```
Then run the following script
````python
from transformers import pipeline
import torch
import os
import json
from openai import OpenAI
import anthropic
from huggingface_hub import InferenceClient, get_token
from datasets import load_dataset
from tqdm import tqdm

HF_TOKEN = get_token()

# NOTE: these column names are assumptions - check them with `print(ds.column_names)`
# (or against the colab notebook) and adjust before running.
PROMPT_COLUMN = "prompt"    # column holding the task given to the model
BASELINE_COLUMN = "code"    # column holding the expert (reference) solution

ds = load_dataset("braindao/solbench-naive-judge-random-v1",split="test")


pipe = pipeline("text-generation", model= MODEL_ID , torch_dtype=torch.bfloat16, device_map="auto")

def generate(message):
    """Ask the evaluated model to answer `message` and return its reply text."""
    messages = [
        {"role": "user", "content": message},
    ]
    # The pipeline returns the whole conversation; the last entry is the model reply.
    return pipe(messages,max_new_tokens=1024)[0]["generated_text"][-1]["content"]

def convert_to_int(text):
    """Parse a judge reply such as "7" or "[7]" into an int; anything else gives 0."""
    try:
        # Judges are told to answer "[integer]", so tolerate brackets and light markup.
        return int(str(text).strip().strip("[](){}*`'. "))
    except ValueError:
        return 0

def anthropic_judge(code,baseline):
    """Score `code` against the expert `baseline` (0-10) with Claude."""
    prompt = f"""Analyze the provided Solidity code and assign a score from 0 to 10 based on these criteria:

1. Functionality (0-2 points)
2. Security (0-2 points)
3. Efficiency (0-2 points)
4. Readability and Style (0-2 points)
5. Similarity with the Expert Code (0-2 points)

We
Evaluate the code thoroughly, sum up the points, and return ONLY an integer value representing the final score. Your entire response should consist of a single integer between 0 and 10, inclusive.

Solidity code to evaluate:
```solidity
{code}
```

Expert Code:
```solidity
{baseline}
```

OUTPUT FORMAT: [integer]"""


    # Named `system_prompt` (not `sys`) to avoid shadowing the stdlib module name.
    system_prompt = """You are a solidity code judge,
You will only reply with an integer value between 0-10"""

    client = anthropic.Anthropic()

    message = client.messages.create(
        model="claude-3-5-sonnet-20240620",
        max_tokens=1000,
        temperature=0,
        system=system_prompt,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    }
                ]
            }
        ]
    )
    return convert_to_int(message.content[0].text)


def openai_judge(code,baseline):
    """Score `code` against the expert `baseline` (0-10) with GPT-4o."""
    prompt = f"""evaluate the following solidity code and return a score between 0 and 10 based how far the code achieves the following criteria:

1. Functionality (0-2 points)
2. Security (0-2 points)
3. Efficiency (0-2 points)
4. Readability and Style (0-2 points)
5. Similarity with the Expert Code (0-2 points)

code to evaluate:
{code}

expert code:
{baseline}

return only an integer value and no additional comment, score should be either 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 or 10.
"""
    client = OpenAI()
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    return convert_to_int(completion.choices[0].message.content)


def hf_judge(code,baseline):
    """Score `code` against the expert `baseline` (0-10) with Llama 3.1 405B.

    Best effort: if the request fails, the failure is printed and whatever text
    was received so far is parsed (an empty reply counts as 0).
    """
    prompt = f"""evaluate the following solidity code and return a score between 0 and 10 based how far the code achieves the following criteria:

1. Functionality (0-2 points)
2. Security (0-2 points)
3. Efficiency (0-2 points)
4. Readability and Style (0-2 points)
5. Similarity with the Expert Code (0-2 points)

code to evaluate:
{code}

expert code:
{baseline}

return only an integer value and no additional comment, score should be either 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 or 10.
"""
    client = InferenceClient(
        "meta-llama/Meta-Llama-3.1-405B-Instruct",
        token=HF_TOKEN,
    )
    out = ""
    try:
        for message in client.chat_completion(
            messages=[{"role":"system","content" : "you are a solidity code judge, you will only reply with an integer value between 0-10"},
                      {"role": "user", "content": prompt}],
            max_tokens=500,
            stream=True,
        ):
            # A streamed chunk may carry no text; treat it as an empty piece.
            out += message.choices[0].delta.content or ""
    except Exception as err:
        # Keep going with the other samples, but do not hide the failure.
        print(f"hf judge request failed: {err}")
    return convert_to_int(out)

def LLM_JUDGE(code,baseline,judges=("openai","anthropic","hf")) :
    """Return a dict mapping each requested judge name to its 0-10 score."""
    out = {}
    if "openai" in judges :
        out["openai"] = openai_judge(code,baseline)
    if "anthropic" in judges :
        out["anthropic"] = anthropic_judge(code,baseline)
    if "hf" in judges :
        out["hf"] = hf_judge(code,baseline)
    return out

def evaluate_sample(sample, judges=("openai","anthropic","hf")):
    """Generate a solution for one dataset row and let the judges score it."""
    generated_code = generate(sample[PROMPT_COLUMN])
    return LLM_JUDGE(generated_code, sample[BASELINE_COLUMN], judges=judges)

# Judge model against data
scores = {"openai":[],"anthropic":[],"hf":[]}
for sample in tqdm(ds) :
    score = evaluate_sample(sample)
    for key in score.keys():
        scores[key].append(score[key])

# normalize scores to the 0-1 range (a judge without any score gets 0)
for key in scores.keys():
    scores[key] = sum(scores[key])/(10*len(scores[key])) if scores[key] else 0


d = {
    "config": {
        "model_dtype": "torch.bfloat16",
        "model_name": MODEL_ID,
        "model_sha": "main"
    },
    "results": {
        "openai": {
            "score": 0
        },
        "anthropic": {
            "score": 0
        },
        "hf": {
            "score": 0
        }
    }
}

for key in scores.keys() :
    d["results"][key]["score"] = scores[key]


# Serializing json
json_object = json.dumps(d, indent=4)

# Writing to <model name>.json ([-1] also works for ids without an "org/" prefix)
file_name = MODEL_ID.split("/")[-1] + ".json"
with open(file_name, "w") as outfile:
    outfile.write(json_object)

````

If you are not part of braindao, set `create_pr` to **True**
```python
from huggingface_hub import upload_file
upload_file(path_or_fileobj = file_name,
            path_in_repo=f"{MODEL_ID}.json",
            repo_id="braindao/results",
            repo_type="dataset",
            create_pr=False)
```

'''
|
298 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
299 |
CITATION_BUTTON_TEXT = r"""
|
300 |
"""
|