qiantong-xu committed
Commit 1307e8d
Parent(s): e4fc73f
Update app.py

app.py CHANGED
@@ -39,18 +39,28 @@ TUNED_MODEL_RESULTS='''[llama-30b-toolbench](https://huggingface.co/sambanovasys
 [codegen-16B-mono-toolbench](https://huggingface.co/sambanovasystems/codegen-16B-mono-toolbench) & 97.7 & 99.0 & 82.0 & 77.5 & 19.8 & 29.0/ 17.2& 0.0 & 3.5 & 16.2 \\'''
 
 
+def parse_line(line):
+    model_results = line.replace(" ", "").strip("\\").split("&")
+    for i in range(1, len(model_results)):
+        if i == 6:
+            res = model_results[6].split('/')[-1].strip()
+        else:
+            res = model_results[i]
+        model_results[i] = float(res)
+    return model_results
+
 def get_baseline_df():
     df_data = []
 
     lines = UNTUNED_MODEL_RESULTS.split("\n")
     for line in lines:
-        model_results = line
+        model_results = parse_line(line)
         assert len(model_results) == 10
         model_results.insert(1, "False")
         df_data.append(model_results)
     lines = TUNED_MODEL_RESULTS.split("\n")
     for line in lines:
-        model_results = line
+        model_results = parse_line(line)
         assert len(model_results) == 10
         model_results.insert(1, "True")
         df_data.append(model_results)

@@ -95,7 +105,7 @@ with block:
 
     gr.Markdown(
         """In the table below, we summarize the 3-shot performance of all the models.
-        We use success rate as the primary evaluation metric for most tasks, except
+        We use success rate as the primary evaluation metric for most tasks, except that we report rewards on WebShop, and the Longest Common Subsequence (LCS) on VirtualHome, following the original metrics proposed by the respective authors.
         """
     )
     with gr.Row():
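For reference, the new parse_line helper can be exercised on its own. A minimal sketch, copying the function from the hunk above (comments added) and feeding it the codegen-16B-mono-toolbench row quoted there; only the row literal and the final asserts are additions here:

def parse_line(line):
    # Drop spaces and the trailing LaTeX "\\", then split on the "&" column separators.
    model_results = line.replace(" ", "").strip("\\").split("&")
    for i in range(1, len(model_results)):
        if i == 6:
            # Column 6 holds an "x/ y" pair; keep only the value after the slash.
            res = model_results[6].split('/')[-1].strip()
        else:
            res = model_results[i]
        model_results[i] = float(res)
    return model_results

row = "[codegen-16B-mono-toolbench](https://huggingface.co/sambanovasystems/codegen-16B-mono-toolbench) & 97.7 & 99.0 & 82.0 & 77.5 & 19.8 & 29.0/ 17.2& 0.0 & 3.5 & 16.2 \\"
parsed = parse_line(row)
assert len(parsed) == 10   # the same invariant get_baseline_df asserts per row
assert parsed[6] == 17.2   # "29.0/ 17.2" collapses to its second value

get_baseline_df then inserts the tuned/untuned flag at index 1, so each leaderboard row ends up with 11 columns.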
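The updated caption also names the Longest Common Subsequence (LCS) metric used for VirtualHome. As background only, this is the standard dynamic-programming LCS length over action sequences; a generic sketch, not the leaderboard's scoring code, and any normalization (e.g. dividing by the gold length) is an assumption:

# Generic longest-common-subsequence length via dynamic programming.
def lcs_len(pred, gold):
    m, n = len(pred), len(gold)
    # dp[i][j] = LCS length of pred[:i] and gold[:j]
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if pred[i - 1] == gold[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
    return dp[m][n]

# Example: two hypothetical action sequences sharing three steps in order.
pred = ["walk kitchen", "grab mug", "walk table", "put mug table"]
gold = ["walk kitchen", "open cabinet", "grab mug", "put mug table"]
print(lcs_len(pred, gold))  # -> 3
# A score in [0, 1] could divide by len(gold); whether the leaderboard
# normalizes this way is an assumption, not taken from this commit.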