qiantong-xu committed
Commit 1307e8d
1 Parent(s): e4fc73f

Update app.py

Files changed (1)
  1. app.py +13 -3
app.py CHANGED
@@ -39,18 +39,28 @@ TUNED_MODEL_RESULTS='''[llama-30b-toolbench](https://huggingface.co/sambanovasys
 [codegen-16B-mono-toolbench](https://huggingface.co/sambanovasystems/codegen-16B-mono-toolbench) & 97.7 & 99.0 & 82.0 & 77.5 & 19.8 & 29.0/ 17.2& 0.0 & 3.5 & 16.2 \\'''
 
 
+def parse_line(line):
+    model_results = line.replace(" ", "").strip("\\").split("&")
+    for i in range(1, len(model_results)):
+        if i == 6:
+            res = model_results[6].split('/')[-1].strip()
+        else:
+            res = model_results[i]
+        model_results[i] = float(res)
+    return model_results
+
 def get_baseline_df():
     df_data = []
 
     lines = UNTUNED_MODEL_RESULTS.split("\n")
     for line in lines:
-        model_results = line.replace(" ", "").strip("\\").split("&")
+        model_results = parse_line(line)
         assert len(model_results) == 10
         model_results.insert(1, "False")
         df_data.append(model_results)
     lines = TUNED_MODEL_RESULTS.split("\n")
     for line in lines:
-        model_results = line.replace(" ", "").strip("\\").split("&")
+        model_results = parse_line(line)
         assert len(model_results) == 10
         model_results.insert(1, "True")
         df_data.append(model_results)
@@ -95,7 +105,7 @@ with block:
 
     gr.Markdown(
         """In the table below, we summarize the 3-shot performance of all the models.
-        We use success rate as the primary evaluation metric for most tasks, except for the WebShop where we report rewards, as well as for VirtualHome where we use executability and Longest Common Subsequence (LCS), following the original metrics proposed by the respective authors.
+        We use success rate as the primary evaluation metric for most tasks, except that we report rewards on WebShop, and the Longest Common Subsequence (LCS) on VirtualHome, following the original metrics proposed by the respective authors.
         """
     )
     with gr.Row():
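
For context (not part of the commit), here is a minimal sketch of what the new parse_line helper does to one result row; the parse_line body and the sample row are copied from the diff above, while the surrounding asserts and prints are illustrative only:

# Self-contained sketch; parse_line is copied verbatim from this commit.
def parse_line(line):
    model_results = line.replace(" ", "").strip("\\").split("&")
    for i in range(1, len(model_results)):
        if i == 6:
            res = model_results[6].split('/')[-1].strip()
        else:
            res = model_results[i]
        model_results[i] = float(res)
    return model_results

# Row taken from TUNED_MODEL_RESULTS in the diff above.
row = ("[codegen-16B-mono-toolbench](https://huggingface.co/sambanovasystems/codegen-16B-mono-toolbench)"
       " & 97.7 & 99.0 & 82.0 & 77.5 & 19.8 & 29.0/ 17.2& 0.0 & 3.5 & 16.2 \\\\")

parsed = parse_line(row)
# parsed[0] stays the markdown model link; parsed[1:] become floats.
assert len(parsed) == 10   # same invariant checked in get_baseline_df
assert parsed[1] == 97.7   # numeric columns are now floats, not strings
assert parsed[6] == 17.2   # the "29.0/ 17.2" cell keeps only the value after '/'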