bofenghuang committed on
Commit
6ca16de
•
1 Parent(s): b8d7022
Files changed (3)
  1. README.md +5 -4
  2. common.py +728 -0
  3. qa_browser.py +448 -0
README.md CHANGED
@@ -1,12 +1,13 @@
 ---
 title: Mt Bench French Browser
-emoji: 📈
-colorFrom: gray
-colorTo: blue
+emoji: 📊
+colorFrom: yellow
+colorTo: pink
 sdk: gradio
 sdk_version: 4.10.0
-app_file: app.py
+app_file: qa_browser.py
 pinned: false
+license: other
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
common.py ADDED
@@ -0,0 +1,728 @@
+"""
+Common data structures and utilities.
+"""
+
+import ast
+import dataclasses
+import glob
+import json
+import os
+import re
+import time
+from typing import Optional
+
+import openai
+import anthropic
+
+from mistralai.client import MistralClient
+
+from fastchat.model.model_adapter import get_conversation_template, ANTHROPIC_MODEL_LIST, MISTRAL_MODEL_LIST
+
+# API setting constants
+API_MAX_RETRY = 16
+API_RETRY_SLEEP = 10
+API_ERROR_OUTPUT = "$ERROR$"
+
+TIE_DELTA = 0.1
+
+# Categories that need reference answers
+NEED_REF_CATS = ["math", "reasoning", "coding", "arena-hard-200"]
+
+# Regex patterns (raw strings, to avoid invalid-escape warnings) that extract scores from judgments
+two_score_pattern = re.compile(r"\[\[(\d+\.?\d*),\s?(\d+\.?\d*)\]\]")
+two_score_pattern_backup = re.compile(r"\[(\d+\.?\d*),\s?(\d+\.?\d*)\]")
+one_score_pattern = re.compile(r"\[\[(\d+\.?\d*)\]\]")
+one_score_pattern_backup = re.compile(r"\[(\d+\.?\d*)\]")
+
+# Sampling temperature config for each question category
+temperature_config = {
+    "writing": 0.7,
+    "roleplay": 0.7,
+    "extraction": 0.0,
+    "math": 0.0,
+    "coding": 0.0,
+    "reasoning": 0.0,
+    "stem": 0.1,
+    "humanities": 0.1,
+    "arena-hard-200": 0.0,
+}
+
+reverse_model_map = {
+    "model_1": "model_2",
+    "model_2": "model_1",
+}
+
+
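+# Editor's note: the following self-check is an illustrative addition, not part
+# of the original commit; it shows the judgment formats the patterns above
+# expect: "[[8.5]]" for single grading, "[[7, 9]]" for pairwise scoring.
+assert one_score_pattern.search("Rating: [[8.5]]").group(1) == "8.5"
+assert two_score_pattern.search("[[7, 9]]").groups() == ("7", "9")
+
+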
+@dataclasses.dataclass
+class Judge:
+    model_name: str
+    prompt_template: dict
+    ref_based: bool = False
+    multi_turn: bool = False
+
+
+@dataclasses.dataclass
+class MatchSingle:
+    question: dict
+    model: str
+    answer: dict
+    judge: Judge
+    ref_answer: Optional[dict] = None
+    multi_turn: bool = False
+
+
+@dataclasses.dataclass
+class MatchPair:
+    question: dict
+    model_1: str
+    model_2: str
+    answer_1: dict
+    answer_2: dict
+    judge: Judge
+    ref_answer: Optional[dict] = None
+    multi_turn: bool = False
+
+
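+# Editor's note: hypothetical sketch, not part of the original commit, of the
+# prompt-template dict a Judge carries; the field names mirror FastChat's
+# judge_prompts.jsonl and the exact values here are assumptions.
+# example_judge = Judge(
+#     model_name="gpt-4",
+#     prompt_template={
+#         "name": "single-v1",
+#         "type": "single",
+#         "system_prompt": "You are a helpful assistant.",
+#         "prompt_template": "[Question]\n{question}\n\n[Answer]\n{answer}",
+#         "output_format": "[[rating]]",
+#     },
+# )
+
+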
+def load_questions(question_file: str, begin: Optional[int], end: Optional[int]):
+    """Load questions from a file."""
+    questions = []
+    with open(question_file, "r") as ques_file:
+        for line in ques_file:
+            if line:
+                questions.append(json.loads(line))
+    questions = questions[begin:end]
+    return questions
+
+
+def load_model_answers(answer_dir: str):
+    """Load model answers.
+
+    The return value is a python dict of type:
+    Dict[model_name: str -> Dict[question_id: int -> answer: dict]]
+    """
+    filenames = glob.glob(os.path.join(answer_dir, "*.jsonl"))
+    filenames.sort()
+    model_answers = {}
+
+    for filename in filenames:
+        model_name = os.path.basename(filename)[:-6]
+        answer = {}
+        with open(filename) as fin:
+            for line in fin:
+                line = json.loads(line)
+                answer[line["question_id"]] = line
+        model_answers[model_name] = answer
+
+    return model_answers
+
+
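+# Editor's note: illustrative sketch, not part of the original commit, of one
+# line in a model_answer/*.jsonl file as consumed above; fields other than
+# question_id and choices/turns are assumptions.
+# {"question_id": 81, "model_id": "vigostral-7b-chat",
+#  "choices": [{"index": 0, "turns": ["turn-1 answer", "turn-2 answer"]}]}
+
+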
+def load_judge_prompts(prompt_file: str):
+    """Load judge prompts.
+
+    The return value is a python dict of type:
+    Dict[judge_name: str -> dict]
+    """
+    prompts = {}
+    with open(prompt_file) as fin:
+        for line in fin:
+            line = json.loads(line)
+            prompts[line["name"]] = line
+    return prompts
+
+
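+# Editor's note: illustrative sketch, not part of the original commit, of one
+# judge-prompt line keyed by "name" above; the schema is an assumption based
+# on how run_judge_single reads judge.prompt_template below.
+# {"name": "single-v1", "type": "single", "output_format": "[[rating]]",
+#  "system_prompt": "...", "prompt_template": "..."}
+
+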
+def run_judge_single(question, answer, judge, ref_answer, multi_turn=False):
+    kwargs = {}
+    model = judge.model_name
+    if ref_answer is not None:
+        kwargs["ref_answer_1"] = ref_answer["choices"][0]["turns"][0]
+        if multi_turn:
+            kwargs["ref_answer_2"] = ref_answer["choices"][0]["turns"][1]
+
+    if multi_turn:
+        user_prompt = judge.prompt_template["prompt_template"].format(
+            question_1=question["turns"][0],
+            question_2=question["turns"][1],
+            answer_1=answer["choices"][0]["turns"][0],
+            answer_2=answer["choices"][0]["turns"][1],
+            **kwargs,
+        )
+    else:
+        user_prompt = judge.prompt_template["prompt_template"].format(
+            question=question["turns"][0],
+            answer=answer["choices"][0]["turns"][0],
+            **kwargs,
+        )
+
+    rating = -1
+
+    system_prompt = judge.prompt_template["system_prompt"]
+    conv = get_conversation_template(model)
+    conv.set_system_message(system_prompt)
+    conv.append_message(conv.roles[0], user_prompt)
+    conv.append_message(conv.roles[1], None)
+
+    if model in ["gpt-3.5-turbo", "gpt-4"]:
+        judgment = chat_compeletion_openai(model, conv, temperature=0, max_tokens=2048)
+    elif model in ANTHROPIC_MODEL_LIST:
+        judgment = chat_compeletion_anthropic(
+            model, conv, temperature=0, max_tokens=1024
+        )
+    elif model in MISTRAL_MODEL_LIST:
+        judgment = chat_compeletion_mistral(
+            model, conv, temperature=0, max_tokens=1024
+        )
+    else:
+        raise ValueError(f"Invalid judge model name: {model}")
+
+    if judge.prompt_template["output_format"] == "[[rating]]":
+        match = re.search(one_score_pattern, judgment)
+        if not match:
+            match = re.search(one_score_pattern_backup, judgment)
+
+        if match:
+            rating = ast.literal_eval(match.groups()[0])
+        else:
+            rating = -1
+    else:
+        raise ValueError(
+            f"invalid output format: {judge.prompt_template['output_format']}"
+        )
+
+    return rating, user_prompt, judgment
+
+
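+# Editor's note: hypothetical usage sketch, not part of the original commit;
+# requires an OPENAI_API_KEY and objects shaped like the sketches above.
+# rating, prompt, judgment = run_judge_single(question, answer, example_judge, ref_answer=None)
+# A rating of -1 means no "[[x]]" score could be parsed from the judgment.
+
+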
+def play_a_match_single(match: MatchSingle, output_file: str):
+    question, model, answer, judge, ref_answer, multi_turn = (
+        match.question,
+        match.model,
+        match.answer,
+        match.judge,
+        match.ref_answer,
+        match.multi_turn,
+    )
+
+    if judge.prompt_template["type"] == "single":
+        score, user_prompt, judgment = run_judge_single(
+            question, answer, judge, ref_answer, multi_turn=multi_turn
+        )
+
+        question_id = question["question_id"]
+        turn = 1 if not multi_turn else 2
+        result = {
+            "question_id": question_id,
+            "model": model,
+            "judge": (judge.model_name, judge.prompt_template["name"]),
+            "user_prompt": user_prompt,
+            "judgment": judgment,
+            "score": score,
+            "turn": turn,
+            "tstamp": time.time(),
+        }
+        print(
+            f"question: {question_id}, turn: {turn}, model: {model}, "
+            f"score: {score}, "
+            f"judge: {(judge.model_name, judge.prompt_template['name'])}"
+        )
+    else:
+        # Judge is a dataclass, so read the type from its prompt template
+        raise ValueError(f"invalid judge type: {judge.prompt_template['type']}")
+
+    if output_file:
+        os.makedirs(os.path.dirname(output_file), exist_ok=True)
+        with open(output_file, "a") as fout:
+            fout.write(json.dumps(result) + "\n")
+
+    return result
+
+
+def run_judge_pair(question, answer_a, answer_b, judge, ref_answer, multi_turn=False):
+    kwargs = {}
+    model = judge.model_name
+    if ref_answer is not None:
+        kwargs["ref_answer_1"] = ref_answer["choices"][0]["turns"][0]
+        if multi_turn:
+            kwargs["ref_answer_2"] = ref_answer["choices"][0]["turns"][1]
+
+    if multi_turn:
+        system_prompt = judge.prompt_template["system_prompt"]
+        user_prompt = judge.prompt_template["prompt_template"].format(
+            question_1=question["turns"][0],
+            question_2=question["turns"][1],
+            answer_a_1=answer_a["choices"][0]["turns"][0],
+            answer_b_1=answer_b["choices"][0]["turns"][0],
+            answer_a_2=answer_a["choices"][0]["turns"][1],
+            answer_b_2=answer_b["choices"][0]["turns"][1],
+            **kwargs,
+        )
+    else:
+        system_prompt = judge.prompt_template["system_prompt"]
+        user_prompt = judge.prompt_template["prompt_template"].format(
+            question=question["turns"][0],
+            answer_a=answer_a["choices"][0]["turns"][0],
+            answer_b=answer_b["choices"][0]["turns"][0],
+            **kwargs,
+        )
+
+    winner = "error"
+
+    conv = get_conversation_template(model)
+    conv.append_message(conv.roles[0], user_prompt)
+    conv.append_message(conv.roles[1], None)
+
+    if model in ["gpt-3.5-turbo", "gpt-4"]:
+        conv.set_system_message(system_prompt)
+        judgment = chat_compeletion_openai(model, conv, temperature=0, max_tokens=2048)
+    elif model in ANTHROPIC_MODEL_LIST:
+        if system_prompt != "You are a helpful assistant.":
+            user_prompt = "[Instruction]\n" + system_prompt + "\n\n" + user_prompt
+            conv.messages[0][1] = user_prompt
+        judgment = chat_compeletion_anthropic(
+            model, conv, temperature=0, max_tokens=1024
+        )
+    else:
+        raise ValueError(f"Invalid judge model name: {model}")
+
+    if judge.prompt_template["output_format"] == "[[A]]":
+        if "[[A]]" in judgment:
+            winner = "A"
+        elif "[[B]]" in judgment:
+            winner = "B"
+        elif "[[C]]" in judgment:
+            winner = "tie"
+        else:
+            winner = "error"
+    elif judge.prompt_template["output_format"] == "[[rating_a,rating_b]]":
+        match = re.search(two_score_pattern, judgment)
+        if not match:
+            match = re.search(two_score_pattern_backup, judgment)
+        if match:
+            scores = [ast.literal_eval(s.strip()) for s in match.groups()]
+            if abs(scores[0] - scores[1]) <= TIE_DELTA:
+                winner = "tie"
+            elif scores[0] > scores[1]:
+                winner = "A"
+            else:
+                winner = "B"
+        else:
+            winner = "error"
+    else:
+        raise ValueError(
+            f"invalid output format: {judge.prompt_template['output_format']}"
+        )
+
+    return winner, user_prompt, judgment
+
+
+def play_a_match_pair(match: MatchPair, output_file: str):
+    question, model_1, model_2, answer_1, answer_2, judge, ref_answer, multi_turn = (
+        match.question,
+        match.model_1,
+        match.model_2,
+        match.answer_1,
+        match.answer_2,
+        match.judge,
+        match.ref_answer,
+        match.multi_turn,
+    )
+
+    if judge.prompt_template["type"] == "pairwise":
+        g1_winner, g1_user_prompt, g1_judgment = run_judge_pair(
+            question, answer_1, answer_2, judge, ref_answer, multi_turn=multi_turn
+        )
+        g2_winner, g2_user_prompt, g2_judgment = run_judge_pair(
+            question, answer_2, answer_1, judge, ref_answer, multi_turn=multi_turn
+        )
+
+        g1_map = {"A": "model_1", "B": "model_2"}
+        g2_map = {"A": "model_2", "B": "model_1"}
+        g1_winner = g1_map.get(g1_winner, g1_winner)
+        g2_winner = g2_map.get(g2_winner, g2_winner)
+        question_id = question["question_id"]
+        turn = 1 if not multi_turn else 2
+
+        result = {
+            "question_id": question_id,
+            "model_1": model_1,
+            "model_2": model_2,
+            "g1_winner": g1_winner,
+            "g2_winner": g2_winner,
+            "judge": (judge.model_name, judge.prompt_template["name"]),
+            "g1_user_prompt": g1_user_prompt,
+            "g1_judgment": g1_judgment,
+            "g2_user_prompt": g2_user_prompt,
+            "g2_judgment": g2_judgment,
+            "turn": turn,
+            "tstamp": time.time(),
+        }
+
+        print(
+            f"question: {question_id}, turn: {turn}, model_1: {model_1}, model_2: {model_2}, "
+            f"g1_winner: {g1_winner}, g2_winner: {g2_winner}, "
+            f"judge: {(judge.model_name, judge.prompt_template['name'])}"
+        )
+    elif judge.prompt_template["type"] == "single":
+        # Pass the reference answer through; run_judge_single requires it
+        m1_score, m1_user_prompt, m1_judgment = run_judge_single(
+            question, answer_1, judge, ref_answer
+        )
+        m2_score, m2_user_prompt, m2_judgment = run_judge_single(
+            question, answer_2, judge, ref_answer
+        )
+
+        if abs(m1_score - m2_score) <= TIE_DELTA:
+            winner = "tie"
+        elif m1_score > m2_score:
+            winner = "model_1"
+        else:
+            winner = "model_2"
+
+        question_id = question["question_id"]
+        result = {
+            "question_id": question_id,
+            "model_1": model_1,
+            "model_2": model_2,
+            "g1_winner": winner,
+            "g2_winner": winner,
+            "judge": (judge.model_name, judge.prompt_template["name"]),
+            "g1_user_prompt": m1_user_prompt,
+            "g1_judgment": m1_judgment,
+            "g2_user_prompt": m2_user_prompt,
+            "g2_judgment": m2_judgment,
+            "m1_score": m1_score,
+            "m2_score": m2_score,
+            "tstamp": time.time(),
+        }
+        print(
+            f"question: {question_id}, model_1: {model_1}, model_2: {model_2}, "
+            f"winner: {winner}, m1_score: {m1_score}, m2_score: {m2_score}, "
+            f"judge: {(judge.model_name, judge.prompt_template['name'])}"
+        )
+    else:
+        raise ValueError(f"invalid judge type: {judge.prompt_template['type']}")
+
+    if output_file:
+        os.makedirs(os.path.dirname(output_file), exist_ok=True)
+        with open(output_file, "a") as fout:
+            fout.write(json.dumps(result) + "\n")
+
+    return result
+
+
+def chat_compeletion_openai(model, conv, temperature, max_tokens, api_dict=None):
+    if api_dict is not None:
+        openai.api_base = api_dict["api_base"]
+        openai.api_key = api_dict["api_key"]
+    output = API_ERROR_OUTPUT
+    for _ in range(API_MAX_RETRY):
+        try:
+            messages = conv.to_openai_api_messages()
+            response = openai.ChatCompletion.create(
+                model=model,
+                messages=messages,
+                n=1,
+                temperature=temperature,
+                max_tokens=max_tokens,
+            )
+            output = response["choices"][0]["message"]["content"]
+            break
+        except openai.error.OpenAIError as e:
+            print(type(e), e)
+            time.sleep(API_RETRY_SLEEP)
+
+    return output
+
+
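+# Editor's note (not part of the original commit): openai.ChatCompletion and
+# openai.error only exist in the pre-1.0 openai SDK, so this module assumes a
+# pinned `openai<1.0`; no requirements file is included in this commit.
+
+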
+def chat_compeletion_openai_azure(model, conv, temperature, max_tokens, api_dict=None):
+    openai.api_type = "azure"
+    openai.api_version = "2023-07-01-preview"
+    if api_dict is not None:
+        openai.api_base = api_dict["api_base"]
+        openai.api_key = api_dict["api_key"]
+    else:
+        openai.api_base = os.environ["AZURE_OPENAI_ENDPOINT"]
+        openai.api_key = os.environ["AZURE_OPENAI_KEY"]
+
+    if "azure-" in model:
+        model = model[6:]
+
+    output = API_ERROR_OUTPUT
+    for _ in range(API_MAX_RETRY):
+        try:
+            messages = conv.to_openai_api_messages()
+            response = openai.ChatCompletion.create(
+                engine=model,
+                messages=messages,
+                n=1,
+                temperature=temperature,
+                max_tokens=max_tokens,
+            )
+            output = response["choices"][0]["message"]["content"]
+            break
+        except openai.error.InvalidRequestError as e:
+            # Catch the InvalidRequestError subclass before the generic
+            # OpenAIError; in the original order this branch was unreachable
+            print(type(e), e)
+            break
+        except openai.error.OpenAIError as e:
+            print(type(e), e)
+            time.sleep(API_RETRY_SLEEP)
+        except KeyError:
+            print(response)
+            break
+
+    return output
+
+
+def chat_compeletion_anthropic(model, conv, temperature, max_tokens):
+    output = API_ERROR_OUTPUT
+    for _ in range(API_MAX_RETRY):
+        try:
+            c = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
+            prompt = conv.get_prompt()
+            response = c.completions.create(
+                model=model,
+                prompt=prompt,
+                stop_sequences=[anthropic.HUMAN_PROMPT],
+                max_tokens_to_sample=max_tokens,
+                temperature=temperature,
+            )
+            output = response.completion
+            break
+        except anthropic.APIError as e:
+            print(type(e), e)
+            time.sleep(API_RETRY_SLEEP)
+    return output.strip()
+
+
+def chat_compeletion_palm(chat_state, model, conv, temperature, max_tokens):
+    from fastchat.serve.api_provider import init_palm_chat
+
+    assert model == "palm-2-chat-bison-001"
+
+    if chat_state is None:
+        chat_state = init_palm_chat("chat-bison@001")
+
+    parameters = {
+        "temperature": temperature,
+        "top_p": 0.8,
+        "top_k": 40,
+        "max_output_tokens": max_tokens,
+    }
+    output = API_ERROR_OUTPUT
+    for _ in range(API_MAX_RETRY):
+        try:
+            response = chat_state.send_message(conv.messages[-2][1], **parameters)
+            output = response.text
+            break
+        except Exception as e:
+            print(type(e), e)
+            time.sleep(API_RETRY_SLEEP)
+    return chat_state, output
+
+
+def chat_compeletion_mistral(model, conv, temperature, max_tokens):
+    output = API_ERROR_OUTPUT
+    for _ in range(API_MAX_RETRY):
+        try:
+            c = MistralClient(api_key=os.environ["MISTRAL_API_KEY"])
+            messages = conv.to_mistralai_api_messages()
+            response = c.chat(
+                model=model,
+                messages=messages,
+                temperature=temperature,
+                max_tokens=max_tokens,
+            )
+            output = response.choices[0].message.content
+            break
+        except Exception as e:
+            print(type(e), e)
+            time.sleep(API_RETRY_SLEEP)
+    return output
+
+
+def normalize_game_key_single(gamekey, result):
+    """Make the model names sorted in a game key."""
+    qid, model_1, model_2 = gamekey
+    if model_1 < model_2:
+        return gamekey, result
+    else:
+        new_gamekey = (qid, model_2, model_1)
+        new_result = {
+            "winners": tuple(reverse_model_map.get(x, x) for x in result["winners"]),
+            "g1_judgment": result["g2_judgment"],
+            "g2_judgment": result["g1_judgment"],
+        }
+        return new_gamekey, new_result
+
+
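+# Editor's note: illustrative example, not part of the original commit:
+# normalize_game_key_single(
+#     (81, "gpt-4", "alpaca-13b"),
+#     {"winners": ("model_1",), "g1_judgment": "...", "g2_judgment": "..."},
+# ) returns ((81, "alpaca-13b", "gpt-4"), {"winners": ("model_2",), ...}),
+# giving every (question, model pair) a single canonical, sorted key.
+
+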
+def normalize_game_key_dict(judgment_dict):
+    """Make the model names sorted in the game keys."""
+    ret = {}
+    for key, value in judgment_dict.items():
+        new_key, new_value = normalize_game_key_single(key, value)
+        ret[new_key] = new_value
+    return ret
+
+
+def load_pairwise_model_judgments(filename: str):
+    """Load model judgments.
+
+    The return value is a dict of type:
+    Dict[judge: Tuple -> Dict[game_key: tuple -> game_result: dict]]
+    """
+    judge_dict = {}
+
+    for line in open(filename):
+        obj = json.loads(line)
+        judge = tuple(obj["judge"])
+        qid, model_1, model_2 = obj["question_id"], obj["model_1"], obj["model_2"]
+
+        if judge not in judge_dict:
+            judge_dict[judge] = {}
+
+        if "winner" in obj:
+            winner = obj["winner"]
+        elif "g1_winner" in obj and "g2_winner" in obj:
+            g1_winner, g2_winner = obj["g1_winner"], obj["g2_winner"]
+            if g1_winner == g2_winner:
+                winner = g1_winner
+            else:
+                winner = "inconsistent"
+        else:
+            raise ValueError(f"Invalid keys: {list(obj.keys())}")
+
+        gamekey = (qid, model_1, model_2)
+        winners = (winner,)
+
+        judge_dict[judge][gamekey] = {
+            "winners": winners,
+            "g1_judgment": obj["g1_judgment"],
+            "g2_judgment": obj["g2_judgment"],
+        }
+
+    # Make the model names sorted in the game keys
+    normalized = {}
+    for judge, value in judge_dict.items():
+        normalized[judge] = normalize_game_key_dict(value)
+    return normalized
+
+
+def load_single_model_judgments(filename: str):
+    """Load model judgments.
+
+    The return value is a dict of type:
+    Dict[judge: Tuple -> Dict[game_key: tuple -> game_result: dict]]
+    """
+    judge_dict = {}
+
+    for line in open(filename):
+        obj = json.loads(line)
+        judge = tuple(obj["judge"])
+        qid, model = obj["question_id"], obj["model"]
+
+        if judge not in judge_dict:
+            judge_dict[judge] = {}
+
+        gamekey = (qid, model)
+
+        judge_dict[judge][gamekey] = {
+            "score": obj["score"],
+            "judgment": obj["judgment"],
+        }
+    return judge_dict
+
+
+def resolve_pairwise_judgment_dict(
+    question, model_judgments_normal, model_judgments_math, multi_turn=False
+):
+    """Return the correct pairwise judgment dict for a question."""
+    if multi_turn:
+        if question["category"] in NEED_REF_CATS:
+            return model_judgments_math[("gpt-4", "pair-math-v1-multi-turn")]
+        return model_judgments_normal[("gpt-4", "pair-v2-multi-turn")]
+
+    if question["category"] in NEED_REF_CATS:
+        return model_judgments_math[("gpt-4", "pair-math-v1")]
+    else:
+        return model_judgments_normal[("gpt-4", "pair-v2")]
+
+
+def resolve_single_judgment_dict(
+    question, model_judgments_normal, model_judgments_math, multi_turn=False
+):
+    """Return the correct single-answer-grading judgment dict for a question."""
+    if multi_turn:
+        if question["category"] in NEED_REF_CATS:
+            return model_judgments_math[("gpt-4", "single-math-v1-multi-turn")]
+        return model_judgments_normal[("gpt-4", "single-v1-multi-turn")]
+
+    if question["category"] in NEED_REF_CATS:
+        return model_judgments_math[("gpt-4", "single-math-v1")]
+    else:
+        return model_judgments_normal[("gpt-4", "single-v1")]
+
+
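+# Editor's note: illustrative sketch, not part of the original commit, tying
+# the loader and resolver together (the judgment file path is an assumption):
+# judgments = load_single_model_judgments("data/mt_bench_french/model_judgment/gpt-4_single.jsonl")
+# judgment_dict = resolve_single_judgment_dict(question, judgments, judgments)
+# judgment_dict[(question["question_id"], "vigostral-7b-chat")]  # {"score": ..., "judgment": ...}
+
+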
+def get_pairwise_judge_explanation(gamekey, judgment_dict):
+    """Get model judge explanation."""
+    try:
+        qid, model_1, model_2 = gamekey
+        if model_1 < model_2:
+            res = judgment_dict[gamekey]
+            g1_judgment, g2_judgment = res["g1_judgment"], res["g2_judgment"]
+        else:
+            new_gamekey = (qid, model_2, model_1)
+            res = judgment_dict[new_gamekey]
+            g1_judgment, g2_judgment = res["g2_judgment"], res["g1_judgment"]
+
+        return (
+            f"**Game 1**. **A**: {model_1}, **B**: {model_2}\n\n"
+            f"**Judgment**: {g1_judgment}"
+            + "\n\n`--------------------------`\n\n"
+            + f"**Game 2**. **A**: {model_2}, **B**: {model_1}\n\n"
+            f"**Judgment**: {g2_judgment}"
+        )
+    except KeyError:
+        return "N/A"
+
+
+def get_single_judge_explanation(gamekey, judgment_dict):
+    """Get model judge explanation."""
+    try:
+        qid, model = gamekey
+
+        res = judgment_dict[gamekey]
+
+        g1_judgment = res["judgment"]
+        g1_score = res["score"]
+
+        return (
+            f"**Game 1**. **A**: {model}, **Score**: {g1_score}\n\n"
+            f"**Judgment**: {g1_judgment}"
+        )
+    except KeyError:
+        return "N/A"
+
+
+def check_data(questions, model_answers, ref_answers, models, judges):
+    # check model answers
+    for m in models:
+        assert m in model_answers, f"Missing model answer for {m}"
+        m_answer = model_answers[m]
+        for q in questions:
+            assert (
+                q["question_id"] in m_answer
+            ), f"Missing model {m}'s answer to Question {q['question_id']}"
+    # check ref answers
+    for jg in judges.values():
+        if not jg.ref_based:
+            continue
+        for q in questions:
+            if q["category"] not in NEED_REF_CATS:
+                continue
+            assert (
+                q["question_id"] in ref_answers[jg.model_name]
+            ), f"Missing reference answer to Question {q['question_id']} for judge {jg.model_name}"
+
+
+def get_model_list(answer_dir):
+    file_paths = glob.glob(f"{answer_dir}/*.jsonl")
+    file_names = [os.path.splitext(os.path.basename(f))[0] for f in file_paths]
+    return file_names
qa_browser.py ADDED
@@ -0,0 +1,448 @@
+"""
+Usage:
+python3 qa_browser.py --share
+"""
+
+import argparse
+import os
+import re
+from collections import defaultdict
+
+import gradio as gr
+from common import (
+    get_pairwise_judge_explanation,
+    get_single_judge_explanation,
+    load_model_answers,
+    load_pairwise_model_judgments,
+    load_questions,
+    load_single_model_judgments,
+    resolve_pairwise_judgment_dict,
+    resolve_single_judgment_dict,
+)
+from huggingface_hub import snapshot_download
+
+questions = []
+model_answers = {}
+
+model_judgments_normal_single = {}
+model_judgments_math_single = {}
+
+model_judgments_normal_pairwise = {}
+model_judgments_math_pairwise = {}
+
+question_selector_map = {}
+category_selector_map = defaultdict(list)
+
+
+def display_question(category_selector, request: gr.Request):
+    choices = category_selector_map[category_selector]
+    # Gradio 4.x: return an updated component directly; gr.Dropdown.update was removed
+    return gr.Dropdown(
+        value=choices[0],
+        choices=choices,
+    )
+
+
+def display_pairwise_answer(
+    question_selector, model_selector1, model_selector2, request: gr.Request
+):
+    q = question_selector_map[question_selector]
+    qid = q["question_id"]
+
+    ans1 = model_answers[model_selector1][qid]
+    ans2 = model_answers[model_selector2][qid]
+
+    chat_mds = pairwise_to_gradio_chat_mds(q, ans1, ans2)
+    gamekey = (qid, model_selector1, model_selector2)
+
+    judgment_dict = resolve_pairwise_judgment_dict(
+        q,
+        model_judgments_normal_pairwise,
+        model_judgments_math_pairwise,
+        multi_turn=False,
+    )
+
+    explanation = (
+        "##### Model Judgment (first turn)\n"
+        + get_pairwise_judge_explanation(gamekey, judgment_dict)
+    )
+
+    judgment_dict_turn2 = resolve_pairwise_judgment_dict(
+        q,
+        model_judgments_normal_pairwise,
+        model_judgments_math_pairwise,
+        multi_turn=True,
+    )
+
+    explanation_turn2 = (
+        "##### Model Judgment (second turn)\n"
+        + get_pairwise_judge_explanation(gamekey, judgment_dict_turn2)
+    )
+
+    return chat_mds + [explanation] + [explanation_turn2]
+
+
+def display_single_answer(question_selector, model_selector1, request: gr.Request):
+    q = question_selector_map[question_selector]
+    qid = q["question_id"]
+
+    ans1 = model_answers[model_selector1][qid]
+
+    chat_mds = single_to_gradio_chat_mds(q, ans1)
+    gamekey = (qid, model_selector1)
+
+    judgment_dict = resolve_single_judgment_dict(
+        q, model_judgments_normal_single, model_judgments_math_single, multi_turn=False
+    )
+
+    explanation = "##### Model Judgment (first turn)\n" + get_single_judge_explanation(
+        gamekey, judgment_dict
+    )
+
+    judgment_dict_turn2 = resolve_single_judgment_dict(
+        q, model_judgments_normal_single, model_judgments_math_single, multi_turn=True
+    )
+
+    explanation_turn2 = (
+        "##### Model Judgment (second turn)\n"
+        + get_single_judge_explanation(gamekey, judgment_dict_turn2)
+    )
+
+    return chat_mds + [explanation] + [explanation_turn2]
+
+
+newline_pattern1 = re.compile(r"\n\n(\d+\. )")
+newline_pattern2 = re.compile(r"\n\n(- )")
+
+
+def post_process_answer(x):
+    """Fix Markdown rendering problems."""
+    x = x.replace("\u2022", "- ")
+    x = re.sub(newline_pattern1, r"\n\g<1>", x)
+    x = re.sub(newline_pattern2, r"\n\g<1>", x)
+    return x
+
+
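+# Editor's note: illustrative example, not part of the original commit:
+# post_process_answer("Steps:\n\n1. Do X\n\n2. Do Y")
+# returns "Steps:\n1. Do X\n2. Do Y", tightening the spacing so Markdown
+# renders the items as one list.
+
+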
+def pairwise_to_gradio_chat_mds(question, ans_a, ans_b, turn=None):
+    end = len(question["turns"]) if turn is None else turn + 1
+
+    mds = ["", "", "", "", "", "", ""]
+    for i in range(end):
+        base = i * 3
+        if i == 0:
+            mds[base + 0] = "##### User\n" + question["turns"][i]
+        else:
+            mds[base + 0] = "##### User's follow-up question \n" + question["turns"][i]
+        mds[base + 1] = "##### Assistant A\n" + post_process_answer(
+            ans_a["choices"][0]["turns"][i].strip()
+        )
+        mds[base + 2] = "##### Assistant B\n" + post_process_answer(
+            ans_b["choices"][0]["turns"][i].strip()
+        )
+
+    ref = question.get("reference", ["", ""])
+
+    if turn is None:
+        if ref[0] != "" or ref[1] != "":
+            mds[6] = f"##### Reference Solution\nQ1. {ref[0]}\nQ2. {ref[1]}"
+    else:
+        x = ref[turn] if turn < len(ref) else ""
+        if x:
+            mds[6] = f"##### Reference Solution\n{ref[turn]}"
+        else:
+            mds[6] = ""
+    return mds
+
+
+def single_to_gradio_chat_mds(question, ans, turn=None):
+    end = len(question["turns"]) if turn is None else turn + 1
+
+    mds = ["", "", "", "", ""]
+    for i in range(end):
+        base = i * 2
+        if i == 0:
+            mds[base + 0] = "##### User\n" + question["turns"][i]
+        else:
+            mds[base + 0] = "##### User's follow-up question \n" + question["turns"][i]
+        mds[base + 1] = "##### Assistant A\n" + post_process_answer(
+            ans["choices"][0]["turns"][i].strip()
+        )
+
+    # tmp fix: fall back to empty references when the field is null
+    ref = question.get("reference", ["", ""]) or ["", ""]
+
+    if turn is None:
+        if ref[0] != "" or ref[1] != "":
+            mds[4] = f"##### Reference Solution\n***Q1***. {ref[0]}\n\n\n***Q2***. {ref[1]}"
+    else:
+        x = ref[turn] if turn < len(ref) else ""
+        if x:
+            mds[4] = f"##### Reference Solution\n{ref[turn]}"
+        else:
+            mds[4] = ""
+    return mds
+
+
+def build_question_selector_map():
+    global question_selector_map, category_selector_map
+
+    # Build question selector map
+    for q in questions:
+        preview = f"{q['question_id']}: " + q["turns"][0][:128] + "..."
+        question_selector_map[preview] = q
+        category_selector_map[q["category"]].append(preview)
+
+
+def sort_models(models):
+    # Sort keys that pin preferred models to the top of the dropdowns
+    priority = {
+        "vigostral-7b-chat": "aaaa",
+        "gpt-4-0314": "aaab",
+        "gpt-3.5-turbo-0613": "aaac",
+        "mixtral-8x7b-instruct-v0.1": "aaad",
+        "mistral-medium": "aaae",
+    }
+
+    models = list(models)
+    models.sort(key=lambda x: priority.get(x, x))
+    return models
+
+
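+# Editor's note: illustrative example, not part of the original commit:
+# sort_models(["zephyr-7b-beta", "gpt-4-0314", "vigostral-7b-chat"])
+# returns ["vigostral-7b-chat", "gpt-4-0314", "zephyr-7b-beta"].
+
+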
+def build_pairwise_browser_tab():
+    global question_selector_map, category_selector_map
+
+    models = sort_models(list(model_answers.keys()))
+    num_sides = 2
+    num_turns = 2
+    side_names = ["A", "B"]
+
+    question_selector_choices = list(question_selector_map.keys())
+    category_selector_choices = list(category_selector_map.keys())
+
+    # Selectors
+    with gr.Row():
+        with gr.Column(scale=1, min_width=200):
+            category_selector = gr.Dropdown(
+                choices=category_selector_choices, label="Category", container=False
+            )
+        with gr.Column(scale=100):
+            question_selector = gr.Dropdown(
+                choices=question_selector_choices, label="Question", container=False
+            )
+
+    model_selectors = [None] * num_sides
+    with gr.Row():
+        for i in range(num_sides):
+            with gr.Column():
+                if i == 0:
+                    value = models[0]
+                else:
+                    value = "gpt-3.5-turbo"
+                model_selectors[i] = gr.Dropdown(
+                    choices=models,
+                    value=value,
+                    label=f"Model {side_names[i]}",
+                    container=False,
+                )
+
+    # Conversation
+    chat_mds = []
+    for i in range(num_turns):
+        chat_mds.append(gr.Markdown(elem_id=f"user_question_{i+1}"))
+        with gr.Row():
+            for j in range(num_sides):
+                with gr.Column(scale=100):
+                    chat_mds.append(gr.Markdown())
+
+                if j == 0:
+                    with gr.Column(scale=1, min_width=8):
+                        gr.Markdown()
+    reference = gr.Markdown(elem_id="reference")
+    chat_mds.append(reference)
+
+    model_explanation = gr.Markdown(elem_id="model_explanation")
+    model_explanation2 = gr.Markdown(elem_id="model_explanation")
+
+    # Callbacks
+    category_selector.change(display_question, [category_selector], [question_selector])
+    question_selector.change(
+        display_pairwise_answer,
+        [question_selector] + model_selectors,
+        chat_mds + [model_explanation] + [model_explanation2],
+    )
+
+    for i in range(num_sides):
+        model_selectors[i].change(
+            display_pairwise_answer,
+            [question_selector] + model_selectors,
+            chat_mds + [model_explanation] + [model_explanation2],
+        )
+
+    return (category_selector,)
+
+
+def build_single_answer_browser_tab():
+    global question_selector_map, category_selector_map
+
+    models = sort_models(list(model_answers.keys()))
+    num_sides = 1
+    num_turns = 2
+    side_names = ["A"]
+
+    question_selector_choices = list(question_selector_map.keys())
+    category_selector_choices = list(category_selector_map.keys())
+
+    # Selectors
+    with gr.Row():
+        with gr.Column(scale=1, min_width=200):
+            category_selector = gr.Dropdown(
+                choices=category_selector_choices, label="Category", container=False
+            )
+        with gr.Column(scale=100):
+            question_selector = gr.Dropdown(
+                choices=question_selector_choices, label="Question", container=False
+            )
+
+    model_selectors = [None] * num_sides
+    with gr.Row():
+        for i in range(num_sides):
+            with gr.Column():
+                model_selectors[i] = gr.Dropdown(
+                    choices=models,
+                    value=models[i] if len(models) > i else "",
+                    label=f"Model {side_names[i]}",
+                    container=False,
+                )
+
+    # Conversation
+    chat_mds = []
+    for i in range(num_turns):
+        chat_mds.append(gr.Markdown(elem_id=f"user_question_{i+1}"))
+        with gr.Row():
+            for j in range(num_sides):
+                with gr.Column(scale=100):
+                    chat_mds.append(gr.Markdown())
+
+                if j == 0:
+                    with gr.Column(scale=1, min_width=8):
+                        gr.Markdown()
+
+    reference = gr.Markdown(elem_id="reference")
+    chat_mds.append(reference)
+
+    model_explanation = gr.Markdown(elem_id="model_explanation")
+    model_explanation2 = gr.Markdown(elem_id="model_explanation")
+
+    # Callbacks
+    category_selector.change(display_question, [category_selector], [question_selector])
+    question_selector.change(
+        display_single_answer,
+        [question_selector] + model_selectors,
+        chat_mds + [model_explanation] + [model_explanation2],
+    )
+
+    for i in range(num_sides):
+        model_selectors[i].change(
+            display_single_answer,
+            [question_selector] + model_selectors,
+            chat_mds + [model_explanation] + [model_explanation2],
+        )
+
+    return (category_selector,)
+
+
+block_css = """
+#user_question_1 {
+    background-color: #DEEBF7;
+}
+#user_question_2 {
+    background-color: #E2F0D9;
+}
+#reference {
+    background-color: #FFF2CC;
+}
+#model_explanation {
+    background-color: #FBE5D6;
+}
+"""
+
+
+def load_demo():
+    # Gradio 4.x: return the component itself; gr.Dropdown.update was removed
+    dropdown_update = gr.Dropdown(value=list(category_selector_map.keys())[0])
+    # return dropdown_update, dropdown_update
+    return dropdown_update
+
+
+def build_demo():
+    build_question_selector_map()
+
+    with gr.Blocks(
+        title="MT-Bench Browser",
+        theme=gr.themes.Base(text_size=gr.themes.sizes.text_lg),
+        css=block_css,
+    ) as demo:
+        gr.Markdown(
+            """
+# MT-Bench-French Browser
+This demo provides answers and judgments for specific LLMs on the [MT-Bench-French](https://huggingface.co/datasets/bofenghuang/mt-bench-french) dataset, enabling a quick assessment of their capabilities in the French language.
+
+*The code for generating these answers and judgments can be found at [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge).*
+
+*The code for this demo is adapted from [mt-bench](https://huggingface.co/spaces/lmsys/mt-bench).*
+"""
+        )
+        with gr.Tab("Single Answer Grading"):
+            (category_selector,) = build_single_answer_browser_tab()
+        # with gr.Tab("Pairwise Comparison"):
+        #     (category_selector2,) = build_pairwise_browser_tab()
+        # demo.load(load_demo, [], [category_selector, category_selector2])
+        demo.load(load_demo, [], [category_selector])
+
+    return demo
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="0.0.0.0")
+    parser.add_argument("--port", type=int)
+    parser.add_argument("--share", action="store_true")
+    parser.add_argument("--bench-name", type=str, default="mt_bench_french")
+    parser.add_argument("--bench-dataset-name", type=str, default="bofenghuang/mt-bench-french")
+    args = parser.parse_args()
+    print(args)
+
+    if not os.path.exists(f"data/{args.bench_name}"):
+        snapshot_download(repo_id=args.bench_dataset_name, local_dir=f"data/{args.bench_name}", repo_type="dataset")
+        print(f"Downloaded benchmark dataset {args.bench_dataset_name} to data/{args.bench_name}")
+
+    question_file = f"data/{args.bench_name}/question.jsonl"
+    answer_dir = f"data/{args.bench_name}/model_answer"
+    # pairwise_model_judgment_file = (
+    #     f"data/{args.bench_name}/model_judgment/gpt-4_pair.jsonl"
+    # )
+    single_model_judgment_file = (
+        f"data/{args.bench_name}/model_judgment/gpt-4_single.jsonl"
+    )
+
+    # Load questions
+    questions = load_questions(question_file, None, None)
+
+    # Load answers
+    model_answers = load_model_answers(answer_dir)
+
+    # Load model judgments
+    model_judgments_normal_single = (
+        model_judgments_math_single
+    ) = load_single_model_judgments(single_model_judgment_file)
+    # model_judgments_normal_pairwise = (
+    #     model_judgments_math_pairwise
+    # ) = load_pairwise_model_judgments(pairwise_model_judgment_file)
+
+    demo = build_demo()
+    # Gradio 4.x: queue() takes default_concurrency_limit instead of concurrency_count
+    demo.queue(default_concurrency_limit=10, status_update_rate=10, api_open=False).launch(
+        server_name=args.host, server_port=args.port, share=args.share, max_threads=200
+    )