justinxzhao committed on
Commit
577870e
·
1 Parent(s): 3703473

Some refactoring, judging responses for direct assessment.

Files changed (6)
  1. app.py +227 -120
  2. app2.py +52 -0
  3. constants.py +119 -0
  4. judging.py +28 -0
  5. judging_dataclasses.py +28 -0
  6. prompts.py +150 -0
app.py CHANGED
@@ -7,6 +7,15 @@ import anthropic
7
  from together import Together
8
  import google.generativeai as genai
9
  import time
10
 
11
  dotenv.load_dotenv()
12
 
@@ -31,31 +40,6 @@ openai_client = OpenAI(
31
  # anthropic_client = anthropic.Client(api_key=ANTHROPIC_API_KEY)
32
  anthropic_client = anthropic.Anthropic()
33
 
34
- LLM_COUNCIL_MEMBERS = {
35
- "Smalls": [
36
- "openai://gpt-4o-mini",
37
- "together://meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
38
- "vertex://gemini-1.5-flash-001",
39
- "anthropic://claude-3-haiku-20240307",
40
- ],
41
- "Flagships": [
42
- "openai://gpt-4",
43
- "together://meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
44
- "vertex://gemini-1.5-pro-001",
45
- "anthropic://claude-3-5-sonnet",
46
- ],
47
- }
48
-
49
- PROVIDER_TO_AVATAR_MAP = {
50
- "openai://gpt-4o-mini": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIwLjk5ZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjU2IDI2MCI+PHBhdGggZD0iTTIzOS4xODQgMTA2LjIwM2E2NC43MiA2NC43MiAwIDAgMC01LjU3Ni01My4xMDNDMjE5LjQ1MiAyOC40NTkgMTkxIDE1Ljc4NCAxNjMuMjEzIDIxLjc0QTY1LjU4NiA2NS41ODYgMCAwIDAgNTIuMDk2IDQ1LjIyYTY0LjcyIDY0LjcyIDAgMCAwLTQzLjIzIDMxLjM2Yy0xNC4zMSAyNC42MDItMTEuMDYxIDU1LjYzNCA4LjAzMyA3Ni43NGE2NC42NyA2NC42NyAwIDAgMCA1LjUyNSA1My4xMDJjMTQuMTc0IDI0LjY1IDQyLjY0NCAzNy4zMjQgNzAuNDQ2IDMxLjM2YTY0LjcyIDY0LjcyIDAgMCAwIDQ4Ljc1NCAyMS43NDRjMjguNDgxLjAyNSA1My43MTQtMTguMzYxIDYyLjQxNC00NS40ODFhNjQuNzcgNjQuNzcgMCAwIDAgNDMuMjI5LTMxLjM2YzE0LjEzNy0yNC41NTggMTAuODc1LTU1LjQyMy04LjA4My03Ni40ODNtLTk3LjU2IDEzNi4zMzhhNDguNCA0OC40IDAgMCAxLTMxLjEwNS0xMS4yNTVsMS41MzUtLjg3bDUxLjY3LTI5LjgyNWE4LjYgOC42IDAgMCAwIDQuMjQ3LTcuMzY3di03Mi44NWwyMS44NDUgMTIuNjM2Yy4yMTguMTExLjM3LjMyLjQwOS41NjN2NjAuMzY3Yy0uMDU2IDI2LjgxOC0yMS43ODMgNDguNTQ1LTQ4LjYwMSA0OC42MDFNMzcuMTU4IDE5Ny45M2E0OC4zNSA0OC4zNSAwIDAgMS01Ljc4MS0zMi41ODlsMS41MzQuOTIxbDUxLjcyMiAyOS44MjZhOC4zNCA4LjM0IDAgMCAwIDguNDQxIDBsNjMuMTgxLTM2LjQyNXYyNS4yMjFhLjg3Ljg3IDAgMCAxLS4zNTguNjY1bC01Mi4zMzUgMzAuMTg0Yy0yMy4yNTcgMTMuMzk4LTUyLjk3IDUuNDMxLTY2LjQwNC0xNy44MDNNMjMuNTQ5IDg1LjM4YTQ4LjUgNDguNSAwIDAgMSAyNS41OC0yMS4zMzN2NjEuMzlhOC4yOSA4LjI5IDAgMCAwIDQuMTk1IDcuMzE2bDYyLjg3NCAzNi4yNzJsLTIxLjg0NSAxMi42MzZhLjgyLjgyIDAgMCAxLS43NjcgMEw0MS4zNTMgMTUxLjUzYy0yMy4yMTEtMTMuNDU0LTMxLjE3MS00My4xNDQtMTcuODA0LTY2LjQwNXptMTc5LjQ2NiA0MS42OTVsLTYzLjA4LTM2LjYzTDE2MS43MyA3Ny44NmEuODIuODIgMCAwIDEgLjc2OCAwbDUyLjIzMyAzMC4xODRhNDguNiA0OC42IDAgMCAxLTcuMzE2IDg3LjYzNXYtNjEuMzkxYTguNTQgOC41NCAwIDAgMC00LjQtNy4yMTNtMjEuNzQyLTMyLjY5bC0xLjUzNS0uOTIybC01MS42MTktMzAuMDgxYTguMzkgOC4zOSAwIDAgMC04LjQ5MiAwTDk5Ljk4IDk5LjgwOFY3NC41ODdhLjcyLjcyIDAgMCAxIC4zMDctLjY2NWw1Mi4yMzMtMzAuMTMzYTQ4LjY1MiA0OC42NTIgMCAwIDEgNzIuMjM2IDUwLjM5MXpNODguMDYxIDEzOS4wOTdsLTIxLjg0NS0xMi41ODVhLjg3Ljg3IDAgMCAxLS40MS0uNjE0VjY1LjY4NWE0OC42NTIgNDguNjUyIDAgMCAxIDc5Ljc1Ny0zNy4zNDZsLTEuNTM1Ljg3bC01MS42NyAyOS44MjVhOC42IDguNiAwIDAgMC00LjI0NiA3LjM2N3ptMTEuODY4LTI1LjU4TDEyOC4wNjcgOTcuM2wyOC4xODggMTYuMjE4djMyLjQzNGwtMjguMDg2IDE2LjIxOGwtMjguMTg4LTE2LjIxOHoiLz48L3N2Zz4=",
51
- "anthropic://claude-3-5-sonnet": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjQgMjQiPjxwYXRoIGZpbGw9ImN1cnJlbnRDb2xvciIgZD0iTTE3LjMwNCAzLjU0MWgtMy42NzJsNi42OTYgMTYuOTE4SDI0Wm0tMTAuNjA4IDBMMCAyMC40NTloMy43NDRsMS4zNy0zLjU1M2g3LjAwNWwxLjM2OSAzLjU1M2gzLjc0NEwxMC41MzYgMy41NDFabS0uMzcxIDEwLjIyM0w4LjYxNiA3LjgybDIuMjkxIDUuOTQ1WiIvPjwvc3ZnPg==",
52
- "vertex://gemini-1.5-flash-001": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjQgMjQiPjxwYXRoIGZpbGw9IiM0MjY4ZmYiIGQ9Ik0yNCAxMi4wMjRjLTYuNDM3LjM4OC0xMS41OSA1LjUzOS0xMS45NzcgMTEuOTc2aC0uMDQ3QzExLjU4OCAxNy41NjMgNi40MzYgMTIuNDEyIDAgMTIuMDI0di0uMDQ3QzYuNDM3IDExLjU4OCAxMS41ODggNi40MzcgMTEuOTc2IDBoLjA0N2MuMzg4IDYuNDM3IDUuNTQgMTEuNTg4IDExLjk3NyAxMS45Nzd6Ii8+PC9zdmc+",
53
- "together://meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMzIgMzIiPjxnIGZpbGw9Im5vbmUiPjxwYXRoIGZpbGw9IiNiNGFjYmMiIGQ9Ik0yMC44NzEgMjQuNzh2LTYuMDZoMy4wMXY1Ljc3YzAgLjM0LS4xMi42Mi0uMzEuOTRsLTIuNDEgNC4yYy0uMTguMjMtLjQ1LjM3LS43NS4zN2gtMS4wM2MtLjIzIDAtLjM4LS4yNC0uMjgtLjQ1bDEuNjctNC4zNWMuMDctLjEzLjEtLjI3LjEtLjQyTTE3LjA5MSAzMGMuMiAwIC4yNi0uMjEuMjItLjM4bC0yLjMyLTguNjFoLTIuOTlsLjg1IDMuNTVjLjE5LjcxLjY2IDEuMzIgMS4zIDEuNjljLjE0LjA4LjI1LjIyLjI5LjM4bC44NyAzLjE0Yy4wNC4xNy4yMS4yMi4zOC4yMnoiLz48cGF0aCBmaWxsPSIjY2RjNGQ2IiBkPSJtMjguNTQxIDIzLjA4bC0xLjI3LS45NmEuOTQuOTQgMCAwIDEtLjI3LS42NnYtMi43NWMwLS42NC0uMTYtMS4yMzgtLjQ0Mi0xLjc2cS4yMTMuMDUuNDQyLjA1YTIgMiAwIDEgMC0xLjk0OS0xLjU0OWEzLjggMy44IDAgMCAwLTEuOC0uNDUxaC04LjE3Yy0uNjYgMC0xLjI3LS40Mi0xLjU3LTEuMDFMMTAuMDQxIDMuNWEyLjIzIDIuMjMgMCAwIDAtMi4xLTEuNWMtLjE4IDAtLjMuMTctLjI0LjM0TDguNTcxIDVjLS4yIDAtMS4wNy4yMy0xLjg1LjczbC0uODA2LjQ5OEw3LjAwMiAxMHY4LjI2YzAgMi4wMSAxLjI1IDMuNzIgMy4wMSA0LjQxdjcuMDJjLS4wMS4xNy4xMy4zMS4zLjMxaDEuMzdjLjE4IDAgLjMyLS4xNC4zMi0uMzF2LTEuOTZjMC0uMTcuMDctLjMyLjE4LS40NGMuNTMtLjUyLjgyLTEuMjMuODItMS45N1YyM2g1LjA3YzEuMjcgMCAyLjQ5LS41NSAzLjMzLTEuNWMwIC45NC40OCAxLjcyIDEuMzggMi4zMmwzLjk2IDIuNDNjLjE2LjExLjI2LjMuMjYuNXYyLjkzYzAgLjE3LjE0LjMxLjMxLjMxaDEuMzdjLjE3IDAgLjMxLS4xNC4zMS0uMzF2LTUuNTFjLjAxLS40LS4xNS0uOC0uNDUtMS4wOSIvPjxwYXRoIGZpbGw9IiNmM2FkNjEiIGQ9Ik02Ljg0MSA2Yy0uMzYgMC0uNzIuMS0xLjAzLjI5bC0yLjE5IDEuMzVjLS4zNi4yMy0uNi42MS0uNjIgMS4wM2MtLjAzLjczLjU1IDEuMzMgMS4yNyAxLjMzaDMuNDljLjU3IDAgMS4wNC0uNDcgMS4wNC0xLjA1di0xYzAtMS4wNy0uODgtMS45NS0xLjk2LTEuOTUiLz48cGF0aCBmaWxsPSIjMWMxYzFjIiBkPSJNNi41IDhhLjUuNSAwIDEgMCAwLTFhLjUuNSAwIDAgMCAwIDFtLTEuOTk5LjVjMC0uMjgtLjIyLS41LS41LS41aC0uNzZhMS4yIDEuMiAwIDAgMC0uMjEgMWguOTdjLjI4IDAgLjUtLjIyLjUtLjUiLz48cGF0aCBmaWxsPSIjZjNhZDYxIiBkPSJNMjguOTkxIDI4aC0xLjk5djEuNjhjMCAuMTcuMTQuMzEuMzEuMzFoMS4zN2MuMTcgMCAuMzEtLjE0LjMxLS4zMXptLTE2Ljk5IDBoLTEuOTl2MS42OWMtLjAxLjE3LjEzLjMxLjMuMzFoMS4zN2MuMTggMCAuMzItLjE0LjMyLS4zMXptNS4wODggMmwtMS4zOTgtLjAxYy0uMTcgMC0uMzQtLjA1LS4zOC0uMjJsLS40OS0xLjc3aDIuMDU0bC40MzYgMS42MmMuMDQuMTctLjAyLjM3OC0uMjE2LjM4em0yLjI4OCAwYS4zMTMuMzEzIDAgMCAxLS4yNzYtLjQ1bC41OTUtMS41NWgyLjRsLS45MzUgMS42M2EuOTUuOTUgMCAwIDEtLjc0Mi4zN3oiLz48L2c+PC9zdmc+",
54
- "anthropic://claude-3-haiku-20240307": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjQgMjQiPjxwYXRoIGZpbGw9ImN1cnJlbnRDb2xvciIgZD0iTTE3LjMwNCAzLjU0MWgtMy42NzJsNi42OTYgMTYuOTE4SDI0Wm0tMTAuNjA4IDBMMCAyMC40NTloMy43NDRsMS4zNy0zLjU1M2g3LjAwNWwxLjM2OSAzLjU1M2gzLjc0NEwxMC41MzYgMy41NDFabS0uMzcxIDEwLjIyM0w4LjYxNiA3LjgybDIuMjkxIDUuOTQ1WiIvPjwvc3ZnPg==",
55
- }
56
-
57
- AGGREGATORS = ["openai://gpt-4o-mini", "openai://gpt-4o"]
58
-
59
 
60
  def anthropic_streamlit_streamer(stream):
61
  """
@@ -88,6 +72,15 @@ def together_streamlit_streamer(stream):
88
  yield chunk.choices[0].delta.content
89
 
90
 
91
  # Helper functions for LLM council and aggregator selection
92
  def llm_council_selector():
93
  selected_council = st.radio(
@@ -133,16 +126,17 @@ def get_google_response(model_name, prompt):
133
  return model.generate_content(prompt, stream=True)
134
 
135
 
136
- def get_llm_response(model_identifier, prompt):
 
137
  provider, model_name = model_identifier.split("://")
138
  if provider == "openai":
139
  return get_openai_response(model_name, prompt)
140
  elif provider == "anthropic":
141
- return get_anthropic_response(model_name, prompt)
142
  elif provider == "together":
143
- return get_together_response(model_name, prompt)
144
  elif provider == "vertex":
145
- return get_google_response(model_name, prompt)
146
  else:
147
  return None
148
 
@@ -151,6 +145,97 @@ def get_response_key(model):
151
  return model + ".response"
152
 
153
 
154
  # Main Streamlit App
155
  def main():
156
  st.set_page_config(
@@ -183,7 +268,7 @@ def main():
183
  # App title and description
184
  st.title("Language Model Council Sandbox")
185
  st.markdown("###### Invoke a council of LLMs to generate and judge each other.")
186
- st.markdown("###### [ArXiv Paper](https://arxiv.org/abs/2406.08598)")
187
 
188
  # Authentication system
189
  if "authenticated" not in st.session_state:
@@ -206,60 +291,151 @@ def main():
206
  selected_models = llm_council_selector()
207
  st.write("Selected Models:", selected_models)
208
  selected_aggregator = aggregator_selector()
209
- st.write("Selected Aggregator:", selected_aggregator)
210
 
211
  # Prompt input
212
- prompt = st.text_area("Enter your prompt:")
213
 
214
  if st.button("Submit"):
215
  st.write("Responses:")
216
 
217
  # Fetching and streaming responses from each selected model
 
218
  for model in selected_models:
219
- # with st.chat_message(model):
220
  with st.chat_message(
221
  model,
222
  avatar=PROVIDER_TO_AVATAR_MAP[model],
223
  ):
224
  message_placeholder = st.empty()
225
- stream = get_llm_response(model, prompt)
226
  if stream:
227
- if model.startswith("anthropic"):
228
- stream = anthropic_streamlit_streamer(stream)
229
- elif model.startswith("vertex"):
230
- stream = google_streamlit_streamer(stream)
231
- elif model.startswith("together"):
232
- stream = together_streamlit_streamer(stream)
233
  st.session_state[get_response_key(model)] = (
234
  message_placeholder.write_stream(stream)
235
  )
236
 
237
- # Constructing the aggregator prompt
238
- aggregator_prompt = f"User prompt: {prompt}\n\n"
239
- aggregator_prompt += "Responses from other LLMs:\n\n"
240
- aggregator_prompt += "\n".join(
241
- [
242
- f"{model}: {st.session_state.get(get_response_key(model))} \n\n"
243
- for model in selected_models
244
- ]
245
  )
246
- aggregator_prompt += "\n\nPlease provide an aggregated response."
247
 
248
  with st.expander("Aggregator Prompt"):
249
  st.write(aggregator_prompt)
250
 
251
  # Fetching and streaming response from the aggregator
252
- st.write(f"Aggregated response from {selected_aggregator}:")
253
  with st.chat_message(
254
  selected_aggregator,
255
  avatar=PROVIDER_TO_AVATAR_MAP[selected_aggregator],
256
  ):
257
  message_placeholder = st.empty()
258
- aggregator_stream = get_llm_response(
259
  selected_aggregator, aggregator_prompt
260
  )
261
  if aggregator_stream:
262
  message_placeholder.write_stream(aggregator_stream)
263
  else:
264
  with cols[1]:
265
  st.warning("Please log in to access this app.")
@@ -267,72 +443,3 @@ def main():
267
 
268
  if __name__ == "__main__":
269
  main()
270
-
271
-
272
- # Fix the aggregator step.
273
- # Add a judging step.
274
- # Add visualizations.
275
-
276
-
277
- # import streamlit as st
278
- # from components import llm_council_selector
279
-
280
- # st.title("LLM Council Selector")
281
-
282
- # selected_models = llm_council_selector()
283
-
284
- # if selected_models is not None:
285
- # st.write("Selected Models:", selected_models)
286
- # else:
287
- # st.write("No models selected or component didn't return a value.")
288
-
289
-
290
- # Choose your council.
291
- # Pre-selected.
292
- # Smalls: GPT-4o-mini, llama-3.1-70b, qwen-2.0-70b
293
- # Flagships: GPT-4o, llama-3.1-405b, qwen-2.0-110b, gemini, claude-3.5-sonnet
294
- # Best: chatgpt-4o-latest, gemini-1.5-pro-exp-0827, grok-2-2024-08-13, claude-3-5-sonnet-20240620, llama-3.1-405b-instruct
295
- # Custom:
296
- # Choose from a list of available models.
297
- # All:
298
- # All available models.
299
-
300
- # Choose aggregator.
301
- # Aggregators are models proficient in synthesizing responses from other models into a single, highquality output. An effective aggregator should maintain or enhance output quality even when
302
- # integrating inputs that are of lesser quality than its own.
303
- # Choices:
304
- # - 4o-latest
305
- # - gemini-1.5
306
- # - grok-2
307
- # - claude-3.5-sonnet
308
- # - llama-3.1-405b-instruct
309
-
310
- # Provide a prompt. (Or pre-canned prompts.)
311
- # Paste chat history.
312
-
313
- # Checkbox, enable judging.
314
- #
315
- # If checked, Judging config:
316
- # Single sided
317
- # Provide criteria. (or default).
318
- # If pairwise, choose granularity (or default).
319
- # Choose criteria. (or default).
320
- # Enable position swapping?
321
-
322
- # Go button.
323
- # Sections.
324
- # 1. Model outputs.
325
- # 2. Aggregated output.
326
- # 3. Judging underneath each output.
327
- # Highlight in green, the output that was best, as determined by council.
328
- # Show graph breakdown of scores and justifications. (by criteria, # wins and # losses)
329
- # Show final overall score.
330
- # Highlight in red, the output that was worst, as determined by council.
331
- # Judging section.
332
- # Show agreement matrix.
333
- # Show bar graph of self-bias.
334
- # Plot contrarianism vs. conviction (scatter plot)
335
- # Show per-judge scores.
336
-
337
- # Calculate total cost.
338
- # Show total tokens used.
 
7
  from together import Together
8
  import google.generativeai as genai
9
  import time
10
+ from typing import List, Optional, Literal, Union
11
+ from constants import (
12
+ LLM_COUNCIL_MEMBERS,
13
+ PROVIDER_TO_AVATAR_MAP,
14
+ AGGREGATORS,
15
+ )
16
+ from prompts import *
17
+ from judging_dataclasses import *
18
+
19
 
20
  dotenv.load_dotenv()
21
 
 
40
  # anthropic_client = anthropic.Client(api_key=ANTHROPIC_API_KEY)
41
  anthropic_client = anthropic.Anthropic()
42
 
43
 
44
  def anthropic_streamlit_streamer(stream):
45
  """
 
72
  yield chunk.choices[0].delta.content
73
 
74
 
75
+ def llm_streamlit_streamer(stream, llm):
76
+ if llm.startswith("anthropic"):
77
+ return anthropic_streamlit_streamer(stream)
78
+ elif llm.startswith("vertex"):
79
+ return google_streamlit_streamer(stream)
80
+ elif llm.startswith("together"):
81
+ return together_streamlit_streamer(stream)
82
+
83
+
84
  # Helper functions for LLM council and aggregator selection
85
  def llm_council_selector():
86
  selected_council = st.radio(
 
126
  return model.generate_content(prompt, stream=True)
127
 
128
 
129
+ def get_llm_response_stream(model_identifier, prompt):
130
+ """Returns a streamlit-friendly stream of response tokens from the LLM."""
131
  provider, model_name = model_identifier.split("://")
132
  if provider == "openai":
133
  return get_openai_response(model_name, prompt)
134
  elif provider == "anthropic":
135
+ return anthropic_streamlit_streamer(get_anthropic_response(model_name, prompt))
136
  elif provider == "together":
137
+ return together_streamlit_streamer(get_together_response(model_name, prompt))
138
  elif provider == "vertex":
139
+ return google_streamlit_streamer(get_google_response(model_name, prompt))
140
  else:
141
  return None
142
 
 
145
  return model + ".response"
146
 
147
 
148
+ def get_model_from_response_key(response_key):
149
+ return response_key.rsplit(".", 1)[0]
150
+
151
+
152
+ def get_judging_key(judge_model, response_model):
153
+ return "judge." + judge_model + "." + response_model
154
+
155
+
156
+ def get_aggregator_response_key(model):
157
+ return model + ".aggregator_response"
158
+
159
+
160
+ # Streamlit form UI
161
+ def render_criteria_form(criteria_num):
162
+ """Render a criteria input form."""
163
+ with st.expander(f"Criteria {criteria_num + 1}"):
164
+ name = st.text_input(f"Name for Criteria {criteria_num + 1}")
165
+ description = st.text_area(f"Description for Criteria {criteria_num + 1}")
166
+ min_score = st.number_input(
167
+ f"Min Score for Criteria {criteria_num + 1}", min_value=0, step=1
168
+ )
169
+ max_score = st.number_input(
170
+ f"Max Score for Criteria {criteria_num + 1}", min_value=0, step=1
171
+ )
172
+ return Criteria(
173
+ name=name, description=description, min_score=min_score, max_score=max_score
174
+ )
175
+
176
+
177
+ def get_response_mapping():
178
+ # Inspect the session state for all the responses.
179
+ # This is a dictionary mapping model names to their responses.
180
+ # The aggregator response is also included in this mapping under the key "<model>.aggregator_response".
181
+ response_mapping = {}
182
+ for key in st.session_state.keys():
183
+ if key.endswith(".response"):
184
+ response_mapping[get_model_from_response_key(key)] = st.session_state[key]
185
+ if key.endswith(".aggregator_response"):
186
+ response_mapping[key] = st.session_state[key]
187
+ return response_mapping
188
+
189
+
190
+ def format_likert_comparison_options(options):
191
+ return "\n".join([f"{i + 1}: {option}" for i, option in enumerate(options)])
192
+
193
+
194
+ def format_criteria_list(criteria_list):
195
+ return "\n".join(
196
+ [f"{criteria.name}: {criteria.description}" for criteria in criteria_list]
197
+ )
198
+
199
+
200
+ def get_direct_assessment_prompt(
201
+ direct_assessment_prompt, user_prompt, response, criteria_list, options
202
+ ):
203
+ return direct_assessment_prompt.format(
204
+ user_prompt=user_prompt,
205
+ response=response,
206
+ criteria_list=f"{format_criteria_list(DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST)}",
207
+ options=f"{format_likert_comparison_options(SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS)}",
208
+ )
209
+
210
+
211
+ def get_default_direct_assessment_prompt(user_prompt):
212
+ return get_direct_assessment_prompt(
213
+ DEFAULT_DIRECT_ASSESSMENT_PROMPT,
214
+ user_prompt=user_prompt,
215
+ response="{{response}}",
216
+ criteria_list=DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST,
217
+ options=SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS,
218
+ )
219
+
220
+
221
+ def get_aggregator_prompt(aggregator_prompt, user_prompt, llms):
222
+ responses_from_other_llms = "\n\n".join(
223
+ [f"{model}: {st.session_state.get(get_response_key(model))}" for model in llms]
224
+ )
225
+ return aggregator_prompt.format(
226
+ user_prompt=user_prompt,
227
+ responses_from_other_llms=responses_from_other_llms,
228
+ )
229
+
230
+
231
+ def get_default_aggregator_prompt(user_prompt, llms):
232
+ return get_aggregator_prompt(
233
+ DEFAULT_AGGREGATOR_PROMPT,
234
+ user_prompt=user_prompt,
235
+ llms=llms,
236
+ )
237
+
238
+
239
  # Main Streamlit App
240
  def main():
241
  st.set_page_config(
 
268
  # App title and description
269
  st.title("Language Model Council Sandbox")
270
  st.markdown("###### Invoke a council of LLMs to generate and judge each other.")
271
+ st.markdown("###### [Paper](https://arxiv.org/abs/2406.08598)")
272
 
273
  # Authentication system
274
  if "authenticated" not in st.session_state:
 
291
  selected_models = llm_council_selector()
292
  st.write("Selected Models:", selected_models)
293
  selected_aggregator = aggregator_selector()
294
+ # st.write("Selected Aggregator:", selected_aggregator)
295
 
296
  # Prompt input
297
+ user_prompt = st.text_area("Enter your prompt:")
298
 
299
  if st.button("Submit"):
300
  st.write("Responses:")
301
 
302
  # Fetching and streaming responses from each selected model
303
+ # TODO: Make this asynchronous?
304
  for model in selected_models:
 
305
  with st.chat_message(
306
  model,
307
  avatar=PROVIDER_TO_AVATAR_MAP[model],
308
  ):
309
  message_placeholder = st.empty()
310
+ stream = get_llm_response_stream(model, user_prompt)
311
  if stream:
312
  st.session_state[get_response_key(model)] = (
313
  message_placeholder.write_stream(stream)
314
  )
315
 
316
+ # Get the aggregator prompt.
317
+ aggregator_prompt = get_default_aggregator_prompt(
318
+ user_prompt=user_prompt, llms=selected_models
319
  )
 
320
 
321
  with st.expander("Aggregator Prompt"):
322
  st.write(aggregator_prompt)
323
 
324
  # Fetching and streaming response from the aggregator
325
+ st.write(f"Mixture-of-Agents response from {selected_aggregator}:")
326
  with st.chat_message(
327
  selected_aggregator,
328
  avatar=PROVIDER_TO_AVATAR_MAP[selected_aggregator],
329
  ):
330
  message_placeholder = st.empty()
331
+ aggregator_stream = get_llm_response_stream(
332
  selected_aggregator, aggregator_prompt
333
  )
334
  if aggregator_stream:
335
+ st.session_state[
+ get_aggregator_response_key(selected_aggregator)
+ ] = message_placeholder.write_stream(aggregator_stream)
339
+
340
+ # Judging.
341
+ st.markdown("#### Judging Configuration Form")
342
+
343
+ # Choose the type of assessment
344
+ assessment_type = st.radio(
345
+ "Select the type of assessment",
346
+ options=["Direct Assessment", "Pairwise Comparison"],
347
+ )
348
+
349
+ # Depending on the assessment type, render different forms
350
+ if assessment_type == "Direct Assessment":
351
+ direct_assessment_prompt = st.text_area(
352
+ "Prompt for the Direct Assessment",
353
+ value=get_default_direct_assessment_prompt(user_prompt=user_prompt),
354
+ height=500,
355
+ )
356
+
357
+ # TODO: Add option to edit criteria list with a basic text field.
358
+ criteria_list = DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST
359
+
360
+ # Create DirectAssessment object when form is submitted
361
+ if st.button("Submit Direct Assessment"):
362
+
363
+ # Submit direct assessment.
364
+ responses_for_judging = get_response_mapping()
365
+
366
+ response_judging_columns = st.columns(3)
367
+
368
+ responses_for_judging_to_streamlit_column_index_map = {
369
+ model: response_judging_columns[i % 3]
370
+ for i, model in enumerate(responses_for_judging.keys())
371
+ }
372
+
373
+ # Get judging responses.
374
+ for response_model, response in responses_for_judging.items():
375
+
376
+ st_column = responses_for_judging_to_streamlit_column_index_map[
+ response_model
+ ]
381
+
382
+ with st_column:
383
+
384
+ st.write(f"Judging {response_model}")
385
+ judging_prompt = get_direct_assessment_prompt(
386
+ direct_assessment_prompt,
387
+ user_prompt,
388
+ response,
389
+ criteria_list,
390
+ SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS,
391
+ )
392
+
393
+ for judging_model in selected_models:
394
+ with st.expander("Detailed assessments", expanded=True):
395
+ with st.chat_message(
396
+ judging_model,
397
+ avatar=PROVIDER_TO_AVATAR_MAP[judging_model],
398
+ ):
399
+ st.write(f"Judge: {judging_model}")
400
+ message_placeholder = st.empty()
401
+ judging_stream = get_llm_response_stream(
402
+ judging_model, judging_prompt
403
+ )
404
+ if judging_stream:
405
+ st.session_state[
406
+ get_judging_key(
407
+ judging_model, response_model
408
+ )
409
+ ] = message_placeholder.write_stream(
410
+ judging_stream
411
+ )
412
+ # When all of the judging is finished for the given response, get the actual
413
+ # values, parsed (use gpt-4o-mini for now) with json mode.
414
+ # TODO.
415
+
416
+ elif assessment_type == "Pairwise Comparison":
417
+ pairwise_comparison_prompt = st.text_area(
418
+ "Prompt for the Pairwise Comparison"
419
+ )
420
+ granularity = st.selectbox("Granularity", ["coarse", "fine", "super fine"])
421
+ ties_allowed = st.checkbox("Are ties allowed?")
422
+ position_swapping = st.checkbox("Enable position swapping?")
423
+ reference_model = st.text_input("Reference Model")
424
+
425
+ # Create PairwiseComparison object when form is submitted
426
+ if st.button("Submit Pairwise Comparison"):
427
+ pairwise_comparison_config = PairwiseComparison(
428
+ type="pairwise_comparison",
429
+ granularity=granularity,
430
+ ties_allowed=ties_allowed,
431
+ position_swapping=position_swapping,
432
+ reference_model=reference_model,
433
+ prompt=pairwise_comparison_prompt,
434
+ )
435
+ st.success(f"Pairwise Comparison Created: {pairwise_comparison_config}")
436
+ # Submit pairwise comparison.
437
+ responses_for_judging = get_response_mapping()
438
+
439
  else:
440
  with cols[1]:
441
  st.warning("Please log in to access this app.")
 
443
 
444
  if __name__ == "__main__":
445
  main()
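
A note on the TODO left in the direct-assessment loop above: once every judge has streamed its assessment, the plan is to parse the per-criterion ratings out of the free text with gpt-4o-mini in JSON mode. A minimal sketch of what that parsing step could look like inside app.py, reusing the module-level openai_client; the DirectAssessmentRating model and parse_judging_response helper are illustrative names, not part of this commit.

import json
from typing import List

from pydantic import BaseModel


class DirectAssessmentRating(BaseModel):
    # Hypothetical container for one parsed criterion rating.
    criterion: str
    explanation: str
    rating: int  # 1-based index into the Likert options.


def parse_judging_response(judging_text: str) -> List[DirectAssessmentRating]:
    """Ask gpt-4o-mini (JSON mode) to turn a judge's free-text assessment into ratings."""
    # `openai_client` is the module-level OpenAI client created near the top of app.py.
    completion = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        response_format={"type": "json_object"},
        messages=[
            {
                "role": "user",
                "content": (
                    "Extract the per-criterion ratings from the assessment below. "
                    'Respond with JSON of the form {"ratings": [{"criterion": str, '
                    '"explanation": str, "rating": int}]}.\n\n' + judging_text
                ),
            }
        ],
    )
    data = json.loads(completion.choices[0].message.content)
    return [DirectAssessmentRating(**item) for item in data["ratings"]]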
app2.py ADDED
@@ -0,0 +1,52 @@
1
+ import os
2
+
3
+ import google_auth_oauthlib.flow
4
+ from googleapiclient.discovery import build
5
+ import streamlit as st
6
+ import webbrowser
7
+
8
+
9
+ redirect_uri = os.environ.get("REDIRECT_URI", "http://localhost:8502/")
10
+
11
+
12
+ def auth_flow():
13
+ st.write("Welcome to My App!")
14
+ auth_code = st.query_params.get("code")
15
+ flow = google_auth_oauthlib.flow.Flow.from_client_secrets_file(
16
+ "client_secret.json", # replace with you json credentials from your google auth app
17
+ scopes=["https://www.googleapis.com/auth/userinfo.email", "openid"],
18
+ redirect_uri=redirect_uri,
19
+ )
20
+ if auth_code:
21
+ flow.fetch_token(code=auth_code)
22
+ credentials = flow.credentials
23
+ st.write("Login Done")
24
+ user_info_service = build(
25
+ serviceName="oauth2",
26
+ version="v2",
27
+ credentials=credentials,
28
+ )
29
+ user_info = user_info_service.userinfo().get().execute()
30
+ assert user_info.get("email"), "Email not found in infos"
31
+ st.session_state["google_auth_code"] = auth_code
32
+ st.session_state["user_info"] = user_info
33
+ else:
34
+ if st.button("Sign in with Google"):
35
+ authorization_url, state = flow.authorization_url(
36
+ access_type="offline",
37
+ include_granted_scopes="true",
38
+ )
39
+ webbrowser.open_new_tab(authorization_url)
40
+
41
+
42
+ def main():
43
+ if "google_auth_code" not in st.session_state:
44
+ auth_flow()
45
+
46
+ if "google_auth_code" in st.session_state:
47
+ email = st.session_state["user_info"].get("email")
48
+ st.write(f"Hello {email}")
49
+
50
+
51
+ if __name__ == "__main__":
52
+ main()
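
app2.py is a standalone Google sign-in experiment. If it is later merged with the main app, one minimal way to bridge the two would be to set the same session key that app.py's password check already reads; this glue is hypothetical and not part of this commit.

import streamlit as st

# Hypothetical glue: treat a successful Google login as authentication for app.py.
if "user_info" in st.session_state and st.session_state["user_info"].get("email"):
    st.session_state["authenticated"] = True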
constants.py ADDED
@@ -0,0 +1,119 @@
1
+ LLM_COUNCIL_MEMBERS = {
2
+ "Smalls": [
3
+ # "openai://gpt-4o-mini",
4
+ "together://meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
5
+ "together://meta-llama/Llama-3.2-3B-Instruct-Turbo",
6
+ # "vertex://gemini-1.5-flash-001",
7
+ # "anthropic://claude-3-haiku-20240307",
8
+ ],
9
+ "Flagships": [
10
+ "openai://gpt-4o",
11
+ "together://meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
12
+ "vertex://gemini-1.5-pro-001",
13
+ "anthropic://claude-3-5-sonnet",
14
+ ],
15
+ }
16
+
17
+ PROVIDER_TO_AVATAR_MAP = {
18
+ "openai://gpt-4o-mini": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIwLjk5ZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjU2IDI2MCI+PHBhdGggZD0iTTIzOS4xODQgMTA2LjIwM2E2NC43MiA2NC43MiAwIDAgMC01LjU3Ni01My4xMDNDMjE5LjQ1MiAyOC40NTkgMTkxIDE1Ljc4NCAxNjMuMjEzIDIxLjc0QTY1LjU4NiA2NS41ODYgMCAwIDAgNTIuMDk2IDQ1LjIyYTY0LjcyIDY0LjcyIDAgMCAwLTQzLjIzIDMxLjM2Yy0xNC4zMSAyNC42MDItMTEuMDYxIDU1LjYzNCA4LjAzMyA3Ni43NGE2NC42NyA2NC42NyAwIDAgMCA1LjUyNSA1My4xMDJjMTQuMTc0IDI0LjY1IDQyLjY0NCAzNy4zMjQgNzAuNDQ2IDMxLjM2YTY0LjcyIDY0LjcyIDAgMCAwIDQ4Ljc1NCAyMS43NDRjMjguNDgxLjAyNSA1My43MTQtMTguMzYxIDYyLjQxNC00NS40ODFhNjQuNzcgNjQuNzcgMCAwIDAgNDMuMjI5LTMxLjM2YzE0LjEzNy0yNC41NTggMTAuODc1LTU1LjQyMy04LjA4My03Ni40ODNtLTk3LjU2IDEzNi4zMzhhNDguNCA0OC40IDAgMCAxLTMxLjEwNS0xMS4yNTVsMS41MzUtLjg3bDUxLjY3LTI5LjgyNWE4LjYgOC42IDAgMCAwIDQuMjQ3LTcuMzY3di03Mi44NWwyMS44NDUgMTIuNjM2Yy4yMTguMTExLjM3LjMyLjQwOS41NjN2NjAuMzY3Yy0uMDU2IDI2LjgxOC0yMS43ODMgNDguNTQ1LTQ4LjYwMSA0OC42MDFNMzcuMTU4IDE5Ny45M2E0OC4zNSA0OC4zNSAwIDAgMS01Ljc4MS0zMi41ODlsMS41MzQuOTIxbDUxLjcyMiAyOS44MjZhOC4zNCA4LjM0IDAgMCAwIDguNDQxIDBsNjMuMTgxLTM2LjQyNXYyNS4yMjFhLjg3Ljg3IDAgMCAxLS4zNTguNjY1bC01Mi4zMzUgMzAuMTg0Yy0yMy4yNTcgMTMuMzk4LTUyLjk3IDUuNDMxLTY2LjQwNC0xNy44MDNNMjMuNTQ5IDg1LjM4YTQ4LjUgNDguNSAwIDAgMSAyNS41OC0yMS4zMzN2NjEuMzlhOC4yOSA4LjI5IDAgMCAwIDQuMTk1IDcuMzE2bDYyLjg3NCAzNi4yNzJsLTIxLjg0NSAxMi42MzZhLjgyLjgyIDAgMCAxLS43NjcgMEw0MS4zNTMgMTUxLjUzYy0yMy4yMTEtMTMuNDU0LTMxLjE3MS00My4xNDQtMTcuODA0LTY2LjQwNXptMTc5LjQ2NiA0MS42OTVsLTYzLjA4LTM2LjYzTDE2MS43MyA3Ny44NmEuODIuODIgMCAwIDEgLjc2OCAwbDUyLjIzMyAzMC4xODRhNDguNiA0OC42IDAgMCAxLTcuMzE2IDg3LjYzNXYtNjEuMzkxYTguNTQgOC41NCAwIDAgMC00LjQtNy4yMTNtMjEuNzQyLTMyLjY5bC0xLjUzNS0uOTIybC01MS42MTktMzAuMDgxYTguMzkgOC4zOSAwIDAgMC04LjQ5MiAwTDk5Ljk4IDk5LjgwOFY3NC41ODdhLjcyLjcyIDAgMCAxIC4zMDctLjY2NWw1Mi4yMzMtMzAuMTMzYTQ4LjY1MiA0OC42NTIgMCAwIDEgNzIuMjM2IDUwLjM5MXpNODguMDYxIDEzOS4wOTdsLTIxLjg0NS0xMi41ODVhLjg3Ljg3IDAgMCAxLS40MS0uNjE0VjY1LjY4NWE0OC42NTIgNDguNjUyIDAgMCAxIDc5Ljc1Ny0zNy4zNDZsLTEuNTM1Ljg3bC01MS42NyAyOS44MjVhOC42IDguNiAwIDAgMC00LjI0NiA3LjM2N3ptMTEuODY4LTI1LjU4TDEyOC4wNjcgOTcuM2wyOC4xODggMTYuMjE4djMyLjQzNGwtMjguMDg2IDE2LjIxOGwtMjguMTg4LTE2LjIxOHoiLz48L3N2Zz4=",
19
+ "anthropic://claude-3-5-sonnet": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjQgMjQiPjxwYXRoIGZpbGw9ImN1cnJlbnRDb2xvciIgZD0iTTE3LjMwNCAzLjU0MWgtMy42NzJsNi42OTYgMTYuOTE4SDI0Wm0tMTAuNjA4IDBMMCAyMC40NTloMy43NDRsMS4zNy0zLjU1M2g3LjAwNWwxLjM2OSAzLjU1M2gzLjc0NEwxMC41MzYgMy41NDFabS0uMzcxIDEwLjIyM0w4LjYxNiA3LjgybDIuMjkxIDUuOTQ1WiIvPjwvc3ZnPg==",
20
+ "vertex://gemini-1.5-flash-001": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjQgMjQiPjxwYXRoIGZpbGw9IiM0MjY4ZmYiIGQ9Ik0yNCAxMi4wMjRjLTYuNDM3LjM4OC0xMS41OSA1LjUzOS0xMS45NzcgMTEuOTc2aC0uMDQ3QzExLjU4OCAxNy41NjMgNi40MzYgMTIuNDEyIDAgMTIuMDI0di0uMDQ3QzYuNDM3IDExLjU4OCAxMS41ODggNi40MzcgMTEuOTc2IDBoLjA0N2MuMzg4IDYuNDM3IDUuNTQgMTEuNTg4IDExLjk3NyAxMS45Nzd6Ii8+PC9zdmc+",
21
+ "together://meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMzIgMzIiPjxnIGZpbGw9Im5vbmUiPjxwYXRoIGZpbGw9IiNiNGFjYmMiIGQ9Ik0yMC44NzEgMjQuNzh2LTYuMDZoMy4wMXY1Ljc3YzAgLjM0LS4xMi42Mi0uMzEuOTRsLTIuNDEgNC4yYy0uMTguMjMtLjQ1LjM3LS43NS4zN2gtMS4wM2MtLjIzIDAtLjM4LS4yNC0uMjgtLjQ1bDEuNjctNC4zNWMuMDctLjEzLjEtLjI3LjEtLjQyTTE3LjA5MSAzMGMuMiAwIC4yNi0uMjEuMjItLjM4bC0yLjMyLTguNjFoLTIuOTlsLjg1IDMuNTVjLjE5LjcxLjY2IDEuMzIgMS4zIDEuNjljLjE0LjA4LjI1LjIyLjI5LjM4bC44NyAzLjE0Yy4wNC4xNy4yMS4yMi4zOC4yMnoiLz48cGF0aCBmaWxsPSIjY2RjNGQ2IiBkPSJtMjguNTQxIDIzLjA4bC0xLjI3LS45NmEuOTQuOTQgMCAwIDEtLjI3LS42NnYtMi43NWMwLS42NC0uMTYtMS4yMzgtLjQ0Mi0xLjc2cS4yMTMuMDUuNDQyLjA1YTIgMiAwIDEgMC0xLjk0OS0xLjU0OWEzLjggMy44IDAgMCAwLTEuOC0uNDUxaC04LjE3Yy0uNjYgMC0xLjI3LS40Mi0xLjU3LTEuMDFMMTAuMDQxIDMuNWEyLjIzIDIuMjMgMCAwIDAtMi4xLTEuNWMtLjE4IDAtLjMuMTctLjI0LjM0TDguNTcxIDVjLS4yIDAtMS4wNy4yMy0xLjg1LjczbC0uODA2LjQ5OEw3LjAwMiAxMHY4LjI2YzAgMi4wMSAxLjI1IDMuNzIgMy4wMSA0LjQxdjcuMDJjLS4wMS4xNy4xMy4zMS4zLjMxaDEuMzdjLjE4IDAgLjMyLS4xNC4zMi0uMzF2LTEuOTZjMC0uMTcuMDctLjMyLjE4LS40NGMuNTMtLjUyLjgyLTEuMjMuODItMS45N1YyM2g1LjA3YzEuMjcgMCAyLjQ5LS41NSAzLjMzLTEuNWMwIC45NC40OCAxLjcyIDEuMzggMi4zMmwzLjk2IDIuNDNjLjE2LjExLjI2LjMuMjYuNXYyLjkzYzAgLjE3LjE0LjMxLjMxLjMxaDEuMzdjLjE3IDAgLjMxLS4xNC4zMS0uMzF2LTUuNTFjLjAxLS40LS4xNS0uOC0uNDUtMS4wOSIvPjxwYXRoIGZpbGw9IiNmM2FkNjEiIGQ9Ik02Ljg0MSA2Yy0uMzYgMC0uNzIuMS0xLjAzLjI5bC0yLjE5IDEuMzVjLS4zNi4yMy0uNi42MS0uNjIgMS4wM2MtLjAzLjczLjU1IDEuMzMgMS4yNyAxLjMzaDMuNDljLjU3IDAgMS4wNC0uNDcgMS4wNC0xLjA1di0xYzAtMS4wNy0uODgtMS45NS0xLjk2LTEuOTUiLz48cGF0aCBmaWxsPSIjMWMxYzFjIiBkPSJNNi41IDhhLjUuNSAwIDEgMCAwLTFhLjUuNSAwIDAgMCAwIDFtLTEuOTk5LjVjMC0uMjgtLjIyLS41LS41LS41aC0uNzZhMS4yIDEuMiAwIDAgMC0uMjEgMWguOTdjLjI4IDAgLjUtLjIyLjUtLjUiLz48cGF0aCBmaWxsPSIjZjNhZDYxIiBkPSJNMjguOTkxIDI4aC0xLjk5djEuNjhjMCAuMTcuMTQuMzEuMzEuMzFoMS4zN2MuMTcgMCAuMzEtLjE0LjMxLS4zMXptLTE2Ljk5IDBoLTEuOTl2MS42OWMtLjAxLjE3LjEzLjMxLjMuMzFoMS4zN2MuMTggMCAuMzItLjE0LjMyLS4zMXptNS4wODggMmwtMS4zOTgtLjAxYy0uMTcgMC0uMzQtLjA1LS4zOC0uMjJsLS40OS0xLjc3aDIuMDU0bC40MzYgMS42MmMuMDQuMTctLjAyLjM3OC0uMjE2LjM4em0yLjI4OCAwYS4zMTMuMzEzIDAgMCAxLS4yNzYtLjQ1bC41OTUtMS41NWgyLjRsLS45MzUgMS42M2EuOTUuOTUgMCAwIDEtLjc0Mi4zN3oiLz48L2c+PC9zdmc+",
22
+ "together://meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMzIgMzIiPjxnIGZpbGw9Im5vbmUiPjxwYXRoIGZpbGw9IiNiNGFjYmMiIGQ9Ik0yMC44NzEgMjQuNzh2LTYuMDZoMy4wMXY1Ljc3YzAgLjM0LS4xMi42Mi0uMzEuOTRsLTIuNDEgNC4yYy0uMTguMjMtLjQ1LjM3LS43NS4zN2gtMS4wM2MtLjIzIDAtLjM4LS4yNC0uMjgtLjQ1bDEuNjctNC4zNWMuMDctLjEzLjEtLjI3LjEtLjQyTTE3LjA5MSAzMGMuMiAwIC4yNi0uMjEuMjItLjM4bC0yLjMyLTguNjFoLTIuOTlsLjg1IDMuNTVjLjE5LjcxLjY2IDEuMzIgMS4zIDEuNjljLjE0LjA4LjI1LjIyLjI5LjM4bC44NyAzLjE0Yy4wNC4xNy4yMS4yMi4zOC4yMnoiLz48cGF0aCBmaWxsPSIjY2RjNGQ2IiBkPSJtMjguNTQxIDIzLjA4bC0xLjI3LS45NmEuOTQuOTQgMCAwIDEtLjI3LS42NnYtMi43NWMwLS42NC0uMTYtMS4yMzgtLjQ0Mi0xLjc2cS4yMTMuMDUuNDQyLjA1YTIgMiAwIDEgMC0xLjk0OS0xLjU0OWEzLjggMy44IDAgMCAwLTEuOC0uNDUxaC04LjE3Yy0uNjYgMC0xLjI3LS40Mi0xLjU3LTEuMDFMMTAuMDQxIDMuNWEyLjIzIDIuMjMgMCAwIDAtMi4xLTEuNWMtLjE4IDAtLjMuMTctLjI0LjM0TDguNTcxIDVjLS4yIDAtMS4wNy4yMy0xLjg1LjczbC0uODA2LjQ5OEw3LjAwMiAxMHY4LjI2YzAgMi4wMSAxLjI1IDMuNzIgMy4wMSA0LjQxdjcuMDJjLS4wMS4xNy4xMy4zMS4zLjMxaDEuMzdjLjE4IDAgLjMyLS4xNC4zMi0uMzF2LTEuOTZjMC0uMTcuMDctLjMyLjE4LS40NGMuNTMtLjUyLjgyLTEuMjMuODItMS45N1YyM2g1LjA3YzEuMjcgMCAyLjQ5LS41NSAzLjMzLTEuNWMwIC45NC40OCAxLjcyIDEuMzggMi4zMmwzLjk2IDIuNDNjLjE2LjExLjI2LjMuMjYuNXYyLjkzYzAgLjE3LjE0LjMxLjMxLjMxaDEuMzdjLjE3IDAgLjMxLS4xNC4zMS0uMzF2LTUuNTFjLjAxLS40LS4xNS0uOC0uNDUtMS4wOSIvPjxwYXRoIGZpbGw9IiNmM2FkNjEiIGQ9Ik02Ljg0MSA2Yy0uMzYgMC0uNzIuMS0xLjAzLjI5bC0yLjE5IDEuMzVjLS4zNi4yMy0uNi42MS0uNjIgMS4wM2MtLjAzLjczLjU1IDEuMzMgMS4yNyAxLjMzaDMuNDljLjU3IDAgMS4wNC0uNDcgMS4wNC0xLjA1di0xYzAtMS4wNy0uODgtMS45NS0xLjk2LTEuOTUiLz48cGF0aCBmaWxsPSIjMWMxYzFjIiBkPSJNNi41IDhhLjUuNSAwIDEgMCAwLTFhLjUuNSAwIDAgMCAwIDFtLTEuOTk5LjVjMC0uMjgtLjIyLS41LS41LS41aC0uNzZhMS4yIDEuMiAwIDAgMC0uMjEgMWguOTdjLjI4IDAgLjUtLjIyLjUtLjUiLz48cGF0aCBmaWxsPSIjZjNhZDYxIiBkPSJNMjguOTkxIDI4aC0xLjk5djEuNjhjMCAuMTcuMTQuMzEuMzEuMzFoMS4zN2MuMTcgMCAuMzEtLjE0LjMxLS4zMXptLTE2Ljk5IDBoLTEuOTl2MS42OWMtLjAxLjE3LjEzLjMxLjMuMzFoMS4zN2MuMTggMCAuMzItLjE0LjMyLS4zMXptNS4wODggMmwtMS4zOTgtLjAxYy0uMTcgMC0uMzQtLjA1LS4zOC0uMjJsLS40OS0xLjc3aDIuMDU0bC40MzYgMS42MmMuMDQuMTctLjAyLjM3OC0uMjE2LjM4em0yLjI4OCAwYS4zMTMuMzEzIDAgMCAxLS4yNzYtLjQ1bC41OTUtMS41NWgyLjRsLS45MzUgMS42M2EuOTUuOTUgMCAwIDEtLjc0Mi4zN3oiLz48L2c+PC9zdmc+",
23
+ "together://meta-llama/Llama-3.2-3B-Instruct-Turbo": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMzIgMzIiPjxnIGZpbGw9Im5vbmUiPjxwYXRoIGZpbGw9IiNiNGFjYmMiIGQ9Ik0yMC44NzEgMjQuNzh2LTYuMDZoMy4wMXY1Ljc3YzAgLjM0LS4xMi42Mi0uMzEuOTRsLTIuNDEgNC4yYy0uMTguMjMtLjQ1LjM3LS43NS4zN2gtMS4wM2MtLjIzIDAtLjM4LS4yNC0uMjgtLjQ1bDEuNjctNC4zNWMuMDctLjEzLjEtLjI3LjEtLjQyTTE3LjA5MSAzMGMuMiAwIC4yNi0uMjEuMjItLjM4bC0yLjMyLTguNjFoLTIuOTlsLjg1IDMuNTVjLjE5LjcxLjY2IDEuMzIgMS4zIDEuNjljLjE0LjA4LjI1LjIyLjI5LjM4bC44NyAzLjE0Yy4wNC4xNy4yMS4yMi4zOC4yMnoiLz48cGF0aCBmaWxsPSIjY2RjNGQ2IiBkPSJtMjguNTQxIDIzLjA4bC0xLjI3LS45NmEuOTQuOTQgMCAwIDEtLjI3LS42NnYtMi43NWMwLS42NC0uMTYtMS4yMzgtLjQ0Mi0xLjc2cS4yMTMuMDUuNDQyLjA1YTIgMiAwIDEgMC0xLjk0OS0xLjU0OWEzLjggMy44IDAgMCAwLTEuOC0uNDUxaC04LjE3Yy0uNjYgMC0xLjI3LS40Mi0xLjU3LTEuMDFMMTAuMDQxIDMuNWEyLjIzIDIuMjMgMCAwIDAtMi4xLTEuNWMtLjE4IDAtLjMuMTctLjI0LjM0TDguNTcxIDVjLS4yIDAtMS4wNy4yMy0xLjg1LjczbC0uODA2LjQ5OEw3LjAwMiAxMHY4LjI2YzAgMi4wMSAxLjI1IDMuNzIgMy4wMSA0LjQxdjcuMDJjLS4wMS4xNy4xMy4zMS4zLjMxaDEuMzdjLjE4IDAgLjMyLS4xNC4zMi0uMzF2LTEuOTZjMC0uMTcuMDctLjMyLjE4LS40NGMuNTMtLjUyLjgyLTEuMjMuODItMS45N1YyM2g1LjA3YzEuMjcgMCAyLjQ5LS41NSAzLjMzLTEuNWMwIC45NC40OCAxLjcyIDEuMzggMi4zMmwzLjk2IDIuNDNjLjE2LjExLjI2LjMuMjYuNXYyLjkzYzAgLjE3LjE0LjMxLjMxLjMxaDEuMzdjLjE3IDAgLjMxLS4xNC4zMS0uMzF2LTUuNTFjLjAxLS40LS4xNS0uOC0uNDUtMS4wOSIvPjxwYXRoIGZpbGw9IiNmM2FkNjEiIGQ9Ik02Ljg0MSA2Yy0uMzYgMC0uNzIuMS0xLjAzLjI5bC0yLjE5IDEuMzVjLS4zNi4yMy0uNi42MS0uNjIgMS4wM2MtLjAzLjczLjU1IDEuMzMgMS4yNyAxLjMzaDMuNDljLjU3IDAgMS4wNC0uNDcgMS4wNC0xLjA1di0xYzAtMS4wNy0uODgtMS45NS0xLjk2LTEuOTUiLz48cGF0aCBmaWxsPSIjMWMxYzFjIiBkPSJNNi41IDhhLjUuNSAwIDEgMCAwLTFhLjUuNSAwIDAgMCAwIDFtLTEuOTk5LjVjMC0uMjgtLjIyLS41LS41LS41aC0uNzZhMS4yIDEuMiAwIDAgMC0uMjEgMWguOTdjLjI4IDAgLjUtLjIyLjUtLjUiLz48cGF0aCBmaWxsPSIjZjNhZDYxIiBkPSJNMjguOTkxIDI4aC0xLjk5djEuNjhjMCAuMTcuMTQuMzEuMzEuMzFoMS4zN2MuMTcgMCAuMzEtLjE0LjMxLS4zMXptLTE2Ljk5IDBoLTEuOTl2MS42OWMtLjAxLjE3LjEzLjMxLjMuMzFoMS4zN2MuMTggMCAuMzItLjE0LjMyLS4zMXptNS4wODggMmwtMS4zOTgtLjAxYy0uMTcgMC0uMzQtLjA1LS4zOC0uMjJsLS40OS0xLjc3aDIuMDU0bC40MzYgMS42MmMuMDQuMTctLjAyLjM3OC0uMjE2LjM4em0yLjI4OCAwYS4zMTMuMzEzIDAgMCAxLS4yNzYtLjQ1bC41OTUtMS41NWgyLjRsLS45MzUgMS42M2EuOTUuOTUgMCAwIDEtLjc0Mi4zN3oiLz48L2c+PC9zdmc+",
24
+ "anthropic://claude-3-haiku-20240307": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjQgMjQiPjxwYXRoIGZpbGw9ImN1cnJlbnRDb2xvciIgZD0iTTE3LjMwNCAzLjU0MWgtMy42NzJsNi42OTYgMTYuOTE4SDI0Wm0tMTAuNjA4IDBMMCAyMC40NTloMy43NDRsMS4zNy0zLjU1M2g3LjAwNWwxLjM2OSAzLjU1M2gzLjc0NEwxMC41MzYgMy41NDFabS0uMzcxIDEwLjIyM0w4LjYxNiA3LjgybDIuMjkxIDUuOTQ1WiIvPjwvc3ZnPg==",
25
+ }
26
+
27
+ # AGGREGATORS = ["openai://gpt-4o-mini", "openai://gpt-4o"]
28
+ AGGREGATORS = ["together://meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"]
29
+
30
+
31
+ # Fix the aggregator step.
32
+ # Add a judging step.
33
+ # Add visualizations.
34
+
35
+
36
+ # import streamlit as st
37
+ # from components import llm_council_selector
38
+
39
+ # st.title("LLM Council Selector")
40
+
41
+ # selected_models = llm_council_selector()
42
+
43
+ # if selected_models is not None:
44
+ # st.write("Selected Models:", selected_models)
45
+ # else:
46
+ # st.write("No models selected or component didn't return a value.")
47
+
48
+
49
+ # Choose your council.
50
+ # Pre-selected.
51
+ # Smalls: GPT-4o-mini, llama-3.1-70b, qwen-2.0-70b
52
+ # Flagships: GPT-4o, llama-3.1-405b, qwen-2.0-110b, gemini, claude-3.5-sonnet
53
+ # Best: chatgpt-4o-latest, gemini-1.5-pro-exp-0827, grok-2-2024-08-13, claude-3-5-sonnet-20240620, llama-3.1-405b-instruct
54
+ # Custom:
55
+ # Choose from a list of available models.
56
+ # All:
57
+ # All available models.
58
+
59
+ # Choose aggregator.
60
+ # Aggregators are models proficient in synthesizing responses from other models into a single, high-quality output. An effective aggregator should maintain or enhance output quality even when
61
+ # integrating inputs that are of lesser quality than its own.
62
+ # Choices:
63
+ # - 4o-latest
64
+ # - gemini-1.5
65
+ # - grok-2
66
+ # - claude-3.5-sonnet
67
+ # - llama-3.1-405b-instruct
68
+
69
+ # Provide a prompt. (Or pre-canned prompts.)
70
+ # Paste chat history.
71
+
72
+ # Checkbox, enable judging.
73
+ #
74
+ # If checked, Judging config:
75
+ # Single sided
76
+ # Provide criteria. (or default).
77
+ # If pairwise, choose granularity (or default).
78
+ # Choose criteria. (or default).
79
+ # Enable position swapping?
80
+
81
+ # Go button.
82
+ # Sections.
83
+ # 1. Model outputs.
84
+ # 2. Aggregated output.
85
+ # 3. Judging underneath each output.
86
+ # Highlight in green, the output that was best, as determined by council.
87
+ # Show graph breakdown of scores and justifications. (by criteria, # wins and # losses)
88
+ # Show final overall score.
89
+ # Highlight in red, the output that was worst, as determined by council.
90
+ # Judging section.
91
+ # Show agreement matrix.
92
+ # Show bar graph of self-bias.
93
+ # Plot contrarianism vs. conviction (scatter plot)
94
+ # Show per-judge scores.
95
+
96
+ # Calculate total cost.
97
+ # Show total tokens used.
98
+
99
+ # """
100
+ # type: [single, pairwise]
101
+
102
+ # [single]
103
+ # - criteria:
104
+ # - name
105
+ # - weight
106
+ # - description
107
+ # - scoring
108
+
109
+
110
+ # [pairwise]
111
+ # - granularity: [fine, coarse]
112
+ # - ties_allowed: [yes, no]
113
+ # - position_swapping: [yes, no]
114
+ # - reference_model: [model_name]
115
+ # - criteria:
116
+ # - name
117
+ # - weight
118
+ # - description
119
+ # """
judging.py ADDED
@@ -0,0 +1,28 @@
1
+ from pydantic import BaseModel, Field, conint
2
+ from typing import List, Optional, Literal, Union
3
+
4
+
5
+ class Criteria(BaseModel):
6
+ name: str
7
+ description: str
8
+ min_score: conint(ge=0)
9
+ max_score: conint(ge=0)
10
+
11
+
12
+ class DirectAssessment(BaseModel):
13
+ type: Literal["direct_assessment"]
14
+ criteria: List[Criteria]
15
+ prompt: str
16
+
17
+
18
+ class PairwiseComparison(BaseModel):
19
+ type: Literal["pairwise_comparison"]
20
+ granularity: Literal["coarse", "fine", "super fine"]
21
+ ties_allowed: bool
22
+ position_swapping: bool
23
+ reference_model: str
24
+ prompt: str
25
+
26
+
27
+ class JudgingConfig(BaseModel):
28
+ assessment: Union[DirectAssessment, PairwiseComparison]
judging_dataclasses.py ADDED
@@ -0,0 +1,28 @@
1
+ from pydantic import BaseModel, Field, conint
2
+ from typing import List, Optional, Literal, Union
3
+
4
+
5
+ class Criteria(BaseModel):
6
+ name: str
7
+ description: str
8
+ min_score: conint(ge=0)
9
+ max_score: conint(ge=0)
10
+
11
+
12
+ class DirectAssessment(BaseModel):
13
+ type: Literal["direct_assessment"]
14
+ criteria: List[Criteria]
15
+ prompt: str
16
+
17
+
18
+ class PairwiseComparison(BaseModel):
19
+ type: Literal["pairwise_comparison"]
20
+ granularity: Literal["coarse", "fine", "super fine"]
21
+ ties_allowed: bool
22
+ position_swapping: bool
23
+ reference_model: str
24
+ prompt: str
25
+
26
+
27
+ class JudgingConfig(BaseModel):
28
+ assessment: Union[DirectAssessment, PairwiseComparison]
prompts.py ADDED
@@ -0,0 +1,150 @@
1
+ from judging_dataclasses import Criteria
2
+
3
+
4
+ DEFAULT_AGGREGATOR_PROMPT = """We are trying to come up with the best response to a user query based on an aggregation of other responses.
5
+
6
+ [USER PROMPT START]
7
+ {user_prompt}
8
+ [USER PROMPT END]
9
+
10
+ Responses from other LLMs:
11
+ {responses_from_other_llms}
12
+
13
+ Please provide a response that combines the best aspects of the responses above."""
14
+
15
+
16
+ DEFAULT_DIRECT_ASSESSMENT_PROMPT = """We are trying to assess the quality of a response to a user query.
17
+
18
+ [USER PROMPT START]
19
+ {user_prompt}
20
+ [USER PROMPT END]
21
+
22
+ The response is as follows:
23
+
24
+ [RESPONSE START]
25
+ {response}
26
+ [RESPONSE END]
27
+
28
+ Please evaluate the quality of the response based on the following criteria:
29
+
30
+ {criteria_list}
31
+
32
+ Options:
33
+ {options}
34
+
35
+ For each criterion, provide a short explanation describing how you would evaluate the response based on that criterion. Then, provide your final rating for that criterion."""
36
+
37
+ DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST = [
38
+ Criteria(
39
+ name="helpfulness",
40
+ description="Provides meaningful information and clear solutions that address the query.",
41
+ min_score=1,
42
+ max_score=7,
43
+ ),
44
+ Criteria(
45
+ name="relevance",
46
+ description="Stays on topic and directly relates to the query without unnecessary details.",
47
+ min_score=1,
48
+ max_score=7,
49
+ ),
50
+ Criteria(
51
+ name="conciseness",
52
+ description="Communicates clearly and efficiently, avoiding excess content while retaining substance.",
53
+ min_score=1,
54
+ max_score=7,
55
+ ),
56
+ ]
57
+
58
+ # 7-point likert scale.
59
+ SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS = [
60
+ "Strongly Disagree",
61
+ "Disagree",
62
+ "Slightly Disagree",
63
+ "Neither Agree Nor Disagree",
64
+ "Slightly Agree",
65
+ "Agree",
66
+ "Strongly Agree",
67
+ ]
68
+
69
+ # 6-point likert scale.
70
+ SIX_POINT_DIRECT_ASSESSMENT_OPTIONS = [
71
+ "Strongly Disagree",
72
+ "Disagree",
73
+ "Slightly Disagree",
74
+ "Slightly Agree",
75
+ "Agree",
76
+ "Strongly Agree",
77
+ ]
78
+
79
+ # 5-point likert scale.
80
+ FIVE_POINT_DIRECT_ASSESSMENT_OPTIONS = [
81
+ "Strongly Disagree",
82
+ "Disagree",
83
+ "Neither Agree Nor Disagree",
84
+ "Agree",
85
+ "Strongly Agree",
86
+ ]
87
+
88
+ # 4-point likert scale.
89
+ FOUR_POINT_DIRECT_ASSESSMENT_OPTIONS = [
90
+ "Strongly Disagree",
91
+ "Disagree",
92
+ "Agree",
93
+ "Strongly Agree",
94
+ ]
95
+
96
+ # 3-point likert scale.
97
+ THREE_POINT_DIRECT_ASSESSMENT_OPTIONS = [
98
+ "Disagree",
99
+ "Neither Agree Nor Disagree",
100
+ "Agree",
101
+ ]
102
+
103
+ # 2-point likert scale.
104
+ BINARY_DIRECT_ASSESSMENT_OPTIONS = [
105
+ "Disagree",
106
+ "Agree",
107
+ ]
108
+
109
+
110
+ DEFAULT_PAIRWISE_COMPARISON_PROMPT = """We are trying to compare the quality of two responses to a user query.
111
+
112
+ [USER PROMPT START]
113
+ {prompt}
114
+ [USER PROMPT END]
115
+
116
+ [RESPONSE A START]
117
+ {first_completion}
118
+ [RESPONSE A END]
119
+
120
+ [RESPONSE B START]
121
+ {second_completion}
122
+ [RESPONSE B END]
123
+
124
+ Begin your evaluation by comparing the two responses and provide a short explanation. Some themes to consider in your evaluation: {themes_to_consider}.
125
+
126
+ After providing your explanation, output your final verdict as one of the following options:
127
+ {pairwise_comparison_options}
128
+ """
129
+
130
+ DEFAULT_PAIRWISE_COMPARISON_THEMES_TO_CONSIDER = [
131
+ (
132
+ "helpfulness",
133
+ "Provides meaningful information and clear solutions that address the query.",
134
+ ),
135
+ (
136
+ "relevance",
137
+ "Stays on topic and directly relates to the query without unnecessary details.",
138
+ ),
139
+ (
140
+ "conciseness",
141
+ "Communicates clearly and efficiently, avoiding excess content while retaining substance.",
142
+ ),
143
+ ]
144
+
145
+ # COARSE WITH TIE.
146
+ DEFAULT_PAIRWISE_COMPARISON_OPTIONS = [
147
+ ("A>B", "Response A is better than Response B"),
148
+ ("B<A", "Response B is better than Response A"),
149
+ ("A=B", "Both responses are equally good"),
150
+ ]