justinxzhao committed
Commit 577870e · 1 parent: 3703473

Some refactoring, judging responses for direct assessment.
Files changed:
  app.py                  +227 -120
  app2.py                  +52   -0
  constants.py            +119   -0
  judging.py               +28   -0
  judging_dataclasses.py   +28   -0
  prompts.py              +150   -0
app.py (CHANGED)
@@ -7,6 +7,15 @@ import anthropic
from together import Together
import google.generativeai as genai
import time
+from typing import List, Optional, Literal, Union
+from constants import (
+    LLM_COUNCIL_MEMBERS,
+    PROVIDER_TO_AVATAR_MAP,
+    AGGREGATORS,
+)
+from prompts import *
+from judging_dataclasses import *
+

dotenv.load_dotenv()

@@ -31,31 +40,6 @@ openai_client = OpenAI(
# anthropic_client = anthropic.Client(api_key=ANTHROPIC_API_KEY)
anthropic_client = anthropic.Anthropic()

-LLM_COUNCIL_MEMBERS = {
-    "Smalls": [
-        "openai://gpt-4o-mini",
-        "together://meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
-        "vertex://gemini-1.5-flash-001",
-        "anthropic://claude-3-haiku-20240307",
-    ],
-    "Flagships": [
-        "openai://gpt-4",
-        "together://meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
-        "vertex://gemini-1.5-pro-001",
-        "anthropic://claude-3-5-sonnet",
-    ],
-}
-
-PROVIDER_TO_AVATAR_MAP = {
"openai://gpt-4o-mini": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIwLjk5ZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjU2IDI2MCI+PHBhdGggZD0iTTIzOS4xODQgMTA2LjIwM2E2NC43MiA2NC43MiAwIDAgMC01LjU3Ni01My4xMDNDMjE5LjQ1MiAyOC40NTkgMTkxIDE1Ljc4NCAxNjMuMjEzIDIxLjc0QTY1LjU4NiA2NS41ODYgMCAwIDAgNTIuMDk2IDQ1LjIyYTY0LjcyIDY0LjcyIDAgMCAwLTQzLjIzIDMxLjM2Yy0xNC4zMSAyNC42MDItMTEuMDYxIDU1LjYzNCA4LjAzMyA3Ni43NGE2NC42NyA2NC42NyAwIDAgMCA1LjUyNSA1My4xMDJjMTQuMTc0IDI0LjY1IDQyLjY0NCAzNy4zMjQgNzAuNDQ2IDMxLjM2YTY0LjcyIDY0LjcyIDAgMCAwIDQ4Ljc1NCAyMS43NDRjMjguNDgxLjAyNSA1My43MTQtMTguMzYxIDYyLjQxNC00NS40ODFhNjQuNzcgNjQuNzcgMCAwIDAgNDMuMjI5LTMxLjM2YzE0LjEzNy0yNC41NTggMTAuODc1LTU1LjQyMy04LjA4My03Ni40ODNtLTk3LjU2IDEzNi4zMzhhNDguNCA0OC40IDAgMCAxLTMxLjEwNS0xMS4yNTVsMS41MzUtLjg3bDUxLjY3LTI5LjgyNWE4LjYgOC42IDAgMCAwIDQuMjQ3LTcuMzY3di03Mi44NWwyMS44NDUgMTIuNjM2Yy4yMTguMTExLjM3LjMyLjQwOS41NjN2NjAuMzY3Yy0uMDU2IDI2LjgxOC0yMS43ODMgNDguNTQ1LTQ4LjYwMSA0OC42MDFNMzcuMTU4IDE5Ny45M2E0OC4zNSA0OC4zNSAwIDAgMS01Ljc4MS0zMi41ODlsMS41MzQuOTIxbDUxLjcyMiAyOS44MjZhOC4zNCA4LjM0IDAgMCAwIDguNDQxIDBsNjMuMTgxLTM2LjQyNXYyNS4yMjFhLjg3Ljg3IDAgMCAxLS4zNTguNjY1bC01Mi4zMzUgMzAuMTg0Yy0yMy4yNTcgMTMuMzk4LTUyLjk3IDUuNDMxLTY2LjQwNC0xNy44MDNNMjMuNTQ5IDg1LjM4YTQ4LjUgNDguNSAwIDAgMSAyNS41OC0yMS4zMzN2NjEuMzlhOC4yOSA4LjI5IDAgMCAwIDQuMTk1IDcuMzE2bDYyLjg3NCAzNi4yNzJsLTIxLjg0NSAxMi42MzZhLjgyLjgyIDAgMCAxLS43NjcgMEw0MS4zNTMgMTUxLjUzYy0yMy4yMTEtMTMuNDU0LTMxLjE3MS00My4xNDQtMTcuODA0LTY2LjQwNXptMTc5LjQ2NiA0MS42OTVsLTYzLjA4LTM2LjYzTDE2MS43MyA3Ny44NmEuODIuODIgMCAwIDEgLjc2OCAwbDUyLjIzMyAzMC4xODRhNDguNiA0OC42IDAgMCAxLTcuMzE2IDg3LjYzNXYtNjEuMzkxYTguNTQgOC41NCAwIDAgMC00LjQtNy4yMTNtMjEuNzQyLTMyLjY5bC0xLjUzNS0uOTIybC01MS42MTktMzAuMDgxYTguMzkgOC4zOSAwIDAgMC04LjQ5MiAwTDk5Ljk4IDk5LjgwOFY3NC41ODdhLjcyLjcyIDAgMCAxIC4zMDctLjY2NWw1Mi4yMzMtMzAuMTMzYTQ4LjY1MiA0OC42NTIgMCAwIDEgNzIuMjM2IDUwLjM5MXpNODguMDYxIDEzOS4wOTdsLTIxLjg0NS0xMi41ODVhLjg3Ljg3IDAgMCAxLS40MS0uNjE0VjY1LjY4NWE0OC42NTIgNDguNjUyIDAgMCAxIDc5Ljc1Ny0zNy4zNDZsLTEuNTM1Ljg3bC01MS42NyAyOS44MjVhOC42IDguNiAwIDAgMC00LjI0NiA3LjM2N3ptMTEuODY4LTI1LjU4TDEyOC4wNjcgOTcuM2wyOC4xODggMTYuMjE4djMyLjQzNGwtMjguMDg2IDE2LjIxOGwtMjguMTg4LTE2LjIxOHoiLz48L3N2Zz4=",
"anthropic://claude-3-5-sonnet": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjQgMjQiPjxwYXRoIGZpbGw9ImN1cnJlbnRDb2xvciIgZD0iTTE3LjMwNCAzLjU0MWgtMy42NzJsNi42OTYgMTYuOTE4SDI0Wm0tMTAuNjA4IDBMMCAyMC40NTloMy43NDRsMS4zNy0zLjU1M2g3LjAwNWwxLjM2OSAzLjU1M2gzLjc0NEwxMC41MzYgMy41NDFabS0uMzcxIDEwLjIyM0w4LjYxNiA3LjgybDIuMjkxIDUuOTQ1WiIvPjwvc3ZnPg==",
"vertex://gemini-1.5-flash-001": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjQgMjQiPjxwYXRoIGZpbGw9IiM0MjY4ZmYiIGQ9Ik0yNCAxMi4wMjRjLTYuNDM3LjM4OC0xMS41OSA1LjUzOS0xMS45NzcgMTEuOTc2aC0uMDQ3QzExLjU4OCAxNy41NjMgNi40MzYgMTIuNDEyIDAgMTIuMDI0di0uMDQ3QzYuNDM3IDExLjU4OCAxMS41ODggNi40MzcgMTEuOTc2IDBoLjA0N2MuMzg4IDYuNDM3IDUuNTQgMTEuNTg4IDExLjk3NyAxMS45Nzd6Ii8+PC9zdmc+",
"together://meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMzIgMzIiPjxnIGZpbGw9Im5vbmUiPjxwYXRoIGZpbGw9IiNiNGFjYmMiIGQ9Ik0yMC44NzEgMjQuNzh2LTYuMDZoMy4wMXY1Ljc3YzAgLjM0LS4xMi42Mi0uMzEuOTRsLTIuNDEgNC4yYy0uMTguMjMtLjQ1LjM3LS43NS4zN2gtMS4wM2MtLjIzIDAtLjM4LS4yNC0uMjgtLjQ1bDEuNjctNC4zNWMuMDctLjEzLjEtLjI3LjEtLjQyTTE3LjA5MSAzMGMuMiAwIC4yNi0uMjEuMjItLjM4bC0yLjMyLTguNjFoLTIuOTlsLjg1IDMuNTVjLjE5LjcxLjY2IDEuMzIgMS4zIDEuNjljLjE0LjA4LjI1LjIyLjI5LjM4bC44NyAzLjE0Yy4wNC4xNy4yMS4yMi4zOC4yMnoiLz48cGF0aCBmaWxsPSIjY2RjNGQ2IiBkPSJtMjguNTQxIDIzLjA4bC0xLjI3LS45NmEuOTQuOTQgMCAwIDEtLjI3LS42NnYtMi43NWMwLS42NC0uMTYtMS4yMzgtLjQ0Mi0xLjc2cS4yMTMuMDUuNDQyLjA1YTIgMiAwIDEgMC0xLjk0OS0xLjU0OWEzLjggMy44IDAgMCAwLTEuOC0uNDUxaC04LjE3Yy0uNjYgMC0xLjI3LS40Mi0xLjU3LTEuMDFMMTAuMDQxIDMuNWEyLjIzIDIuMjMgMCAwIDAtMi4xLTEuNWMtLjE4IDAtLjMuMTctLjI0LjM0TDguNTcxIDVjLS4yIDAtMS4wNy4yMy0xLjg1LjczbC0uODA2LjQ5OEw3LjAwMiAxMHY4LjI2YzAgMi4wMSAxLjI1IDMuNzIgMy4wMSA0LjQxdjcuMDJjLS4wMS4xNy4xMy4zMS4zLjMxaDEuMzdjLjE4IDAgLjMyLS4xNC4zMi0uMzF2LTEuOTZjMC0uMTcuMDctLjMyLjE4LS40NGMuNTMtLjUyLjgyLTEuMjMuODItMS45N1YyM2g1LjA3YzEuMjcgMCAyLjQ5LS41NSAzLjMzLTEuNWMwIC45NC40OCAxLjcyIDEuMzggMi4zMmwzLjk2IDIuNDNjLjE2LjExLjI2LjMuMjYuNXYyLjkzYzAgLjE3LjE0LjMxLjMxLjMxaDEuMzdjLjE3IDAgLjMxLS4xNC4zMS0uMzF2LTUuNTFjLjAxLS40LS4xNS0uOC0uNDUtMS4wOSIvPjxwYXRoIGZpbGw9IiNmM2FkNjEiIGQ9Ik02Ljg0MSA2Yy0uMzYgMC0uNzIuMS0xLjAzLjI5bC0yLjE5IDEuMzVjLS4zNi4yMy0uNi42MS0uNjIgMS4wM2MtLjAzLjczLjU1IDEuMzMgMS4yNyAxLjMzaDMuNDljLjU3IDAgMS4wNC0uNDcgMS4wNC0xLjA1di0xYzAtMS4wNy0uODgtMS45NS0xLjk2LTEuOTUiLz48cGF0aCBmaWxsPSIjMWMxYzFjIiBkPSJNNi41IDhhLjUuNSAwIDEgMCAwLTFhLjUuNSAwIDAgMCAwIDFtLTEuOTk5LjVjMC0uMjgtLjIyLS41LS41LS41aC0uNzZhMS4yIDEuMiAwIDAgMC0uMjEgMWguOTdjLjI4IDAgLjUtLjIyLjUtLjUiLz48cGF0aCBmaWxsPSIjZjNhZDYxIiBkPSJNMjguOTkxIDI4aC0xLjk5djEuNjhjMCAuMTcuMTQuMzEuMzEuMzFoMS4zN2MuMTcgMCAuMzEtLjE0LjMxLS4zMXptLTE2Ljk5IDBoLTEuOTl2MS42OWMtLjAxLjE3LjEzLjMxLjMuMzFoMS4zN2MuMTggMCAuMzItLjE0LjMyLS4zMXptNS4wODggMmwtMS4zOTgtLjAxYy0uMTcgMC0uMzQtLjA1LS4zOC0uMjJsLS40OS0xLjc3aDIuMDU0bC40MzYgMS42MmMuMDQuMTctLjAyLjM3OC0uMjE2LjM4em0yLjI4OCAwYS4zMTMuMzEzIDAgMCAxLS4yNzYtLjQ1bC41OTUtMS41NWgyLjRsLS45MzUgMS42M2EuOTUuOTUgMCAwIDEtLjc0Mi4zN3oiLz48L2c+PC9zdmc+",
"anthropic://claude-3-haiku-20240307": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjQgMjQiPjxwYXRoIGZpbGw9ImN1cnJlbnRDb2xvciIgZD0iTTE3LjMwNCAzLjU0MWgtMy42NzJsNi42OTYgMTYuOTE4SDI0Wm0tMTAuNjA4IDBMMCAyMC40NTloMy43NDRsMS4zNy0zLjU1M2g3LjAwNWwxLjM2OSAzLjU1M2gzLjc0NEwxMC41MzYgMy41NDFabS0uMzcxIDEwLjIyM0w4LjYxNiA3LjgybDIuMjkxIDUuOTQ1WiIvPjwvc3ZnPg==",
-}
-
-AGGREGATORS = ["openai://gpt-4o-mini", "openai://gpt-4o"]
-

def anthropic_streamlit_streamer(stream):
    """
@@ -88,6 +72,15 @@ def together_streamlit_streamer(stream):
        yield chunk.choices[0].delta.content


+def llm_streamlit_streamer(stream, llm):
+    if llm.startswith("anthropic"):
+        return anthropic_streamlit_streamer(stream)
+    elif llm.startswith("vertex"):
+        return google_streamlit_streamer(stream)
+    elif llm.startswith("together"):
+        return together_streamlit_streamer(stream)
+
+
# Helper functions for LLM council and aggregator selection
def llm_council_selector():
    selected_council = st.radio(

@@ -133,16 +126,17 @@ def get_google_response(model_name, prompt):
    return model.generate_content(prompt, stream=True)


-def
+def get_llm_response_stream(model_identifier, prompt):
+    """Returns a streamlit-friendly stream of response tokens from the LLM."""
    provider, model_name = model_identifier.split("://")
    if provider == "openai":
        return get_openai_response(model_name, prompt)
    elif provider == "anthropic":
-        return get_anthropic_response(model_name, prompt)
+        return anthropic_streamlit_streamer(get_anthropic_response(model_name, prompt))
    elif provider == "together":
-        return get_together_response(model_name, prompt)
+        return together_streamlit_streamer(get_together_response(model_name, prompt))
    elif provider == "vertex":
-        return get_google_response(model_name, prompt)
+        return google_streamlit_streamer(get_google_response(model_name, prompt))
    else:
        return None

@@ -151,6 +145,97 @@ def get_response_key(model):
    return model + ".response"


+def get_model_from_response_key(response_key):
+    return response_key.split(".")[0]
+
+
+def get_judging_key(judge_model, response_model):
+    return "judge." + judge_model + "." + response_model
+
+
+def get_aggregator_response_key(model):
+    return model + ".aggregator_response"
+
+
+# Streamlit form UI
+def render_criteria_form(criteria_num):
+    """Render a criteria input form."""
+    with st.expander(f"Criteria {criteria_num + 1}"):
+        name = st.text_input(f"Name for Criteria {criteria_num + 1}")
+        description = st.text_area(f"Description for Criteria {criteria_num + 1}")
+        min_score = st.number_input(
+            f"Min Score for Criteria {criteria_num + 1}", min_value=0, step=1
+        )
+        max_score = st.number_input(
+            f"Max Score for Criteria {criteria_num + 1}", min_value=0, step=1
+        )
+        return Criteria(
+            name=name, description=description, min_score=min_score, max_score=max_score
+        )
+
+
+def get_response_mapping():
+    # Inspect the session state for all the responses.
+    # This is a dictionary mapping model names to their responses.
+    # The aggregator response is also included in this mapping under the key "<model>.aggregator_response".
+    response_mapping = {}
+    for key in st.session_state.keys():
+        if key.endswith(".response"):
+            response_mapping[get_model_from_response_key(key)] = st.session_state[key]
+        if key.endswith(".aggregator_response"):
+            response_mapping[key] = st.session_state[key]
+    return response_mapping
+
+
+def format_likert_comparison_options(options):
+    return "\n".join([f"{i + 1}: {option}" for i, option in enumerate(options)])
+
+
+def format_criteria_list(criteria_list):
+    return "\n".join(
+        [f"{criteria.name}: {criteria.description}" for criteria in criteria_list]
+    )
+
+
+def get_direct_assessment_prompt(
+    direct_assessment_prompt, user_prompt, response, criteria_list, options
+):
+    return direct_assessment_prompt.format(
+        user_prompt=user_prompt,
+        response=response,
+        criteria_list=f"{format_criteria_list(DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST)}",
+        options=f"{format_likert_comparison_options(SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS)}",
+    )
+
+
+def get_default_direct_assessment_prompt(user_prompt):
+    return get_direct_assessment_prompt(
+        DEFAULT_DIRECT_ASSESSMENT_PROMPT,
+        user_prompt=user_prompt,
+        response="{{response}}",
+        criteria_list=DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST,
+        options=SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS,
+    )
+
+
+def get_aggregator_prompt(aggregator_prompt, user_prompt, llms):
+    responses_from_other_llms = "\n\n".join(
+        [f"{model}: {st.session_state.get(get_response_key(model))}" for model in llms]
+    )
+    return aggregator_prompt.format(
+        user_prompt=user_prompt,
+        responses_from_other_llms=responses_from_other_llms,
+    )
+
+
+def get_default_aggregator_prompt(user_prompt, llms):
+    return get_aggregator_prompt(
+        DEFAULT_AGGREGATOR_PROMPT,
+        user_prompt=user_prompt,
+        llms=llms,
+    )
+
+
# Main Streamlit App
def main():
    st.set_page_config(

@@ -183,7 +268,7 @@ def main():
    # App title and description
    st.title("Language Model Council Sandbox")
    st.markdown("###### Invoke a council of LLMs to generate and judge each other.")
-    st.markdown("###### [
+    st.markdown("###### [Paper](https://arxiv.org/abs/2406.08598)")

    # Authentication system
    if "authenticated" not in st.session_state:

@@ -206,60 +291,151 @@ def main():
        selected_models = llm_council_selector()
        st.write("Selected Models:", selected_models)
        selected_aggregator = aggregator_selector()
-        st.write("Selected Aggregator:", selected_aggregator)
+        # st.write("Selected Aggregator:", selected_aggregator)

        # Prompt input
-
+        user_prompt = st.text_area("Enter your prompt:")

        if st.button("Submit"):
            st.write("Responses:")

            # Fetching and streaming responses from each selected model
+            # TODO: Make this asynchronous?
            for model in selected_models:
-                # with st.chat_message(model):
                with st.chat_message(
                    model,
                    avatar=PROVIDER_TO_AVATAR_MAP[model],
                ):
                    message_placeholder = st.empty()
-                    stream =
+                    stream = get_llm_response_stream(model, user_prompt)
                    if stream:
-                        if model.startswith("anthropic"):
-                            stream = anthropic_streamlit_streamer(stream)
-                        elif model.startswith("vertex"):
-                            stream = google_streamlit_streamer(stream)
-                        elif model.startswith("together"):
-                            stream = together_streamlit_streamer(stream)
                        st.session_state[get_response_key(model)] = (
                            message_placeholder.write_stream(stream)
                        )

-            #
-            aggregator_prompt =
-
-            aggregator_prompt += "\n".join(
-                [
-                    f"{model}: {st.session_state.get(get_response_key(model))} \n\n"
-                    for model in selected_models
-                ]
+            # Get the aggregator prompt.
+            aggregator_prompt = get_default_aggregator_prompt(
+                user_prompt=user_prompt, llms=selected_models
            )
-            aggregator_prompt += "\n\nPlease provide an aggregated response."

            with st.expander("Aggregator Prompt"):
                st.write(aggregator_prompt)

            # Fetching and streaming response from the aggregator
-            st.write(f"
+            st.write(f"Mixture-of-Agents response from {selected_aggregator}:")
            with st.chat_message(
                selected_aggregator,
                avatar=PROVIDER_TO_AVATAR_MAP[selected_aggregator],
            ):
                message_placeholder = st.empty()
-                aggregator_stream =
+                aggregator_stream = get_llm_response_stream(
                    selected_aggregator, aggregator_prompt
                )
                if aggregator_stream:
-                    message_placeholder.write_stream(aggregator_stream)
+                    st.session_state[
+                        get_aggregator_response_key(selected_aggregator)
+                    ] = message_placeholder.write_stream(aggregator_stream)
+
+            # Judging.
+            st.markdown("#### Judging Configuration Form")
+
+            # Choose the type of assessment
+            assessment_type = st.radio(
+                "Select the type of assessment",
+                options=["Direct Assessment", "Pairwise Comparison"],
+            )
+
+            # Depending on the assessment type, render different forms
+            if assessment_type == "Direct Assessment":
+                direct_assessment_prompt = st.text_area(
+                    "Prompt for the Direct Assessment",
+                    value=get_default_direct_assessment_prompt(user_prompt=user_prompt),
+                    height=500,
+                )
+
+                # TODO: Add option to edit criteria list with a basic text field.
+                criteria_list = DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST
+
+                # Create DirectAssessment object when form is submitted
+                if st.button("Submit Direct Assessment"):
+
+                    # Submit direct assessment.
+                    responses_for_judging = get_response_mapping()
+
+                    response_judging_columns = st.columns(3)
+
+                    responses_for_judging_to_streamlit_column_index_map = {
+                        model: i % 3
+                        for i, model in enumerate(responses_for_judging.keys())
+                    }
+
+                    # Get judging responses.
+                    for response_model, response in responses_for_judging.items():
+
+                        st_column = response_judging_columns[
+                            responses_for_judging_to_streamlit_column_index_map[
+                                response_model
+                            ]
+                        ]
+
+                        with st_column:
+
+                            st.write(f"Judging {response_model}")
+                            judging_prompt = get_direct_assessment_prompt(
+                                direct_assessment_prompt,
+                                user_prompt,
+                                response,
+                                criteria_list,
+                                SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS,
+                            )
+
+                            for judging_model in selected_models:
+                                with st.expander("Detailed assessments", expanded=True):
+                                    with st.chat_message(
+                                        judging_model,
+                                        avatar=PROVIDER_TO_AVATAR_MAP[judging_model],
+                                    ):
+                                        st.write(f"Judge: {judging_model}")
+                                        message_placeholder = st.empty()
+                                        judging_stream = get_llm_response_stream(
+                                            judging_model, judging_prompt
+                                        )
+                                        if judging_stream:
+                                            st.session_state[
+                                                get_judging_key(
+                                                    judging_model, response_model
+                                                )
+                                            ] = message_placeholder.write_stream(
+                                                judging_stream
+                                            )
+                    # When all of the judging is finished for the given response, get the actual
+                    # values, parsed (use gpt-4o-mini for now) with json mode.
+                    # TODO.
+
+            elif assessment_type == "Pairwise Comparison":
+                pairwise_comparison_prompt = st.text_area(
+                    "Prompt for the Pairwise Comparison"
+                )
+                granularity = st.selectbox("Granularity", ["coarse", "fine", "super fine"])
+                ties_allowed = st.checkbox("Are ties allowed?")
+                position_swapping = st.checkbox("Enable position swapping?")
+                reference_model = st.text_input("Reference Model")
+
+                # Create PairwiseComparison object when form is submitted
+                if st.button("Submit Pairwise Comparison"):
+                    pairwise_comparison_config = PairwiseComparison(
+                        type="pairwise_comparison",
+                        granularity=granularity,
+                        ties_allowed=ties_allowed,
+                        position_swapping=position_swapping,
+                        reference_model=reference_model,
+                        prompt=pairwise_comparison_prompt,
+                    )
+                    st.success(f"Pairwise Comparison Created: {pairwise_comparison_config}")
+                    # Submit pairwise comparison.
+                    responses_for_judging = get_response_mapping()
+
    else:
        with cols[1]:
            st.warning("Please log in to access this app.")

@@ -267,72 +443,3 @@ def main():

if __name__ == "__main__":
    main()
-
-
-# Fix the aggregator step.
-# Add a judging step.
-# Add visualizations.
-
-
-# import streamlit as st
-# from components import llm_council_selector
-
-# st.title("LLM Council Selector")
-
-# selected_models = llm_council_selector()
-
-# if selected_models is not None:
-#     st.write("Selected Models:", selected_models)
-# else:
-#     st.write("No models selected or component didn't return a value.")
-
-
-# Choose your council.
-# Pre-selected.
-#   Smalls: GPT-4o-mini, llama-3.1-70b, qwen-2.0-70b
-#   Flagships: GPT-4o, llama-3.1-405b, qwen-2.0-110b, gemini, claude-3.5-sonnet
-#   Best: chatgpt-4o-latest, gemini-1.5-pro-exp-0827, grok-2-2024-08-13, claude-3-5-sonnet-20240620, llama-3.1-405b-instruct
-# Custom:
-#   Choose from a list of available models.
-# All:
-#   All available models.
-
-# Choose aggregator.
-# Aggregators are models proficient in synthesizing responses from other models into a single, high-quality output. An effective aggregator should maintain or enhance output quality even when
-# integrating inputs that are of lesser quality than its own.
-# Choices:
-# - 4o-latest
-# - gemini-1.5
-# - grok-2
-# - claude-3.5-sonnet
-# - llama-3.1-405b-instruct
-
-# Provide a prompt. (Or pre-canned prompts.)
-# Paste chat history.
-
-# Checkbox, enable judging.
-#
-# If checked, judging config:
-# Single sided:
-#   Provide criteria (or default).
-# If pairwise, choose granularity (or default).
-#   Choose criteria (or default).
-#   Enable position swapping?
-
-# Go button.
-# Sections.
-# 1. Model outputs.
-# 2. Aggregated output.
-# 3. Judging underneath each output.
-# Highlight in green the output that was best, as determined by the council.
-# Show graph breakdown of scores and justifications (by criteria, # wins and # losses).
-# Show final overall score.
-# Highlight in red the output that was worst, as determined by the council.
-# Judging section.
-# Show agreement matrix.
-# Show bar graph of self-bias.
-# Plot contrarianism vs. conviction (scatter plot).
-# Show per-judge scores.
-
-# Calculate total cost.
-# Show total tokens used.
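Note: the direct-assessment flow above ends with a TODO: once judging finishes, parse each judge's free-text ratings with gpt-4o-mini in JSON mode. A minimal sketch of what that step could look like, reusing the commit's openai_client and the Criteria model; parse_judging_response is a hypothetical helper, not part of this commit:

import json

def parse_judging_response(judging_text, criteria_list):
    # Hypothetical sketch: ask gpt-4o-mini (JSON mode) to extract the final
    # integer rating per criterion from a judge's free-text assessment.
    criteria_names = [criteria.name for criteria in criteria_list]
    completion = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        response_format={"type": "json_object"},
        messages=[
            {
                "role": "user",
                "content": (
                    "Extract the final rating for each criterion from the "
                    f"assessment below, as a JSON object with keys {criteria_names} "
                    "and integer values.\n\n" + judging_text
                ),
            }
        ],
    )
    return json.loads(completion.choices[0].message.content)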
app2.py (ADDED)
@@ -0,0 +1,52 @@
+import os
+
+import google_auth_oauthlib.flow
+from googleapiclient.discovery import build
+import streamlit as st
+import webbrowser
+
+
+redirect_uri = os.environ.get("REDIRECT_URI", "http://localhost:8502/")
+
+
+def auth_flow():
+    st.write("Welcome to My App!")
+    auth_code = st.query_params.get("code")
+    flow = google_auth_oauthlib.flow.Flow.from_client_secrets_file(
+        "client_secret.json",  # replace with your JSON credentials from your Google auth app
+        scopes=["https://www.googleapis.com/auth/userinfo.email", "openid"],
+        redirect_uri=redirect_uri,
+    )
+    if auth_code:
+        flow.fetch_token(code=auth_code)
+        credentials = flow.credentials
+        st.write("Login Done")
+        user_info_service = build(
+            serviceName="oauth2",
+            version="v2",
+            credentials=credentials,
+        )
+        user_info = user_info_service.userinfo().get().execute()
+        assert user_info.get("email"), "Email not found in infos"
+        st.session_state["google_auth_code"] = auth_code
+        st.session_state["user_info"] = user_info
+    else:
+        if st.button("Sign in with Google"):
+            authorization_url, state = flow.authorization_url(
+                access_type="offline",
+                include_granted_scopes="true",
+            )
+            webbrowser.open_new_tab(authorization_url)
+
+
+def main():
+    if "google_auth_code" not in st.session_state:
+        auth_flow()
+
+    if "google_auth_code" in st.session_state:
+        email = st.session_state["user_info"].get("email")
+        st.write(f"Hello {email}")
+
+
+if __name__ == "__main__":
+    main()
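Note: auth_flow() reads OAuth credentials from client_secret.json in the working directory. For a web-application client created in the Google Cloud console, the downloaded file has roughly this shape (all values here are placeholders):

{
  "web": {
    "client_id": "<client-id>.apps.googleusercontent.com",
    "project_id": "<project-id>",
    "auth_uri": "https://accounts.google.com/o/oauth2/auth",
    "token_uri": "https://oauth2.googleapis.com/token",
    "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
    "client_secret": "<client-secret>",
    "redirect_uris": ["http://localhost:8502/"]
  }
}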
constants.py (ADDED)
@@ -0,0 +1,119 @@
+LLM_COUNCIL_MEMBERS = {
+    "Smalls": [
+        # "openai://gpt-4o-mini",
+        "together://meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+        "together://meta-llama/Llama-3.2-3B-Instruct-Turbo",
+        # "vertex://gemini-1.5-flash-001",
+        # "anthropic://claude-3-haiku-20240307",
+    ],
+    "Flagships": [
+        "openai://gpt-4o",
+        "together://meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+        "vertex://gemini-1.5-pro-001",
+        "anthropic://claude-3-5-sonnet",
+    ],
+}
+
+PROVIDER_TO_AVATAR_MAP = {
"openai://gpt-4o-mini": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIwLjk5ZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjU2IDI2MCI+PHBhdGggZD0iTTIzOS4xODQgMTA2LjIwM2E2NC43MiA2NC43MiAwIDAgMC01LjU3Ni01My4xMDNDMjE5LjQ1MiAyOC40NTkgMTkxIDE1Ljc4NCAxNjMuMjEzIDIxLjc0QTY1LjU4NiA2NS41ODYgMCAwIDAgNTIuMDk2IDQ1LjIyYTY0LjcyIDY0LjcyIDAgMCAwLTQzLjIzIDMxLjM2Yy0xNC4zMSAyNC42MDItMTEuMDYxIDU1LjYzNCA4LjAzMyA3Ni43NGE2NC42NyA2NC42NyAwIDAgMCA1LjUyNSA1My4xMDJjMTQuMTc0IDI0LjY1IDQyLjY0NCAzNy4zMjQgNzAuNDQ2IDMxLjM2YTY0LjcyIDY0LjcyIDAgMCAwIDQ4Ljc1NCAyMS43NDRjMjguNDgxLjAyNSA1My43MTQtMTguMzYxIDYyLjQxNC00NS40ODFhNjQuNzcgNjQuNzcgMCAwIDAgNDMuMjI5LTMxLjM2YzE0LjEzNy0yNC41NTggMTAuODc1LTU1LjQyMy04LjA4My03Ni40ODNtLTk3LjU2IDEzNi4zMzhhNDguNCA0OC40IDAgMCAxLTMxLjEwNS0xMS4yNTVsMS41MzUtLjg3bDUxLjY3LTI5LjgyNWE4LjYgOC42IDAgMCAwIDQuMjQ3LTcuMzY3di03Mi44NWwyMS44NDUgMTIuNjM2Yy4yMTguMTExLjM3LjMyLjQwOS41NjN2NjAuMzY3Yy0uMDU2IDI2LjgxOC0yMS43ODMgNDguNTQ1LTQ4LjYwMSA0OC42MDFNMzcuMTU4IDE5Ny45M2E0OC4zNSA0OC4zNSAwIDAgMS01Ljc4MS0zMi41ODlsMS41MzQuOTIxbDUxLjcyMiAyOS44MjZhOC4zNCA4LjM0IDAgMCAwIDguNDQxIDBsNjMuMTgxLTM2LjQyNXYyNS4yMjFhLjg3Ljg3IDAgMCAxLS4zNTguNjY1bC01Mi4zMzUgMzAuMTg0Yy0yMy4yNTcgMTMuMzk4LTUyLjk3IDUuNDMxLTY2LjQwNC0xNy44MDNNMjMuNTQ5IDg1LjM4YTQ4LjUgNDguNSAwIDAgMSAyNS41OC0yMS4zMzN2NjEuMzlhOC4yOSA4LjI5IDAgMCAwIDQuMTk1IDcuMzE2bDYyLjg3NCAzNi4yNzJsLTIxLjg0NSAxMi42MzZhLjgyLjgyIDAgMCAxLS43NjcgMEw0MS4zNTMgMTUxLjUzYy0yMy4yMTEtMTMuNDU0LTMxLjE3MS00My4xNDQtMTcuODA0LTY2LjQwNXptMTc5LjQ2NiA0MS42OTVsLTYzLjA4LTM2LjYzTDE2MS43MyA3Ny44NmEuODIuODIgMCAwIDEgLjc2OCAwbDUyLjIzMyAzMC4xODRhNDguNiA0OC42IDAgMCAxLTcuMzE2IDg3LjYzNXYtNjEuMzkxYTguNTQgOC41NCAwIDAgMC00LjQtNy4yMTNtMjEuNzQyLTMyLjY5bC0xLjUzNS0uOTIybC01MS42MTktMzAuMDgxYTguMzkgOC4zOSAwIDAgMC04LjQ5MiAwTDk5Ljk4IDk5LjgwOFY3NC41ODdhLjcyLjcyIDAgMCAxIC4zMDctLjY2NWw1Mi4yMzMtMzAuMTMzYTQ4LjY1MiA0OC42NTIgMCAwIDEgNzIuMjM2IDUwLjM5MXpNODguMDYxIDEzOS4wOTdsLTIxLjg0NS0xMi41ODVhLjg3Ljg3IDAgMCAxLS40MS0uNjE0VjY1LjY4NWE0OC42NTIgNDguNjUyIDAgMCAxIDc5Ljc1Ny0zNy4zNDZsLTEuNTM1Ljg3bC01MS42NyAyOS44MjVhOC42IDguNiAwIDAgMC00LjI0NiA3LjM2N3ptMTEuODY4LTI1LjU4TDEyOC4wNjcgOTcuM2wyOC4xODggMTYuMjE4djMyLjQzNGwtMjguMDg2IDE2LjIxOGwtMjguMTg4LTE2LjIxOHoiLz48L3N2Zz4=",
"anthropic://claude-3-5-sonnet": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjQgMjQiPjxwYXRoIGZpbGw9ImN1cnJlbnRDb2xvciIgZD0iTTE3LjMwNCAzLjU0MWgtMy42NzJsNi42OTYgMTYuOTE4SDI0Wm0tMTAuNjA4IDBMMCAyMC40NTloMy43NDRsMS4zNy0zLjU1M2g3LjAwNWwxLjM2OSAzLjU1M2gzLjc0NEwxMC41MzYgMy41NDFabS0uMzcxIDEwLjIyM0w4LjYxNiA3LjgybDIuMjkxIDUuOTQ1WiIvPjwvc3ZnPg==",
"vertex://gemini-1.5-flash-001": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjQgMjQiPjxwYXRoIGZpbGw9IiM0MjY4ZmYiIGQ9Ik0yNCAxMi4wMjRjLTYuNDM3LjM4OC0xMS41OSA1LjUzOS0xMS45NzcgMTEuOTc2aC0uMDQ3QzExLjU4OCAxNy41NjMgNi40MzYgMTIuNDEyIDAgMTIuMDI0di0uMDQ3QzYuNDM3IDExLjU4OCAxMS41ODggNi40MzcgMTEuOTc2IDBoLjA0N2MuMzg4IDYuNDM3IDUuNTQgMTEuNTg4IDExLjk3NyAxMS45Nzd6Ii8+PC9zdmc+",
"together://meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMzIgMzIiPjxnIGZpbGw9Im5vbmUiPjxwYXRoIGZpbGw9IiNiNGFjYmMiIGQ9Ik0yMC44NzEgMjQuNzh2LTYuMDZoMy4wMXY1Ljc3YzAgLjM0LS4xMi42Mi0uMzEuOTRsLTIuNDEgNC4yYy0uMTguMjMtLjQ1LjM3LS43NS4zN2gtMS4wM2MtLjIzIDAtLjM4LS4yNC0uMjgtLjQ1bDEuNjctNC4zNWMuMDctLjEzLjEtLjI3LjEtLjQyTTE3LjA5MSAzMGMuMiAwIC4yNi0uMjEuMjItLjM4bC0yLjMyLTguNjFoLTIuOTlsLjg1IDMuNTVjLjE5LjcxLjY2IDEuMzIgMS4zIDEuNjljLjE0LjA4LjI1LjIyLjI5LjM4bC44NyAzLjE0Yy4wNC4xNy4yMS4yMi4zOC4yMnoiLz48cGF0aCBmaWxsPSIjY2RjNGQ2IiBkPSJtMjguNTQxIDIzLjA4bC0xLjI3LS45NmEuOTQuOTQgMCAwIDEtLjI3LS42NnYtMi43NWMwLS42NC0uMTYtMS4yMzgtLjQ0Mi0xLjc2cS4yMTMuMDUuNDQyLjA1YTIgMiAwIDEgMC0xLjk0OS0xLjU0OWEzLjggMy44IDAgMCAwLTEuOC0uNDUxaC04LjE3Yy0uNjYgMC0xLjI3LS40Mi0xLjU3LTEuMDFMMTAuMDQxIDMuNWEyLjIzIDIuMjMgMCAwIDAtMi4xLTEuNWMtLjE4IDAtLjMuMTctLjI0LjM0TDguNTcxIDVjLS4yIDAtMS4wNy4yMy0xLjg1LjczbC0uODA2LjQ5OEw3LjAwMiAxMHY4LjI2YzAgMi4wMSAxLjI1IDMuNzIgMy4wMSA0LjQxdjcuMDJjLS4wMS4xNy4xMy4zMS4zLjMxaDEuMzdjLjE4IDAgLjMyLS4xNC4zMi0uMzF2LTEuOTZjMC0uMTcuMDctLjMyLjE4LS40NGMuNTMtLjUyLjgyLTEuMjMuODItMS45N1YyM2g1LjA3YzEuMjcgMCAyLjQ5LS41NSAzLjMzLTEuNWMwIC45NC40OCAxLjcyIDEuMzggMi4zMmwzLjk2IDIuNDNjLjE2LjExLjI2LjMuMjYuNXYyLjkzYzAgLjE3LjE0LjMxLjMxLjMxaDEuMzdjLjE3IDAgLjMxLS4xNC4zMS0uMzF2LTUuNTFjLjAxLS40LS4xNS0uOC0uNDUtMS4wOSIvPjxwYXRoIGZpbGw9IiNmM2FkNjEiIGQ9Ik02Ljg0MSA2Yy0uMzYgMC0uNzIuMS0xLjAzLjI5bC0yLjE5IDEuMzVjLS4zNi4yMy0uNi42MS0uNjIgMS4wM2MtLjAzLjczLjU1IDEuMzMgMS4yNyAxLjMzaDMuNDljLjU3IDAgMS4wNC0uNDcgMS4wNC0xLjA1di0xYzAtMS4wNy0uODgtMS45NS0xLjk2LTEuOTUiLz48cGF0aCBmaWxsPSIjMWMxYzFjIiBkPSJNNi41IDhhLjUuNSAwIDEgMCAwLTFhLjUuNSAwIDAgMCAwIDFtLTEuOTk5LjVjMC0uMjgtLjIyLS41LS41LS41aC0uNzZhMS4yIDEuMiAwIDAgMC0uMjEgMWguOTdjLjI4IDAgLjUtLjIyLjUtLjUiLz48cGF0aCBmaWxsPSIjZjNhZDYxIiBkPSJNMjguOTkxIDI4aC0xLjk5djEuNjhjMCAuMTcuMTQuMzEuMzEuMzFoMS4zN2MuMTcgMCAuMzEtLjE0LjMxLS4zMXptLTE2Ljk5IDBoLTEuOTl2MS42OWMtLjAxLjE3LjEzLjMxLjMuMzFoMS4zN2MuMTggMCAuMzItLjE0LjMyLS4zMXptNS4wODggMmwtMS4zOTgtLjAxYy0uMTcgMC0uMzQtLjA1LS4zOC0uMjJsLS40OS0xLjc3aDIuMDU0bC40MzYgMS42MmMuMDQuMTctLjAyLjM3OC0uMjE2LjM4em0yLjI4OCAwYS4zMTMuMzEzIDAgMCAxLS4yNzYtLjQ1bC41OTUtMS41NWgyLjRsLS45MzUgMS42M2EuOTUuOTUgMCAwIDEtLjc0Mi4zN3oiLz48L2c+PC9zdmc+",
"together://meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMzIgMzIiPjxnIGZpbGw9Im5vbmUiPjxwYXRoIGZpbGw9IiNiNGFjYmMiIGQ9Ik0yMC44NzEgMjQuNzh2LTYuMDZoMy4wMXY1Ljc3YzAgLjM0LS4xMi42Mi0uMzEuOTRsLTIuNDEgNC4yYy0uMTguMjMtLjQ1LjM3LS43NS4zN2gtMS4wM2MtLjIzIDAtLjM4LS4yNC0uMjgtLjQ1bDEuNjctNC4zNWMuMDctLjEzLjEtLjI3LjEtLjQyTTE3LjA5MSAzMGMuMiAwIC4yNi0uMjEuMjItLjM4bC0yLjMyLTguNjFoLTIuOTlsLjg1IDMuNTVjLjE5LjcxLjY2IDEuMzIgMS4zIDEuNjljLjE0LjA4LjI1LjIyLjI5LjM4bC44NyAzLjE0Yy4wNC4xNy4yMS4yMi4zOC4yMnoiLz48cGF0aCBmaWxsPSIjY2RjNGQ2IiBkPSJtMjguNTQxIDIzLjA4bC0xLjI3LS45NmEuOTQuOTQgMCAwIDEtLjI3LS42NnYtMi43NWMwLS42NC0uMTYtMS4yMzgtLjQ0Mi0xLjc2cS4yMTMuMDUuNDQyLjA1YTIgMiAwIDEgMC0xLjk0OS0xLjU0OWEzLjggMy44IDAgMCAwLTEuOC0uNDUxaC04LjE3Yy0uNjYgMC0xLjI3LS40Mi0xLjU3LTEuMDFMMTAuMDQxIDMuNWEyLjIzIDIuMjMgMCAwIDAtMi4xLTEuNWMtLjE4IDAtLjMuMTctLjI0LjM0TDguNTcxIDVjLS4yIDAtMS4wNy4yMy0xLjg1LjczbC0uODA2LjQ5OEw3LjAwMiAxMHY4LjI2YzAgMi4wMSAxLjI1IDMuNzIgMy4wMSA0LjQxdjcuMDJjLS4wMS4xNy4xMy4zMS4zLjMxaDEuMzdjLjE4IDAgLjMyLS4xNC4zMi0uMzF2LTEuOTZjMC0uMTcuMDctLjMyLjE4LS40NGMuNTMtLjUyLjgyLTEuMjMuODItMS45N1YyM2g1LjA3YzEuMjcgMCAyLjQ5LS41NSAzLjMzLTEuNWMwIC45NC40OCAxLjcyIDEuMzggMi4zMmwzLjk2IDIuNDNjLjE2LjExLjI2LjMuMjYuNXYyLjkzYzAgLjE3LjE0LjMxLjMxLjMxaDEuMzdjLjE3IDAgLjMxLS4xNC4zMS0uMzF2LTUuNTFjLjAxLS40LS4xNS0uOC0uNDUtMS4wOSIvPjxwYXRoIGZpbGw9IiNmM2FkNjEiIGQ9Ik02Ljg0MSA2Yy0uMzYgMC0uNzIuMS0xLjAzLjI5bC0yLjE5IDEuMzVjLS4zNi4yMy0uNi42MS0uNjIgMS4wM2MtLjAzLjczLjU1IDEuMzMgMS4yNyAxLjMzaDMuNDljLjU3IDAgMS4wNC0uNDcgMS4wNC0xLjA1di0xYzAtMS4wNy0uODgtMS45NS0xLjk2LTEuOTUiLz48cGF0aCBmaWxsPSIjMWMxYzFjIiBkPSJNNi41IDhhLjUuNSAwIDEgMCAwLTFhLjUuNSAwIDAgMCAwIDFtLTEuOTk5LjVjMC0uMjgtLjIyLS41LS41LS41aC0uNzZhMS4yIDEuMiAwIDAgMC0uMjEgMWguOTdjLjI4IDAgLjUtLjIyLjUtLjUiLz48cGF0aCBmaWxsPSIjZjNhZDYxIiBkPSJNMjguOTkxIDI4aC0xLjk5djEuNjhjMCAuMTcuMTQuMzEuMzEuMzFoMS4zN2MuMTcgMCAuMzEtLjE0LjMxLS4zMXptLTE2Ljk5IDBoLTEuOTl2MS42OWMtLjAxLjE3LjEzLjMxLjMuMzFoMS4zN2MuMTggMCAuMzItLjE0LjMyLS4zMXptNS4wODggMmwtMS4zOTgtLjAxYy0uMTcgMC0uMzQtLjA1LS4zOC0uMjJsLS40OS0xLjc3aDIuMDU0bC40MzYgMS42MmMuMDQuMTctLjAyLjM3OC0uMjE2LjM4em0yLjI4OCAwYS4zMTMuMzEzIDAgMCAxLS4yNzYtLjQ1bC41OTUtMS41NWgyLjRsLS45MzUgMS42M2EuOTUuOTUgMCAwIDEtLjc0Mi4zN3oiLz48L2c+PC9zdmc+",
"together://meta-llama/Llama-3.2-3B-Instruct-Turbo": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMzIgMzIiPjxnIGZpbGw9Im5vbmUiPjxwYXRoIGZpbGw9IiNiNGFjYmMiIGQ9Ik0yMC44NzEgMjQuNzh2LTYuMDZoMy4wMXY1Ljc3YzAgLjM0LS4xMi42Mi0uMzEuOTRsLTIuNDEgNC4yYy0uMTguMjMtLjQ1LjM3LS43NS4zN2gtMS4wM2MtLjIzIDAtLjM4LS4yNC0uMjgtLjQ1bDEuNjctNC4zNWMuMDctLjEzLjEtLjI3LjEtLjQyTTE3LjA5MSAzMGMuMiAwIC4yNi0uMjEuMjItLjM4bC0yLjMyLTguNjFoLTIuOTlsLjg1IDMuNTVjLjE5LjcxLjY2IDEuMzIgMS4zIDEuNjljLjE0LjA4LjI1LjIyLjI5LjM4bC44NyAzLjE0Yy4wNC4xNy4yMS4yMi4zOC4yMnoiLz48cGF0aCBmaWxsPSIjY2RjNGQ2IiBkPSJtMjguNTQxIDIzLjA4bC0xLjI3LS45NmEuOTQuOTQgMCAwIDEtLjI3LS42NnYtMi43NWMwLS42NC0uMTYtMS4yMzgtLjQ0Mi0xLjc2cS4yMTMuMDUuNDQyLjA1YTIgMiAwIDEgMC0xLjk0OS0xLjU0OWEzLjggMy44IDAgMCAwLTEuOC0uNDUxaC04LjE3Yy0uNjYgMC0xLjI3LS40Mi0xLjU3LTEuMDFMMTAuMDQxIDMuNWEyLjIzIDIuMjMgMCAwIDAtMi4xLTEuNWMtLjE4IDAtLjMuMTctLjI0LjM0TDguNTcxIDVjLS4yIDAtMS4wNy4yMy0xLjg1LjczbC0uODA2LjQ5OEw3LjAwMiAxMHY4LjI2YzAgMi4wMSAxLjI1IDMuNzIgMy4wMSA0LjQxdjcuMDJjLS4wMS4xNy4xMy4zMS4zLjMxaDEuMzdjLjE4IDAgLjMyLS4xNC4zMi0uMzF2LTEuOTZjMC0uMTcuMDctLjMyLjE4LS40NGMuNTMtLjUyLjgyLTEuMjMuODItMS45N1YyM2g1LjA3YzEuMjcgMCAyLjQ5LS41NSAzLjMzLTEuNWMwIC45NC40OCAxLjcyIDEuMzggMi4zMmwzLjk2IDIuNDNjLjE2LjExLjI2LjMuMjYuNXYyLjkzYzAgLjE3LjE0LjMxLjMxLjMxaDEuMzdjLjE3IDAgLjMxLS4xNC4zMS0uMzF2LTUuNTFjLjAxLS40LS4xNS0uOC0uNDUtMS4wOSIvPjxwYXRoIGZpbGw9IiNmM2FkNjEiIGQ9Ik02Ljg0MSA2Yy0uMzYgMC0uNzIuMS0xLjAzLjI5bC0yLjE5IDEuMzVjLS4zNi4yMy0uNi42MS0uNjIgMS4wM2MtLjAzLjczLjU1IDEuMzMgMS4yNyAxLjMzaDMuNDljLjU3IDAgMS4wNC0uNDcgMS4wNC0xLjA1di0xYzAtMS4wNy0uODgtMS45NS0xLjk2LTEuOTUiLz48cGF0aCBmaWxsPSIjMWMxYzFjIiBkPSJNNi41IDhhLjUuNSAwIDEgMCAwLTFhLjUuNSAwIDAgMCAwIDFtLTEuOTk5LjVjMC0uMjgtLjIyLS41LS41LS41aC0uNzZhMS4yIDEuMiAwIDAgMC0uMjEgMWguOTdjLjI4IDAgLjUtLjIyLjUtLjUiLz48cGF0aCBmaWxsPSIjZjNhZDYxIiBkPSJNMjguOTkxIDI4aC0xLjk5djEuNjhjMCAuMTcuMTQuMzEuMzEuMzFoMS4zN2MuMTcgMCAuMzEtLjE0LjMxLS4zMXptLTE2Ljk5IDBoLTEuOTl2MS42OWMtLjAxLjE3LjEzLjMxLjMuMzFoMS4zN2MuMTggMCAuMzItLjE0LjMyLS4zMXptNS4wODggMmwtMS4zOTgtLjAxYy0uMTcgMC0uMzQtLjA1LS4zOC0uMjJsLS40OS0xLjc3aDIuMDU0bC40MzYgMS42MmMuMDQuMTctLjAyLjM3OC0uMjE2LjM4em0yLjI4OCAwYS4zMTMuMzEzIDAgMCAxLS4yNzYtLjQ1bC41OTUtMS41NWgyLjRsLS45MzUgMS42M2EuOTUuOTUgMCAwIDEtLjc0Mi4zN3oiLz48L2c+PC9zdmc+",
"anthropic://claude-3-haiku-20240307": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjQgMjQiPjxwYXRoIGZpbGw9ImN1cnJlbnRDb2xvciIgZD0iTTE3LjMwNCAzLjU0MWgtMy42NzJsNi42OTYgMTYuOTE4SDI0Wm0tMTAuNjA4IDBMMCAyMC40NTloMy43NDRsMS4zNy0zLjU1M2g3LjAwNWwxLjM2OSAzLjU1M2gzLjc0NEwxMC41MzYgMy41NDFabS0uMzcxIDEwLjIyM0w4LjYxNiA3LjgybDIuMjkxIDUuOTQ1WiIvPjwvc3ZnPg==",
+}
+
+# AGGREGATORS = ["openai://gpt-4o-mini", "openai://gpt-4o"]
+AGGREGATORS = ["together://meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"]
+
+
+# Fix the aggregator step.
+# Add a judging step.
+# Add visualizations.
+
+
+# import streamlit as st
+# from components import llm_council_selector
+
+# st.title("LLM Council Selector")
+
+# selected_models = llm_council_selector()
+
+# if selected_models is not None:
+#     st.write("Selected Models:", selected_models)
+# else:
+#     st.write("No models selected or component didn't return a value.")
+
+
+# Choose your council.
+# Pre-selected.
+#   Smalls: GPT-4o-mini, llama-3.1-70b, qwen-2.0-70b
+#   Flagships: GPT-4o, llama-3.1-405b, qwen-2.0-110b, gemini, claude-3.5-sonnet
+#   Best: chatgpt-4o-latest, gemini-1.5-pro-exp-0827, grok-2-2024-08-13, claude-3-5-sonnet-20240620, llama-3.1-405b-instruct
+# Custom:
+#   Choose from a list of available models.
+# All:
+#   All available models.
+
+# Choose aggregator.
+# Aggregators are models proficient in synthesizing responses from other models into a single, high-quality output. An effective aggregator should maintain or enhance output quality even when
+# integrating inputs that are of lesser quality than its own.
+# Choices:
+# - 4o-latest
+# - gemini-1.5
+# - grok-2
+# - claude-3.5-sonnet
+# - llama-3.1-405b-instruct
+
+# Provide a prompt. (Or pre-canned prompts.)
+# Paste chat history.
+
+# Checkbox, enable judging.
+#
+# If checked, judging config:
+# Single sided:
+#   Provide criteria (or default).
+# If pairwise, choose granularity (or default).
+#   Choose criteria (or default).
+#   Enable position swapping?
+
+# Go button.
+# Sections.
+# 1. Model outputs.
+# 2. Aggregated output.
+# 3. Judging underneath each output.
+# Highlight in green the output that was best, as determined by the council.
+# Show graph breakdown of scores and justifications (by criteria, # wins and # losses).
+# Show final overall score.
+# Highlight in red the output that was worst, as determined by the council.
+# Judging section.
+# Show agreement matrix.
+# Show bar graph of self-bias.
+# Plot contrarianism vs. conviction (scatter plot).
+# Show per-judge scores.
+
+# Calculate total cost.
+# Show total tokens used.
+
+# """
+# type: [single, pairwise]
+
+# [single]
+# - criteria:
+#   - name
+#   - weight
+#   - description
+#   - scoring
+
+# [pairwise]
+# - granularity: [fine, coarse]
+# - ties_allowed: [yes, no]
+# - position_swapping: [yes, no]
+# - reference_model: [model_name]
+# - criteria:
+#   - name
+#   - weight
+#   - description
+# """
judging.py (ADDED)
@@ -0,0 +1,28 @@
+from pydantic import BaseModel, Field, conint
+from typing import List, Optional, Literal, Union
+
+
+class Criteria(BaseModel):
+    name: str
+    description: str
+    min_score: conint(ge=0)
+    max_score: conint(ge=0)
+
+
+class DirectAssessment(BaseModel):
+    type: Literal["direct_assessment"]
+    criteria: List[Criteria]
+    prompt: str
+
+
+class PairwiseComparison(BaseModel):
+    type: Literal["pairwise_comparison"]
+    granularity: Literal["coarse", "fine", "super fine"]
+    ties_allowed: bool
+    position_swapping: bool
+    reference_model: str
+    prompt: str
+
+
+class JudgingConfig(BaseModel):
+    assessment: Union[DirectAssessment, PairwiseComparison]
judging_dataclasses.py (ADDED)
@@ -0,0 +1,28 @@
+from pydantic import BaseModel, Field, conint
+from typing import List, Optional, Literal, Union
+
+
+class Criteria(BaseModel):
+    name: str
+    description: str
+    min_score: conint(ge=0)
+    max_score: conint(ge=0)
+
+
+class DirectAssessment(BaseModel):
+    type: Literal["direct_assessment"]
+    criteria: List[Criteria]
+    prompt: str
+
+
+class PairwiseComparison(BaseModel):
+    type: Literal["pairwise_comparison"]
+    granularity: Literal["coarse", "fine", "super fine"]
+    ties_allowed: bool
+    position_swapping: bool
+    reference_model: str
+    prompt: str
+
+
+class JudgingConfig(BaseModel):
+    assessment: Union[DirectAssessment, PairwiseComparison]
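Note: a quick sketch of how these Pydantic models compose, with hypothetical values that mirror the Streamlit form fields in app.py and the defaults in prompts.py:

from judging_dataclasses import Criteria, DirectAssessment, JudgingConfig

# Hypothetical example; not part of this commit.
config = JudgingConfig(
    assessment=DirectAssessment(
        type="direct_assessment",
        criteria=[
            Criteria(
                name="helpfulness",
                description="Provides meaningful information and clear solutions that address the query.",
                min_score=1,
                max_score=7,
            ),
        ],
        prompt="We are trying to assess the quality of a response to a user query...",
    )
)
print(config.assessment.criteria[0].max_score)  # 7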
prompts.py (ADDED)
@@ -0,0 +1,150 @@
+from judging_dataclasses import Criteria
+
+
+DEFAULT_AGGREGATOR_PROMPT = """We are trying to come up with the best response to a user query based on an aggregation of other responses.
+
+[USER PROMPT START]
+{user_prompt}
+[USER PROMPT END]
+
+Responses from other LLMs:
+{responses_from_other_llms}
+
+Please provide a response that combines the best aspects of the responses above."""
+
+
+DEFAULT_DIRECT_ASSESSMENT_PROMPT = """We are trying to assess the quality of a response to a user query.
+
+[USER PROMPT START]
+{user_prompt}
+[USER PROMPT END]
+
+The response is as follows:
+
+[RESPONSE START]
+{response}
+[RESPONSE END]
+
+Please evaluate the quality of the response based on the following criteria:
+
+{criteria_list}
+
+Options:
+{options}
+
+For each criterion, provide a short explanation describing how you would evaluate the response based on that criterion. Then, provide your final rating for that criterion."""
+
+DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST = [
+    Criteria(
+        name="helpfulness",
+        description="Provides meaningful information and clear solutions that address the query.",
+        min_score=1,
+        max_score=7,
+    ),
+    Criteria(
+        name="relevance",
+        description="Stays on topic and directly relates to the query without unnecessary details.",
+        min_score=1,
+        max_score=7,
+    ),
+    Criteria(
+        name="conciseness",
+        description="Communicates clearly and efficiently, avoiding excess content while retaining substance.",
+        min_score=1,
+        max_score=7,
+    ),
+]
+
+# 7-point Likert scale.
+SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS = [
+    "Strongly Disagree",
+    "Disagree",
+    "Slightly Disagree",
+    "Neither Agree Nor Disagree",
+    "Slightly Agree",
+    "Agree",
+    "Strongly Agree",
+]
+
+# 6-point Likert scale.
+SIX_POINT_DIRECT_ASSESSMENT_OPTIONS = [
+    "Strongly Disagree",
+    "Disagree",
+    "Slightly Disagree",
+    "Slightly Agree",
+    "Agree",
+    "Strongly Agree",
+]
+
+# 5-point Likert scale.
+FIVE_POINT_DIRECT_ASSESSMENT_OPTIONS = [
+    "Strongly Disagree",
+    "Disagree",
+    "Neither Agree Nor Disagree",
+    "Agree",
+    "Strongly Agree",
+]
+
+# 4-point Likert scale.
+FOUR_POINT_DIRECT_ASSESSMENT_OPTIONS = [
+    "Strongly Disagree",
+    "Disagree",
+    "Agree",
+    "Strongly Agree",
+]
+
+# 3-point Likert scale.
+THREE_POINT_DIRECT_ASSESSMENT_OPTIONS = [
+    "Disagree",
+    "Neither Agree Nor Disagree",
+    "Agree",
+]
+
+# 2-point Likert scale.
+BINARY_DIRECT_ASSESSMENT_OPTIONS = [
+    "Disagree",
+    "Agree",
+]
+
+
+DEFAULT_PAIRWISE_COMPARISON_PROMPT = """We are trying to compare the quality of two responses to a user query.
+
+[USER PROMPT START]
+{prompt}
+[USER PROMPT END]
+
+[RESPONSE A START]
+{first_completion}
+[RESPONSE A END]
+
+[RESPONSE B START]
+{second_completion}
+[RESPONSE B END]
+
+Begin your evaluation by comparing the two responses and provide a short explanation. Some themes to consider in your evaluation: {themes_to_consider}.
+
+After providing your explanation, output your final verdict as one of the following options:
+{pairwise_comparison_options}
+"""
+
+DEFAULT_PAIRWISE_COMPARISON_THEMES_TO_CONSIDER = [
+    (
+        "helpfulness",
+        "Provides meaningful information and clear solutions that address the query.",
+    ),
+    (
+        "relevance",
+        "Stays on topic and directly relates to the query without unnecessary details.",
+    ),
+    (
+        "conciseness",
+        "Communicates clearly and efficiently, avoiding excess content while retaining substance.",
+    ),
+]
+
+# COARSE WITH TIE.
+DEFAULT_PAIRWISE_COMPARISON_OPTIONS = [
+    ("A>B", "Response A is better than Response B"),
+    ("B>A", "Response B is better than Response A"),
+    ("A=B", "Both responses are equally good"),
+]
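Note: the pairwise-comparison templates above are defined but not yet wired into app.py. A sketch of how they could be filled in; the joining of themes and options into strings is an assumption, not something this commit implements:

from prompts import (
    DEFAULT_PAIRWISE_COMPARISON_PROMPT,
    DEFAULT_PAIRWISE_COMPARISON_THEMES_TO_CONSIDER,
    DEFAULT_PAIRWISE_COMPARISON_OPTIONS,
)

# Hypothetical formatting of the template's placeholders.
themes = ", ".join(
    f"{name} ({description})"
    for name, description in DEFAULT_PAIRWISE_COMPARISON_THEMES_TO_CONSIDER
)
options = "\n".join(
    f"{label}: {description}"
    for label, description in DEFAULT_PAIRWISE_COMPARISON_OPTIONS
)

pairwise_prompt = DEFAULT_PAIRWISE_COMPARISON_PROMPT.format(
    prompt="What is the capital of France?",
    first_completion="Paris.",
    second_completion="The capital of France is Paris.",
    themes_to_consider=themes,
    pairwise_comparison_options=options,
)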