justinxzhao committed
Commit 577870e · 1 parent: 3703473

Some refactoring, judging responses for direct assessment.
Files changed:
  app.py                  +227 -120
  app2.py                  +52   -0
  constants.py            +119   -0
  judging.py               +28   -0
  judging_dataclasses.py   +28   -0
  prompts.py              +150   -0
app.py (CHANGED)
@@ -7,6 +7,15 @@ import anthropic
from together import Together
import google.generativeai as genai
import time
+from typing import List, Optional, Literal, Union
+from constants import (
+    LLM_COUNCIL_MEMBERS,
+    PROVIDER_TO_AVATAR_MAP,
+    AGGREGATORS,
+)
+from prompts import *
+from judging_dataclasses import *
+

dotenv.load_dotenv()

@@ -31,31 +40,6 @@ openai_client = OpenAI(
# anthropic_client = anthropic.Client(api_key=ANTHROPIC_API_KEY)
anthropic_client = anthropic.Anthropic()

-LLM_COUNCIL_MEMBERS = {
-    "Smalls": [
-        "openai://gpt-4o-mini",
-        "together://meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
-        "vertex://gemini-1.5-flash-001",
-        "anthropic://claude-3-haiku-20240307",
-    ],
-    "Flagships": [
-        "openai://gpt-4",
-        "together://meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
-        "vertex://gemini-1.5-pro-001",
-        "anthropic://claude-3-5-sonnet",
-    ],
-}
-
-PROVIDER_TO_AVATAR_MAP = {
"openai://gpt-4o-mini": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIwLjk5ZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjU2IDI2MCI+PHBhdGggZD0iTTIzOS4xODQgMTA2LjIwM2E2NC43MiA2NC43MiAwIDAgMC01LjU3Ni01My4xMDNDMjE5LjQ1MiAyOC40NTkgMTkxIDE1Ljc4NCAxNjMuMjEzIDIxLjc0QTY1LjU4NiA2NS41ODYgMCAwIDAgNTIuMDk2IDQ1LjIyYTY0LjcyIDY0LjcyIDAgMCAwLTQzLjIzIDMxLjM2Yy0xNC4zMSAyNC42MDItMTEuMDYxIDU1LjYzNCA4LjAzMyA3Ni43NGE2NC42NyA2NC42NyAwIDAgMCA1LjUyNSA1My4xMDJjMTQuMTc0IDI0LjY1IDQyLjY0NCAzNy4zMjQgNzAuNDQ2IDMxLjM2YTY0LjcyIDY0LjcyIDAgMCAwIDQ4Ljc1NCAyMS43NDRjMjguNDgxLjAyNSA1My43MTQtMTguMzYxIDYyLjQxNC00NS40ODFhNjQuNzcgNjQuNzcgMCAwIDAgNDMuMjI5LTMxLjM2YzE0LjEzNy0yNC41NTggMTAuODc1LTU1LjQyMy04LjA4My03Ni40ODNtLTk3LjU2IDEzNi4zMzhhNDguNCA0OC40IDAgMCAxLTMxLjEwNS0xMS4yNTVsMS41MzUtLjg3bDUxLjY3LTI5LjgyNWE4LjYgOC42IDAgMCAwIDQuMjQ3LTcuMzY3di03Mi44NWwyMS44NDUgMTIuNjM2Yy4yMTguMTExLjM3LjMyLjQwOS41NjN2NjAuMzY3Yy0uMDU2IDI2LjgxOC0yMS43ODMgNDguNTQ1LTQ4LjYwMSA0OC42MDFNMzcuMTU4IDE5Ny45M2E0OC4zNSA0OC4zNSAwIDAgMS01Ljc4MS0zMi41ODlsMS41MzQuOTIxbDUxLjcyMiAyOS44MjZhOC4zNCA4LjM0IDAgMCAwIDguNDQxIDBsNjMuMTgxLTM2LjQyNXYyNS4yMjFhLjg3Ljg3IDAgMCAxLS4zNTguNjY1bC01Mi4zMzUgMzAuMTg0Yy0yMy4yNTcgMTMuMzk4LTUyLjk3IDUuNDMxLTY2LjQwNC0xNy44MDNNMjMuNTQ5IDg1LjM4YTQ4LjUgNDguNSAwIDAgMSAyNS41OC0yMS4zMzN2NjEuMzlhOC4yOSA4LjI5IDAgMCAwIDQuMTk1IDcuMzE2bDYyLjg3NCAzNi4yNzJsLTIxLjg0NSAxMi42MzZhLjgyLjgyIDAgMCAxLS43NjcgMEw0MS4zNTMgMTUxLjUzYy0yMy4yMTEtMTMuNDU0LTMxLjE3MS00My4xNDQtMTcuODA0LTY2LjQwNXptMTc5LjQ2NiA0MS42OTVsLTYzLjA4LTM2LjYzTDE2MS43MyA3Ny44NmEuODIuODIgMCAwIDEgLjc2OCAwbDUyLjIzMyAzMC4xODRhNDguNiA0OC42IDAgMCAxLTcuMzE2IDg3LjYzNXYtNjEuMzkxYTguNTQgOC41NCAwIDAgMC00LjQtNy4yMTNtMjEuNzQyLTMyLjY5bC0xLjUzNS0uOTIybC01MS42MTktMzAuMDgxYTguMzkgOC4zOSAwIDAgMC04LjQ5MiAwTDk5Ljk4IDk5LjgwOFY3NC41ODdhLjcyLjcyIDAgMCAxIC4zMDctLjY2NWw1Mi4yMzMtMzAuMTMzYTQ4LjY1MiA0OC42NTIgMCAwIDEgNzIuMjM2IDUwLjM5MXpNODguMDYxIDEzOS4wOTdsLTIxLjg0NS0xMi41ODVhLjg3Ljg3IDAgMCAxLS40MS0uNjE0VjY1LjY4NWE0OC42NTIgNDguNjUyIDAgMCAxIDc5Ljc1Ny0zNy4zNDZsLTEuNTM1Ljg3bC01MS42NyAyOS44MjVhOC42IDguNiAwIDAgMC00LjI0NiA3LjM2N3ptMTEuODY4LTI1LjU4TDEyOC4wNjcgOTcuM2wyOC4xODggMTYuMjE4djMyLjQzNGwtMjguMDg2IDE2LjIxOGwtMjguMTg4LTE2LjIxOHoiLz48L3N2Zz4=",
"anthropic://claude-3-5-sonnet": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjQgMjQiPjxwYXRoIGZpbGw9ImN1cnJlbnRDb2xvciIgZD0iTTE3LjMwNCAzLjU0MWgtMy42NzJsNi42OTYgMTYuOTE4SDI0Wm0tMTAuNjA4IDBMMCAyMC40NTloMy43NDRsMS4zNy0zLjU1M2g3LjAwNWwxLjM2OSAzLjU1M2gzLjc0NEwxMC41MzYgMy41NDFabS0uMzcxIDEwLjIyM0w4LjYxNiA3LjgybDIuMjkxIDUuOTQ1WiIvPjwvc3ZnPg==",
"vertex://gemini-1.5-flash-001": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjQgMjQiPjxwYXRoIGZpbGw9IiM0MjY4ZmYiIGQ9Ik0yNCAxMi4wMjRjLTYuNDM3LjM4OC0xMS41OSA1LjUzOS0xMS45NzcgMTEuOTc2aC0uMDQ3QzExLjU4OCAxNy41NjMgNi40MzYgMTIuNDEyIDAgMTIuMDI0di0uMDQ3QzYuNDM3IDExLjU4OCAxMS41ODggNi40MzcgMTEuOTc2IDBoLjA0N2MuMzg4IDYuNDM3IDUuNTQgMTEuNTg4IDExLjk3NyAxMS45Nzd6Ii8+PC9zdmc+",
"together://meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMzIgMzIiPjxnIGZpbGw9Im5vbmUiPjxwYXRoIGZpbGw9IiNiNGFjYmMiIGQ9Ik0yMC44NzEgMjQuNzh2LTYuMDZoMy4wMXY1Ljc3YzAgLjM0LS4xMi42Mi0uMzEuOTRsLTIuNDEgNC4yYy0uMTguMjMtLjQ1LjM3LS43NS4zN2gtMS4wM2MtLjIzIDAtLjM4LS4yNC0uMjgtLjQ1bDEuNjctNC4zNWMuMDctLjEzLjEtLjI3LjEtLjQyTTE3LjA5MSAzMGMuMiAwIC4yNi0uMjEuMjItLjM4bC0yLjMyLTguNjFoLTIuOTlsLjg1IDMuNTVjLjE5LjcxLjY2IDEuMzIgMS4zIDEuNjljLjE0LjA4LjI1LjIyLjI5LjM4bC44NyAzLjE0Yy4wNC4xNy4yMS4yMi4zOC4yMnoiLz48cGF0aCBmaWxsPSIjY2RjNGQ2IiBkPSJtMjguNTQxIDIzLjA4bC0xLjI3LS45NmEuOTQuOTQgMCAwIDEtLjI3LS42NnYtMi43NWMwLS42NC0uMTYtMS4yMzgtLjQ0Mi0xLjc2cS4yMTMuMDUuNDQyLjA1YTIgMiAwIDEgMC0xLjk0OS0xLjU0OWEzLjggMy44IDAgMCAwLTEuOC0uNDUxaC04LjE3Yy0uNjYgMC0xLjI3LS40Mi0xLjU3LTEuMDFMMTAuMDQxIDMuNWEyLjIzIDIuMjMgMCAwIDAtMi4xLTEuNWMtLjE4IDAtLjMuMTctLjI0LjM0TDguNTcxIDVjLS4yIDAtMS4wNy4yMy0xLjg1LjczbC0uODA2LjQ5OEw3LjAwMiAxMHY4LjI2YzAgMi4wMSAxLjI1IDMuNzIgMy4wMSA0LjQxdjcuMDJjLS4wMS4xNy4xMy4zMS4zLjMxaDEuMzdjLjE4IDAgLjMyLS4xNC4zMi0uMzF2LTEuOTZjMC0uMTcuMDctLjMyLjE4LS40NGMuNTMtLjUyLjgyLTEuMjMuODItMS45N1YyM2g1LjA3YzEuMjcgMCAyLjQ5LS41NSAzLjMzLTEuNWMwIC45NC40OCAxLjcyIDEuMzggMi4zMmwzLjk2IDIuNDNjLjE2LjExLjI2LjMuMjYuNXYyLjkzYzAgLjE3LjE0LjMxLjMxLjMxaDEuMzdjLjE3IDAgLjMxLS4xNC4zMS0uMzF2LTUuNTFjLjAxLS40LS4xNS0uOC0uNDUtMS4wOSIvPjxwYXRoIGZpbGw9IiNmM2FkNjEiIGQ9Ik02Ljg0MSA2Yy0uMzYgMC0uNzIuMS0xLjAzLjI5bC0yLjE5IDEuMzVjLS4zNi4yMy0uNi42MS0uNjIgMS4wM2MtLjAzLjczLjU1IDEuMzMgMS4yNyAxLjMzaDMuNDljLjU3IDAgMS4wNC0uNDcgMS4wNC0xLjA1di0xYzAtMS4wNy0uODgtMS45NS0xLjk2LTEuOTUiLz48cGF0aCBmaWxsPSIjMWMxYzFjIiBkPSJNNi41IDhhLjUuNSAwIDEgMCAwLTFhLjUuNSAwIDAgMCAwIDFtLTEuOTk5LjVjMC0uMjgtLjIyLS41LS41LS41aC0uNzZhMS4yIDEuMiAwIDAgMC0uMjEgMWguOTdjLjI4IDAgLjUtLjIyLjUtLjUiLz48cGF0aCBmaWxsPSIjZjNhZDYxIiBkPSJNMjguOTkxIDI4aC0xLjk5djEuNjhjMCAuMTcuMTQuMzEuMzEuMzFoMS4zN2MuMTcgMCAuMzEtLjE0LjMxLS4zMXptLTE2Ljk5IDBoLTEuOTl2MS42OWMtLjAxLjE3LjEzLjMxLjMuMzFoMS4zN2MuMTggMCAuMzItLjE0LjMyLS4zMXptNS4wODggMmwtMS4zOTgtLjAxYy0uMTcgMC0uMzQtLjA1LS4zOC0uMjJsLS40OS0xLjc3aDIuMDU0bC40MzYgMS42MmMuMDQuMTctLjAyLjM3OC0uMjE2LjM4em0yLjI4OCAwYS4zMTMuMzEzIDAgMCAxLS4yNzYtLjQ1bC41OTUtMS41NWgyLjRsLS45MzUgMS42M2EuOTUuOTUgMCAwIDEtLjc0Mi4zN3oiLz48L2c+PC9zdmc+",
"anthropic://claude-3-haiku-20240307": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjQgMjQiPjxwYXRoIGZpbGw9ImN1cnJlbnRDb2xvciIgZD0iTTE3LjMwNCAzLjU0MWgtMy42NzJsNi42OTYgMTYuOTE4SDI0Wm0tMTAuNjA4IDBMMCAyMC40NTloMy43NDRsMS4zNy0zLjU1M2g3LjAwNWwxLjM2OSAzLjU1M2gzLjc0NEwxMC41MzYgMy41NDFabS0uMzcxIDEwLjIyM0w4LjYxNiA3LjgybDIuMjkxIDUuOTQ1WiIvPjwvc3ZnPg==",
-}
-
-AGGREGATORS = ["openai://gpt-4o-mini", "openai://gpt-4o"]
-

def anthropic_streamlit_streamer(stream):
    """
@@ -88,6 +72,15 @@ def together_streamlit_streamer(stream):
        yield chunk.choices[0].delta.content


+def llm_streamlit_streamer(stream, llm):
+    if llm.startswith("anthropic"):
+        return anthropic_streamlit_streamer(stream)
+    elif llm.startswith("vertex"):
+        return google_streamlit_streamer(stream)
+    elif llm.startswith("together"):
+        return together_streamlit_streamer(stream)
+
+
# Helper functions for LLM council and aggregator selection
def llm_council_selector():
    selected_council = st.radio(

@@ -133,16 +126,17 @@ def get_google_response(model_name, prompt):
    return model.generate_content(prompt, stream=True)


-def
+def get_llm_response_stream(model_identifier, prompt):
+    """Returns a streamlit-friendly stream of response tokens from the LLM."""
    provider, model_name = model_identifier.split("://")
    if provider == "openai":
        return get_openai_response(model_name, prompt)
    elif provider == "anthropic":
-        return get_anthropic_response(model_name, prompt)
+        return anthropic_streamlit_streamer(get_anthropic_response(model_name, prompt))
    elif provider == "together":
-        return get_together_response(model_name, prompt)
+        return together_streamlit_streamer(get_together_response(model_name, prompt))
    elif provider == "vertex":
-        return get_google_response(model_name, prompt)
+        return google_streamlit_streamer(get_google_response(model_name, prompt))
    else:
        return None

@@ -151,6 +145,97 @@ def get_response_key(model):
    return model + ".response"


+def get_model_from_response_key(response_key):
+    return response_key.split(".")[0]
+
+
+def get_judging_key(judge_model, response_model):
+    return "judge." + judge_model + "." + response_model
+
+
+def get_aggregator_response_key(model):
+    return model + ".aggregator_response"
+
+
+# Streamlit form UI
+def render_criteria_form(criteria_num):
+    """Render a criteria input form."""
+    with st.expander(f"Criteria {criteria_num + 1}"):
+        name = st.text_input(f"Name for Criteria {criteria_num + 1}")
+        description = st.text_area(f"Description for Criteria {criteria_num + 1}")
+        min_score = st.number_input(
+            f"Min Score for Criteria {criteria_num + 1}", min_value=0, step=1
+        )
+        max_score = st.number_input(
+            f"Max Score for Criteria {criteria_num + 1}", min_value=0, step=1
+        )
+        return Criteria(
+            name=name, description=description, min_score=min_score, max_score=max_score
+        )
+
+
+def get_response_mapping():
+    # Inspect the session state for all the responses.
+    # This is a dictionary mapping model names to their responses.
+    # The aggregator response is also included in this mapping under the key "<model>.aggregator_response".
+    response_mapping = {}
+    for key in st.session_state.keys():
+        if key.endswith(".response"):
+            response_mapping[get_model_from_response_key(key)] = st.session_state[key]
+        if key.endswith(".aggregator_response"):
+            response_mapping[key] = st.session_state[key]
+    return response_mapping
+
+
+def format_likert_comparison_options(options):
+    return "\n".join([f"{i + 1}: {option}" for i, option in enumerate(options)])
+
+
+def format_criteria_list(criteria_list):
+    return "\n".join(
+        [f"{criteria.name}: {criteria.description}" for criteria in criteria_list]
+    )
+
+
+def get_direct_assessment_prompt(
+    direct_assessment_prompt, user_prompt, response, criteria_list, options
+):
+    return direct_assessment_prompt.format(
+        user_prompt=user_prompt,
+        response=response,
+        criteria_list=f"{format_criteria_list(DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST)}",
+        options=f"{format_likert_comparison_options(SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS)}",
+    )
+
+
+def get_default_direct_assessment_prompt(user_prompt):
+    return get_direct_assessment_prompt(
+        DEFAULT_DIRECT_ASSESSMENT_PROMPT,
+        user_prompt=user_prompt,
+        response="{{response}}",
+        criteria_list=DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST,
+        options=SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS,
+    )
+
+
+def get_aggregator_prompt(aggregator_prompt, user_prompt, llms):
+    responses_from_other_llms = "\n\n".join(
+        [f"{model}: {st.session_state.get(get_response_key(model))}" for model in llms]
+    )
+    return aggregator_prompt.format(
+        user_prompt=user_prompt,
+        responses_from_other_llms=responses_from_other_llms,
+    )
+
+
+def get_default_aggregator_prompt(user_prompt, llms):
+    return get_aggregator_prompt(
+        DEFAULT_AGGREGATOR_PROMPT,
+        user_prompt=user_prompt,
+        llms=llms,
+    )
+
+
# Main Streamlit App
def main():
    st.set_page_config(

@@ -183,7 +268,7 @@ def main():
    # App title and description
    st.title("Language Model Council Sandbox")
    st.markdown("###### Invoke a council of LLMs to generate and judge each other.")
-    st.markdown("###### [
+    st.markdown("###### [Paper](https://arxiv.org/abs/2406.08598)")

    # Authentication system
    if "authenticated" not in st.session_state:

@@ -206,60 +291,151 @@ def main():
        selected_models = llm_council_selector()
        st.write("Selected Models:", selected_models)
        selected_aggregator = aggregator_selector()
-        st.write("Selected Aggregator:", selected_aggregator)
+        # st.write("Selected Aggregator:", selected_aggregator)

        # Prompt input
-
+        user_prompt = st.text_area("Enter your prompt:")

        if st.button("Submit"):
            st.write("Responses:")

            # Fetching and streaming responses from each selected model
+            # TODO: Make this asynchronous?
            for model in selected_models:
-                # with st.chat_message(model):
                with st.chat_message(
                    model,
                    avatar=PROVIDER_TO_AVATAR_MAP[model],
                ):
                    message_placeholder = st.empty()
-                    stream =
+                    stream = get_llm_response_stream(model, user_prompt)
                    if stream:
-                        if model.startswith("anthropic"):
-                            stream = anthropic_streamlit_streamer(stream)
-                        elif model.startswith("vertex"):
-                            stream = google_streamlit_streamer(stream)
-                        elif model.startswith("together"):
-                            stream = together_streamlit_streamer(stream)
                        st.session_state[get_response_key(model)] = (
                            message_placeholder.write_stream(stream)
                        )

-            #
-            aggregator_prompt =
-
-            aggregator_prompt += "\n".join(
-                [
-                    f"{model}: {st.session_state.get(get_response_key(model))} \n\n"
-                    for model in selected_models
-                ]
+            # Get the aggregator prompt.
+            aggregator_prompt = get_default_aggregator_prompt(
+                user_prompt=user_prompt, llms=selected_models
            )
-            aggregator_prompt += "\n\nPlease provide an aggregated response."

            with st.expander("Aggregator Prompt"):
                st.write(aggregator_prompt)

            # Fetching and streaming response from the aggregator
-            st.write(f"
+            st.write(f"Mixture-of-Agents response from {selected_aggregator}:")
            with st.chat_message(
                selected_aggregator,
                avatar=PROVIDER_TO_AVATAR_MAP[selected_aggregator],
            ):
                message_placeholder = st.empty()
-                aggregator_stream =
+                aggregator_stream = get_llm_response_stream(
                    selected_aggregator, aggregator_prompt
                )
                if aggregator_stream:
-                    message_placeholder.write_stream(aggregator_stream)
+                    st.session_state[
+                        get_aggregator_response_key(selected_aggregator)
+                    ] = message_placeholder.write_stream(aggregator_stream)
+
+            # Judging.
+            st.markdown("#### Judging Configuration Form")
+
+            # Choose the type of assessment
+            assessment_type = st.radio(
+                "Select the type of assessment",
+                options=["Direct Assessment", "Pairwise Comparison"],
+            )
+
+            # Depending on the assessment type, render different forms
+            if assessment_type == "Direct Assessment":
+                direct_assessment_prompt = st.text_area(
+                    "Prompt for the Direct Assessment",
+                    value=get_default_direct_assessment_prompt(user_prompt=user_prompt),
+                    height=500,
+                )
+
+                # TODO: Add option to edit criteria list with a basic text field.
+                criteria_list = DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST
+
+                # Create DirectAssessment object when form is submitted
+                if st.button("Submit Direct Assessment"):
+
+                    # Submit direct assessment.
+                    responses_for_judging = get_response_mapping()
+
+                    response_judging_columns = st.columns(3)
+
+                    responses_for_judging_to_streamlit_column_index_map = {
+                        model: i % 3
+                        for i, model in enumerate(responses_for_judging.keys())
+                    }
+
+                    # Get judging responses.
+                    for response_model, response in responses_for_judging.items():
+
+                        st_column = response_judging_columns[
+                            responses_for_judging_to_streamlit_column_index_map[
+                                response_model
+                            ]
+                        ]
+
+                        with st_column:
+
+                            st.write(f"Judging {response_model}")
+                            judging_prompt = get_direct_assessment_prompt(
+                                direct_assessment_prompt,
+                                user_prompt,
+                                response,
+                                criteria_list,
+                                SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS,
+                            )
+
+                            for judging_model in selected_models:
+                                with st.expander("Detailed assessments", expanded=True):
+                                    with st.chat_message(
+                                        judging_model,
+                                        avatar=PROVIDER_TO_AVATAR_MAP[judging_model],
+                                    ):
+                                        st.write(f"Judge: {judging_model}")
+                                        message_placeholder = st.empty()
+                                        judging_stream = get_llm_response_stream(
+                                            judging_model, judging_prompt
+                                        )
+                                        if judging_stream:
+                                            st.session_state[
+                                                get_judging_key(
+                                                    judging_model, response_model
+                                                )
+                                            ] = message_placeholder.write_stream(
+                                                judging_stream
+                                            )
+                    # When all of the judging is finished for the given response, get the actual
+                    # values, parsed (use gpt-4o-mini for now) with json mode.
+                    # TODO.
+
+            elif assessment_type == "Pairwise Comparison":
+                pairwise_comparison_prompt = st.text_area(
+                    "Prompt for the Pairwise Comparison"
+                )
+                granularity = st.selectbox("Granularity", ["coarse", "fine", "super fine"])
+                ties_allowed = st.checkbox("Are ties allowed?")
+                position_swapping = st.checkbox("Enable position swapping?")
+                reference_model = st.text_input("Reference Model")
+
+                # Create PairwiseComparison object when form is submitted
+                if st.button("Submit Pairwise Comparison"):
+                    pairwise_comparison_config = PairwiseComparison(
+                        type="pairwise_comparison",
+                        granularity=granularity,
+                        ties_allowed=ties_allowed,
+                        position_swapping=position_swapping,
+                        reference_model=reference_model,
+                        prompt=pairwise_comparison_prompt,
+                    )
+                    st.success(f"Pairwise Comparison Created: {pairwise_comparison_config}")
+                    # Submit pairwise comparison.
+                    responses_for_judging = get_response_mapping()
+
    else:
        with cols[1]:
            st.warning("Please log in to access this app.")

@@ -267,72 +443,3 @@ def main():

if __name__ == "__main__":
    main()
-
-
-# Fix the aggregator step.
-# Add a judging step.
-# Add visualizations.
-
-
-# import streamlit as st
-# from components import llm_council_selector
-
-# st.title("LLM Council Selector")
-
-# selected_models = llm_council_selector()
-
-# if selected_models is not None:
-#     st.write("Selected Models:", selected_models)
-# else:
-#     st.write("No models selected or component didn't return a value.")
-
-
-# Choose your council.
-# Pre-selected.
-#   Smalls: GPT-4o-mini, llama-3.1-70b, qwen-2.0-70b
-#   Flagships: GPT-4o, llama-3.1-405b, qwen-2.0-110b, gemini, claude-3.5-sonnet
-#   Best: chatgpt-4o-latest, gemini-1.5-pro-exp-0827, grok-2-2024-08-13, claude-3-5-sonnet-20240620, llama-3.1-405b-instruct
-# Custom:
-#   Choose from a list of available models.
-# All:
-#   All available models.
-
-# Choose aggregator.
-# Aggregators are models proficient in synthesizing responses from other models into a single, high-quality output. An effective aggregator should maintain or enhance output quality even when
-# integrating inputs that are of lesser quality than its own.
-# Choices:
-# - 4o-latest
-# - gemini-1.5
-# - grok-2
-# - claude-3.5-sonnet
-# - llama-3.1-405b-instruct
-
-# Provide a prompt. (Or pre-canned prompts.)
-# Paste chat history.
-
-# Checkbox, enable judging.
-#
-# If checked, judging config:
-# Single sided:
-#   Provide criteria (or default).
-# If pairwise, choose granularity (or default).
-#   Choose criteria (or default).
-#   Enable position swapping?
-
-# Go button.
-# Sections.
-# 1. Model outputs.
-# 2. Aggregated output.
-# 3. Judging underneath each output.
-# Highlight in green the output that was best, as determined by the council.
-# Show graph breakdown of scores and justifications (by criteria, # wins and # losses).
-# Show final overall score.
-# Highlight in red the output that was worst, as determined by the council.
-# Judging section.
-# Show agreement matrix.
-# Show bar graph of self-bias.
-# Plot contrarianism vs. conviction (scatter plot).
-# Show per-judge scores.
-
-# Calculate total cost.
-# Show total tokens used.
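Note: the direct-assessment flow above ends with a TODO: once judging finishes, parse each judge's free-text ratings with gpt-4o-mini in JSON mode. A minimal sketch of what that step could look like, reusing the commit's openai_client and the Criteria model; parse_judging_response is a hypothetical helper, not part of this commit:

import json

def parse_judging_response(judging_text, criteria_list):
    # Hypothetical sketch: ask gpt-4o-mini (JSON mode) to extract the final
    # integer rating per criterion from a judge's free-text assessment.
    criteria_names = [criteria.name for criteria in criteria_list]
    completion = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        response_format={"type": "json_object"},
        messages=[
            {
                "role": "user",
                "content": (
                    "Extract the final rating for each criterion from the "
                    f"assessment below, as a JSON object with keys {criteria_names} "
                    "and integer values.\n\n" + judging_text
                ),
            }
        ],
    )
    return json.loads(completion.choices[0].message.content)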
app2.py (ADDED)
@@ -0,0 +1,52 @@
+import os
+
+import google_auth_oauthlib.flow
+from googleapiclient.discovery import build
+import streamlit as st
+import webbrowser
+
+
+redirect_uri = os.environ.get("REDIRECT_URI", "http://localhost:8502/")
+
+
+def auth_flow():
+    st.write("Welcome to My App!")
+    auth_code = st.query_params.get("code")
+    flow = google_auth_oauthlib.flow.Flow.from_client_secrets_file(
+        "client_secret.json",  # replace with your JSON credentials from your Google auth app
+        scopes=["https://www.googleapis.com/auth/userinfo.email", "openid"],
+        redirect_uri=redirect_uri,
+    )
+    if auth_code:
+        flow.fetch_token(code=auth_code)
+        credentials = flow.credentials
+        st.write("Login Done")
+        user_info_service = build(
+            serviceName="oauth2",
+            version="v2",
+            credentials=credentials,
+        )
+        user_info = user_info_service.userinfo().get().execute()
+        assert user_info.get("email"), "Email not found in infos"
+        st.session_state["google_auth_code"] = auth_code
+        st.session_state["user_info"] = user_info
+    else:
+        if st.button("Sign in with Google"):
+            authorization_url, state = flow.authorization_url(
+                access_type="offline",
+                include_granted_scopes="true",
+            )
+            webbrowser.open_new_tab(authorization_url)
+
+
+def main():
+    if "google_auth_code" not in st.session_state:
+        auth_flow()
+
+    if "google_auth_code" in st.session_state:
+        email = st.session_state["user_info"].get("email")
+        st.write(f"Hello {email}")
+
+
+if __name__ == "__main__":
+    main()
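Note: auth_flow() reads OAuth credentials from client_secret.json in the working directory. For a web-application client created in the Google Cloud console, the downloaded file has roughly this shape (all values here are placeholders):

{
  "web": {
    "client_id": "<client-id>.apps.googleusercontent.com",
    "project_id": "<project-id>",
    "auth_uri": "https://accounts.google.com/o/oauth2/auth",
    "token_uri": "https://oauth2.googleapis.com/token",
    "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
    "client_secret": "<client-secret>",
    "redirect_uris": ["http://localhost:8502/"]
  }
}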
constants.py (ADDED)
@@ -0,0 +1,119 @@
+LLM_COUNCIL_MEMBERS = {
+    "Smalls": [
+        # "openai://gpt-4o-mini",
+        "together://meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+        "together://meta-llama/Llama-3.2-3B-Instruct-Turbo",
+        # "vertex://gemini-1.5-flash-001",
+        # "anthropic://claude-3-haiku-20240307",
+    ],
+    "Flagships": [
+        "openai://gpt-4o",
+        "together://meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+        "vertex://gemini-1.5-pro-001",
+        "anthropic://claude-3-5-sonnet",
+    ],
+}
+
+PROVIDER_TO_AVATAR_MAP = {
"openai://gpt-4o-mini": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIwLjk5ZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjU2IDI2MCI+PHBhdGggZD0iTTIzOS4xODQgMTA2LjIwM2E2NC43MiA2NC43MiAwIDAgMC01LjU3Ni01My4xMDNDMjE5LjQ1MiAyOC40NTkgMTkxIDE1Ljc4NCAxNjMuMjEzIDIxLjc0QTY1LjU4NiA2NS41ODYgMCAwIDAgNTIuMDk2IDQ1LjIyYTY0LjcyIDY0LjcyIDAgMCAwLTQzLjIzIDMxLjM2Yy0xNC4zMSAyNC42MDItMTEuMDYxIDU1LjYzNCA4LjAzMyA3Ni43NGE2NC42NyA2NC42NyAwIDAgMCA1LjUyNSA1My4xMDJjMTQuMTc0IDI0LjY1IDQyLjY0NCAzNy4zMjQgNzAuNDQ2IDMxLjM2YTY0LjcyIDY0LjcyIDAgMCAwIDQ4Ljc1NCAyMS43NDRjMjguNDgxLjAyNSA1My43MTQtMTguMzYxIDYyLjQxNC00NS40ODFhNjQuNzcgNjQuNzcgMCAwIDAgNDMuMjI5LTMxLjM2YzE0LjEzNy0yNC41NTggMTAuODc1LTU1LjQyMy04LjA4My03Ni40ODNtLTk3LjU2IDEzNi4zMzhhNDguNCA0OC40IDAgMCAxLTMxLjEwNS0xMS4yNTVsMS41MzUtLjg3bDUxLjY3LTI5LjgyNWE4LjYgOC42IDAgMCAwIDQuMjQ3LTcuMzY3di03Mi44NWwyMS44NDUgMTIuNjM2Yy4yMTguMTExLjM3LjMyLjQwOS41NjN2NjAuMzY3Yy0uMDU2IDI2LjgxOC0yMS43ODMgNDguNTQ1LTQ4LjYwMSA0OC42MDFNMzcuMTU4IDE5Ny45M2E0OC4zNSA0OC4zNSAwIDAgMS01Ljc4MS0zMi41ODlsMS41MzQuOTIxbDUxLjcyMiAyOS44MjZhOC4zNCA4LjM0IDAgMCAwIDguNDQxIDBsNjMuMTgxLTM2LjQyNXYyNS4yMjFhLjg3Ljg3IDAgMCAxLS4zNTguNjY1bC01Mi4zMzUgMzAuMTg0Yy0yMy4yNTcgMTMuMzk4LTUyLjk3IDUuNDMxLTY2LjQwNC0xNy44MDNNMjMuNTQ5IDg1LjM4YTQ4LjUgNDguNSAwIDAgMSAyNS41OC0yMS4zMzN2NjEuMzlhOC4yOSA4LjI5IDAgMCAwIDQuMTk1IDcuMzE2bDYyLjg3NCAzNi4yNzJsLTIxLjg0NSAxMi42MzZhLjgyLjgyIDAgMCAxLS43NjcgMEw0MS4zNTMgMTUxLjUzYy0yMy4yMTEtMTMuNDU0LTMxLjE3MS00My4xNDQtMTcuODA0LTY2LjQwNXptMTc5LjQ2NiA0MS42OTVsLTYzLjA4LTM2LjYzTDE2MS43MyA3Ny44NmEuODIuODIgMCAwIDEgLjc2OCAwbDUyLjIzMyAzMC4xODRhNDguNiA0OC42IDAgMCAxLTcuMzE2IDg3LjYzNXYtNjEuMzkxYTguNTQgOC41NCAwIDAgMC00LjQtNy4yMTNtMjEuNzQyLTMyLjY5bC0xLjUzNS0uOTIybC01MS42MTktMzAuMDgxYTguMzkgOC4zOSAwIDAgMC04LjQ5MiAwTDk5Ljk4IDk5LjgwOFY3NC41ODdhLjcyLjcyIDAgMCAxIC4zMDctLjY2NWw1Mi4yMzMtMzAuMTMzYTQ4LjY1MiA0OC42NTIgMCAwIDEgNzIuMjM2IDUwLjM5MXpNODguMDYxIDEzOS4wOTdsLTIxLjg0NS0xMi41ODVhLjg3Ljg3IDAgMCAxLS40MS0uNjE0VjY1LjY4NWE0OC42NTIgNDguNjUyIDAgMCAxIDc5Ljc1Ny0zNy4zNDZsLTEuNTM1Ljg3bC01MS42NyAyOS44MjVhOC42IDguNiAwIDAgMC00LjI0NiA3LjM2N3ptMTEuODY4LTI1LjU4TDEyOC4wNjcgOTcuM2wyOC4xODggMTYuMjE4djMyLjQzNGwtMjguMDg2IDE2LjIxOGwtMjguMTg4LTE2LjIxOHoiLz48L3N2Zz4=",
"anthropic://claude-3-5-sonnet": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjQgMjQiPjxwYXRoIGZpbGw9ImN1cnJlbnRDb2xvciIgZD0iTTE3LjMwNCAzLjU0MWgtMy42NzJsNi42OTYgMTYuOTE4SDI0Wm0tMTAuNjA4IDBMMCAyMC40NTloMy43NDRsMS4zNy0zLjU1M2g3LjAwNWwxLjM2OSAzLjU1M2gzLjc0NEwxMC41MzYgMy41NDFabS0uMzcxIDEwLjIyM0w4LjYxNiA3LjgybDIuMjkxIDUuOTQ1WiIvPjwvc3ZnPg==",
"vertex://gemini-1.5-flash-001": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjQgMjQiPjxwYXRoIGZpbGw9IiM0MjY4ZmYiIGQ9Ik0yNCAxMi4wMjRjLTYuNDM3LjM4OC0xMS41OSA1LjUzOS0xMS45NzcgMTEuOTc2aC0uMDQ3QzExLjU4OCAxNy41NjMgNi40MzYgMTIuNDEyIDAgMTIuMDI0di0uMDQ3QzYuNDM3IDExLjU4OCAxMS41ODggNi40MzcgMTEuOTc2IDBoLjA0N2MuMzg4IDYuNDM3IDUuNTQgMTEuNTg4IDExLjk3NyAxMS45Nzd6Ii8+PC9zdmc+",
"together://meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMzIgMzIiPjxnIGZpbGw9Im5vbmUiPjxwYXRoIGZpbGw9IiNiNGFjYmMiIGQ9Ik0yMC44NzEgMjQuNzh2LTYuMDZoMy4wMXY1Ljc3YzAgLjM0LS4xMi42Mi0uMzEuOTRsLTIuNDEgNC4yYy0uMTguMjMtLjQ1LjM3LS43NS4zN2gtMS4wM2MtLjIzIDAtLjM4LS4yNC0uMjgtLjQ1bDEuNjctNC4zNWMuMDctLjEzLjEtLjI3LjEtLjQyTTE3LjA5MSAzMGMuMiAwIC4yNi0uMjEuMjItLjM4bC0yLjMyLTguNjFoLTIuOTlsLjg1IDMuNTVjLjE5LjcxLjY2IDEuMzIgMS4zIDEuNjljLjE0LjA4LjI1LjIyLjI5LjM4bC44NyAzLjE0Yy4wNC4xNy4yMS4yMi4zOC4yMnoiLz48cGF0aCBmaWxsPSIjY2RjNGQ2IiBkPSJtMjguNTQxIDIzLjA4bC0xLjI3LS45NmEuOTQuOTQgMCAwIDEtLjI3LS42NnYtMi43NWMwLS42NC0uMTYtMS4yMzgtLjQ0Mi0xLjc2cS4yMTMuMDUuNDQyLjA1YTIgMiAwIDEgMC0xLjk0OS0xLjU0OWEzLjggMy44IDAgMCAwLTEuOC0uNDUxaC04LjE3Yy0uNjYgMC0xLjI3LS40Mi0xLjU3LTEuMDFMMTAuMDQxIDMuNWEyLjIzIDIuMjMgMCAwIDAtMi4xLTEuNWMtLjE4IDAtLjMuMTctLjI0LjM0TDguNTcxIDVjLS4yIDAtMS4wNy4yMy0xLjg1LjczbC0uODA2LjQ5OEw3LjAwMiAxMHY4LjI2YzAgMi4wMSAxLjI1IDMuNzIgMy4wMSA0LjQxdjcuMDJjLS4wMS4xNy4xMy4zMS4zLjMxaDEuMzdjLjE4IDAgLjMyLS4xNC4zMi0uMzF2LTEuOTZjMC0uMTcuMDctLjMyLjE4LS40NGMuNTMtLjUyLjgyLTEuMjMuODItMS45N1YyM2g1LjA3YzEuMjcgMCAyLjQ5LS41NSAzLjMzLTEuNWMwIC45NC40OCAxLjcyIDEuMzggMi4zMmwzLjk2IDIuNDNjLjE2LjExLjI2LjMuMjYuNXYyLjkzYzAgLjE3LjE0LjMxLjMxLjMxaDEuMzdjLjE3IDAgLjMxLS4xNC4zMS0uMzF2LTUuNTFjLjAxLS40LS4xNS0uOC0uNDUtMS4wOSIvPjxwYXRoIGZpbGw9IiNmM2FkNjEiIGQ9Ik02Ljg0MSA2Yy0uMzYgMC0uNzIuMS0xLjAzLjI5bC0yLjE5IDEuMzVjLS4zNi4yMy0uNi42MS0uNjIgMS4wM2MtLjAzLjczLjU1IDEuMzMgMS4yNyAxLjMzaDMuNDljLjU3IDAgMS4wNC0uNDcgMS4wNC0xLjA1di0xYzAtMS4wNy0uODgtMS45NS0xLjk2LTEuOTUiLz48cGF0aCBmaWxsPSIjMWMxYzFjIiBkPSJNNi41IDhhLjUuNSAwIDEgMCAwLTFhLjUuNSAwIDAgMCAwIDFtLTEuOTk5LjVjMC0uMjgtLjIyLS41LS41LS41aC0uNzZhMS4yIDEuMiAwIDAgMC0uMjEgMWguOTdjLjI4IDAgLjUtLjIyLjUtLjUiLz48cGF0aCBmaWxsPSIjZjNhZDYxIiBkPSJNMjguOTkxIDI4aC0xLjk5djEuNjhjMCAuMTcuMTQuMzEuMzEuMzFoMS4zN2MuMTcgMCAuMzEtLjE0LjMxLS4zMXptLTE2Ljk5IDBoLTEuOTl2MS42OWMtLjAxLjE3LjEzLjMxLjMuMzFoMS4zN2MuMTggMCAuMzItLjE0LjMyLS4zMXptNS4wODggMmwtMS4zOTgtLjAxYy0uMTcgMC0uMzQtLjA1LS4zOC0uMjJsLS40OS0xLjc3aDIuMDU0bC40MzYgMS42MmMuMDQuMTctLjAyLjM3OC0uMjE2LjM4em0yLjI4OCAwYS4zMTMuMzEzIDAgMCAxLS4yNzYtLjQ1bC41OTUtMS41NWgyLjRsLS45MzUgMS42M2EuOTUuOTUgMCAwIDEtLjc0Mi4zN3oiLz48L2c+PC9zdmc+",
"together://meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMzIgMzIiPjxnIGZpbGw9Im5vbmUiPjxwYXRoIGZpbGw9IiNiNGFjYmMiIGQ9Ik0yMC44NzEgMjQuNzh2LTYuMDZoMy4wMXY1Ljc3YzAgLjM0LS4xMi42Mi0uMzEuOTRsLTIuNDEgNC4yYy0uMTguMjMtLjQ1LjM3LS43NS4zN2gtMS4wM2MtLjIzIDAtLjM4LS4yNC0uMjgtLjQ1bDEuNjctNC4zNWMuMDctLjEzLjEtLjI3LjEtLjQyTTE3LjA5MSAzMGMuMiAwIC4yNi0uMjEuMjItLjM4bC0yLjMyLTguNjFoLTIuOTlsLjg1IDMuNTVjLjE5LjcxLjY2IDEuMzIgMS4zIDEuNjljLjE0LjA4LjI1LjIyLjI5LjM4bC44NyAzLjE0Yy4wNC4xNy4yMS4yMi4zOC4yMnoiLz48cGF0aCBmaWxsPSIjY2RjNGQ2IiBkPSJtMjguNTQxIDIzLjA4bC0xLjI3LS45NmEuOTQuOTQgMCAwIDEtLjI3LS42NnYtMi43NWMwLS42NC0uMTYtMS4yMzgtLjQ0Mi0xLjc2cS4yMTMuMDUuNDQyLjA1YTIgMiAwIDEgMC0xLjk0OS0xLjU0OWEzLjggMy44IDAgMCAwLTEuOC0uNDUxaC04LjE3Yy0uNjYgMC0xLjI3LS40Mi0xLjU3LTEuMDFMMTAuMDQxIDMuNWEyLjIzIDIuMjMgMCAwIDAtMi4xLTEuNWMtLjE4IDAtLjMuMTctLjI0LjM0TDguNTcxIDVjLS4yIDAtMS4wNy4yMy0xLjg1LjczbC0uODA2LjQ5OEw3LjAwMiAxMHY4LjI2YzAgMi4wMSAxLjI1IDMuNzIgMy4wMSA0LjQxdjcuMDJjLS4wMS4xNy4xMy4zMS4zLjMxaDEuMzdjLjE4IDAgLjMyLS4xNC4zMi0uMzF2LTEuOTZjMC0uMTcuMDctLjMyLjE4LS40NGMuNTMtLjUyLjgyLTEuMjMuODItMS45N1YyM2g1LjA3YzEuMjcgMCAyLjQ5LS41NSAzLjMzLTEuNWMwIC45NC40OCAxLjcyIDEuMzggMi4zMmwzLjk2IDIuNDNjLjE2LjExLjI2LjMuMjYuNXYyLjkzYzAgLjE3LjE0LjMxLjMxLjMxaDEuMzdjLjE3IDAgLjMxLS4xNC4zMS0uMzF2LTUuNTFjLjAxLS40LS4xNS0uOC0uNDUtMS4wOSIvPjxwYXRoIGZpbGw9IiNmM2FkNjEiIGQ9Ik02Ljg0MSA2Yy0uMzYgMC0uNzIuMS0xLjAzLjI5bC0yLjE5IDEuMzVjLS4zNi4yMy0uNi42MS0uNjIgMS4wM2MtLjAzLjczLjU1IDEuMzMgMS4yNyAxLjMzaDMuNDljLjU3IDAgMS4wNC0uNDcgMS4wNC0xLjA1di0xYzAtMS4wNy0uODgtMS45NS0xLjk2LTEuOTUiLz48cGF0aCBmaWxsPSIjMWMxYzFjIiBkPSJNNi41IDhhLjUuNSAwIDEgMCAwLTFhLjUuNSAwIDAgMCAwIDFtLTEuOTk5LjVjMC0uMjgtLjIyLS41LS41LS41aC0uNzZhMS4yIDEuMiAwIDAgMC0uMjEgMWguOTdjLjI4IDAgLjUtLjIyLjUtLjUiLz48cGF0aCBmaWxsPSIjZjNhZDYxIiBkPSJNMjguOTkxIDI4aC0xLjk5djEuNjhjMCAuMTcuMTQuMzEuMzEuMzFoMS4zN2MuMTcgMCAuMzEtLjE0LjMxLS4zMXptLTE2Ljk5IDBoLTEuOTl2MS42OWMtLjAxLjE3LjEzLjMxLjMuMzFoMS4zN2MuMTggMCAuMzItLjE0LjMyLS4zMXptNS4wODggMmwtMS4zOTgtLjAxYy0uMTcgMC0uMzQtLjA1LS4zOC0uMjJsLS40OS0xLjc3aDIuMDU0bC40MzYgMS42MmMuMDQuMTctLjAyLjM3OC0uMjE2LjM4em0yLjI4OCAwYS4zMTMuMzEzIDAgMCAxLS4yNzYtLjQ1bC41OTUtMS41NWgyLjRsLS45MzUgMS42M2EuOTUuOTUgMCAwIDEtLjc0Mi4zN3oiLz48L2c+PC9zdmc+",
"together://meta-llama/Llama-3.2-3B-Instruct-Turbo": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMzIgMzIiPjxnIGZpbGw9Im5vbmUiPjxwYXRoIGZpbGw9IiNiNGFjYmMiIGQ9Ik0yMC44NzEgMjQuNzh2LTYuMDZoMy4wMXY1Ljc3YzAgLjM0LS4xMi42Mi0uMzEuOTRsLTIuNDEgNC4yYy0uMTguMjMtLjQ1LjM3LS43NS4zN2gtMS4wM2MtLjIzIDAtLjM4LS4yNC0uMjgtLjQ1bDEuNjctNC4zNWMuMDctLjEzLjEtLjI3LjEtLjQyTTE3LjA5MSAzMGMuMiAwIC4yNi0uMjEuMjItLjM4bC0yLjMyLTguNjFoLTIuOTlsLjg1IDMuNTVjLjE5LjcxLjY2IDEuMzIgMS4zIDEuNjljLjE0LjA4LjI1LjIyLjI5LjM4bC44NyAzLjE0Yy4wNC4xNy4yMS4yMi4zOC4yMnoiLz48cGF0aCBmaWxsPSIjY2RjNGQ2IiBkPSJtMjguNTQxIDIzLjA4bC0xLjI3LS45NmEuOTQuOTQgMCAwIDEtLjI3LS42NnYtMi43NWMwLS42NC0uMTYtMS4yMzgtLjQ0Mi0xLjc2cS4yMTMuMDUuNDQyLjA1YTIgMiAwIDEgMC0xLjk0OS0xLjU0OWEzLjggMy44IDAgMCAwLTEuOC0uNDUxaC04LjE3Yy0uNjYgMC0xLjI3LS40Mi0xLjU3LTEuMDFMMTAuMDQxIDMuNWEyLjIzIDIuMjMgMCAwIDAtMi4xLTEuNWMtLjE4IDAtLjMuMTctLjI0LjM0TDguNTcxIDVjLS4yIDAtMS4wNy4yMy0xLjg1LjczbC0uODA2LjQ5OEw3LjAwMiAxMHY4LjI2YzAgMi4wMSAxLjI1IDMuNzIgMy4wMSA0LjQxdjcuMDJjLS4wMS4xNy4xMy4zMS4zLjMxaDEuMzdjLjE4IDAgLjMyLS4xNC4zMi0uMzF2LTEuOTZjMC0uMTcuMDctLjMyLjE4LS40NGMuNTMtLjUyLjgyLTEuMjMuODItMS45N1YyM2g1LjA3YzEuMjcgMCAyLjQ5LS41NSAzLjMzLTEuNWMwIC45NC40OCAxLjcyIDEuMzggMi4zMmwzLjk2IDIuNDNjLjE2LjExLjI2LjMuMjYuNXYyLjkzYzAgLjE3LjE0LjMxLjMxLjMxaDEuMzdjLjE3IDAgLjMxLS4xNC4zMS0uMzF2LTUuNTFjLjAxLS40LS4xNS0uOC0uNDUtMS4wOSIvPjxwYXRoIGZpbGw9IiNmM2FkNjEiIGQ9Ik02Ljg0MSA2Yy0uMzYgMC0uNzIuMS0xLjAzLjI5bC0yLjE5IDEuMzVjLS4zNi4yMy0uNi42MS0uNjIgMS4wM2MtLjAzLjczLjU1IDEuMzMgMS4yNyAxLjMzaDMuNDljLjU3IDAgMS4wNC0uNDcgMS4wNC0xLjA1di0xYzAtMS4wNy0uODgtMS45NS0xLjk2LTEuOTUiLz48cGF0aCBmaWxsPSIjMWMxYzFjIiBkPSJNNi41IDhhLjUuNSAwIDEgMCAwLTFhLjUuNSAwIDAgMCAwIDFtLTEuOTk5LjVjMC0uMjgtLjIyLS41LS41LS41aC0uNzZhMS4yIDEuMiAwIDAgMC0uMjEgMWguOTdjLjI4IDAgLjUtLjIyLjUtLjUiLz48cGF0aCBmaWxsPSIjZjNhZDYxIiBkPSJNMjguOTkxIDI4aC0xLjk5djEuNjhjMCAuMTcuMTQuMzEuMzEuMzFoMS4zN2MuMTcgMCAuMzEtLjE0LjMxLS4zMXptLTE2Ljk5IDBoLTEuOTl2MS42OWMtLjAxLjE3LjEzLjMxLjMuMzFoMS4zN2MuMTggMCAuMzItLjE0LjMyLS4zMXptNS4wODggMmwtMS4zOTgtLjAxYy0uMTcgMC0uMzQtLjA1LS4zOC0uMjJsLS40OS0xLjc3aDIuMDU0bC40MzYgMS42MmMuMDQuMTctLjAyLjM3OC0uMjE2LjM4em0yLjI4OCAwYS4zMTMuMzEzIDAgMCAxLS4yNzYtLjQ1bC41OTUtMS41NWgyLjRsLS45MzUgMS42M2EuOTUuOTUgMCAwIDEtLjc0Mi4zN3oiLz48L2c+PC9zdmc+",
"anthropic://claude-3-haiku-20240307": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjQgMjQiPjxwYXRoIGZpbGw9ImN1cnJlbnRDb2xvciIgZD0iTTE3LjMwNCAzLjU0MWgtMy42NzJsNi42OTYgMTYuOTE4SDI0Wm0tMTAuNjA4IDBMMCAyMC40NTloMy43NDRsMS4zNy0zLjU1M2g3LjAwNWwxLjM2OSAzLjU1M2gzLjc0NEwxMC41MzYgMy41NDFabS0uMzcxIDEwLjIyM0w4LjYxNiA3LjgybDIuMjkxIDUuOTQ1WiIvPjwvc3ZnPg==",
+}
+
+# AGGREGATORS = ["openai://gpt-4o-mini", "openai://gpt-4o"]
+AGGREGATORS = ["together://meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"]
+
+
+# Fix the aggregator step.
+# Add a judging step.
+# Add visualizations.
+
+
+# import streamlit as st
+# from components import llm_council_selector
+
+# st.title("LLM Council Selector")
+
+# selected_models = llm_council_selector()
+
+# if selected_models is not None:
+#     st.write("Selected Models:", selected_models)
+# else:
+#     st.write("No models selected or component didn't return a value.")
+
+
+# Choose your council.
+# Pre-selected.
+#   Smalls: GPT-4o-mini, llama-3.1-70b, qwen-2.0-70b
+#   Flagships: GPT-4o, llama-3.1-405b, qwen-2.0-110b, gemini, claude-3.5-sonnet
+#   Best: chatgpt-4o-latest, gemini-1.5-pro-exp-0827, grok-2-2024-08-13, claude-3-5-sonnet-20240620, llama-3.1-405b-instruct
+# Custom:
+#   Choose from a list of available models.
+# All:
+#   All available models.
+
+# Choose aggregator.
+# Aggregators are models proficient in synthesizing responses from other models into a single, high-quality output. An effective aggregator should maintain or enhance output quality even when
+# integrating inputs that are of lesser quality than its own.
+# Choices:
+# - 4o-latest
+# - gemini-1.5
+# - grok-2
+# - claude-3.5-sonnet
+# - llama-3.1-405b-instruct
+
+# Provide a prompt. (Or pre-canned prompts.)
+# Paste chat history.
+
+# Checkbox, enable judging.
+#
+# If checked, judging config:
+# Single sided:
+#   Provide criteria (or default).
+# If pairwise, choose granularity (or default).
+#   Choose criteria (or default).
+#   Enable position swapping?
+
+# Go button.
+# Sections.
+# 1. Model outputs.
+# 2. Aggregated output.
+# 3. Judging underneath each output.
+# Highlight in green the output that was best, as determined by the council.
+# Show graph breakdown of scores and justifications (by criteria, # wins and # losses).
+# Show final overall score.
+# Highlight in red the output that was worst, as determined by the council.
+# Judging section.
+# Show agreement matrix.
+# Show bar graph of self-bias.
+# Plot contrarianism vs. conviction (scatter plot).
+# Show per-judge scores.
+
+# Calculate total cost.
+# Show total tokens used.
+
+# """
+# type: [single, pairwise]
+
+# [single]
+# - criteria:
+#   - name
+#   - weight
+#   - description
+#   - scoring
+
+# [pairwise]
+# - granularity: [fine, coarse]
+# - ties_allowed: [yes, no]
+# - position_swapping: [yes, no]
+# - reference_model: [model_name]
+# - criteria:
+#   - name
+#   - weight
+#   - description
+# """
judging.py (ADDED)
@@ -0,0 +1,28 @@
+from pydantic import BaseModel, Field, conint
+from typing import List, Optional, Literal, Union
+
+
+class Criteria(BaseModel):
+    name: str
+    description: str
+    min_score: conint(ge=0)
+    max_score: conint(ge=0)
+
+
+class DirectAssessment(BaseModel):
+    type: Literal["direct_assessment"]
+    criteria: List[Criteria]
+    prompt: str
+
+
+class PairwiseComparison(BaseModel):
+    type: Literal["pairwise_comparison"]
+    granularity: Literal["coarse", "fine", "super fine"]
+    ties_allowed: bool
+    position_swapping: bool
+    reference_model: str
+    prompt: str
+
+
+class JudgingConfig(BaseModel):
+    assessment: Union[DirectAssessment, PairwiseComparison]
judging_dataclasses.py (ADDED)
@@ -0,0 +1,28 @@
+from pydantic import BaseModel, Field, conint
+from typing import List, Optional, Literal, Union
+
+
+class Criteria(BaseModel):
+    name: str
+    description: str
+    min_score: conint(ge=0)
+    max_score: conint(ge=0)
+
+
+class DirectAssessment(BaseModel):
+    type: Literal["direct_assessment"]
+    criteria: List[Criteria]
+    prompt: str
+
+
+class PairwiseComparison(BaseModel):
+    type: Literal["pairwise_comparison"]
+    granularity: Literal["coarse", "fine", "super fine"]
+    ties_allowed: bool
+    position_swapping: bool
+    reference_model: str
+    prompt: str
+
+
+class JudgingConfig(BaseModel):
+    assessment: Union[DirectAssessment, PairwiseComparison]
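Note: a quick sketch of how these Pydantic models compose, with hypothetical values that mirror the Streamlit form fields in app.py and the defaults in prompts.py:

from judging_dataclasses import Criteria, DirectAssessment, JudgingConfig

# Hypothetical example; not part of this commit.
config = JudgingConfig(
    assessment=DirectAssessment(
        type="direct_assessment",
        criteria=[
            Criteria(
                name="helpfulness",
                description="Provides meaningful information and clear solutions that address the query.",
                min_score=1,
                max_score=7,
            ),
        ],
        prompt="We are trying to assess the quality of a response to a user query...",
    )
)
print(config.assessment.criteria[0].max_score)  # 7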
prompts.py (ADDED)
@@ -0,0 +1,150 @@
+from judging_dataclasses import Criteria
+
+
+DEFAULT_AGGREGATOR_PROMPT = """We are trying to come up with the best response to a user query based on an aggregation of other responses.
+
+[USER PROMPT START]
+{user_prompt}
+[USER PROMPT END]
+
+Responses from other LLMs:
+{responses_from_other_llms}
+
+Please provide a response that combines the best aspects of the responses above."""
+
+
+DEFAULT_DIRECT_ASSESSMENT_PROMPT = """We are trying to assess the quality of a response to a user query.
+
+[USER PROMPT START]
+{user_prompt}
+[USER PROMPT END]
+
+The response is as follows:
+
+[RESPONSE START]
+{response}
+[RESPONSE END]
+
+Please evaluate the quality of the response based on the following criteria:
+
+{criteria_list}
+
+Options:
+{options}
+
+For each criterion, provide a short explanation describing how you would evaluate the response based on that criterion. Then, provide your final rating for that criterion."""
+
+DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST = [
+    Criteria(
+        name="helpfulness",
+        description="Provides meaningful information and clear solutions that address the query.",
+        min_score=1,
+        max_score=7,
+    ),
+    Criteria(
+        name="relevance",
+        description="Stays on topic and directly relates to the query without unnecessary details.",
+        min_score=1,
+        max_score=7,
+    ),
+    Criteria(
+        name="conciseness",
+        description="Communicates clearly and efficiently, avoiding excess content while retaining substance.",
+        min_score=1,
+        max_score=7,
+    ),
+]
+
+# 7-point Likert scale.
+SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS = [
+    "Strongly Disagree",
+    "Disagree",
+    "Slightly Disagree",
+    "Neither Agree Nor Disagree",
+    "Slightly Agree",
+    "Agree",
+    "Strongly Agree",
+]
+
+# 6-point Likert scale.
+SIX_POINT_DIRECT_ASSESSMENT_OPTIONS = [
+    "Strongly Disagree",
+    "Disagree",
+    "Slightly Disagree",
+    "Slightly Agree",
+    "Agree",
+    "Strongly Agree",
+]
+
+# 5-point Likert scale.
+FIVE_POINT_DIRECT_ASSESSMENT_OPTIONS = [
+    "Strongly Disagree",
+    "Disagree",
+    "Neither Agree Nor Disagree",
+    "Agree",
+    "Strongly Agree",
+]
+
+# 4-point Likert scale.
+FOUR_POINT_DIRECT_ASSESSMENT_OPTIONS = [
+    "Strongly Disagree",
+    "Disagree",
+    "Agree",
+    "Strongly Agree",
+]
+
+# 3-point Likert scale.
+THREE_POINT_DIRECT_ASSESSMENT_OPTIONS = [
+    "Disagree",
+    "Neither Agree Nor Disagree",
+    "Agree",
+]
+
+# 2-point Likert scale.
+BINARY_DIRECT_ASSESSMENT_OPTIONS = [
+    "Disagree",
+    "Agree",
+]
+
+
+DEFAULT_PAIRWISE_COMPARISON_PROMPT = """We are trying to compare the quality of two responses to a user query.
+
+[USER PROMPT START]
+{prompt}
+[USER PROMPT END]
+
+[RESPONSE A START]
+{first_completion}
+[RESPONSE A END]
+
+[RESPONSE B START]
+{second_completion}
+[RESPONSE B END]
+
+Begin your evaluation by comparing the two responses and provide a short explanation. Some themes to consider in your evaluation: {themes_to_consider}.
+
+After providing your explanation, output your final verdict as one of the following options:
+{pairwise_comparison_options}
+"""
+
+DEFAULT_PAIRWISE_COMPARISON_THEMES_TO_CONSIDER = [
+    (
+        "helpfulness",
+        "Provides meaningful information and clear solutions that address the query.",
+    ),
+    (
+        "relevance",
+        "Stays on topic and directly relates to the query without unnecessary details.",
+    ),
+    (
+        "conciseness",
+        "Communicates clearly and efficiently, avoiding excess content while retaining substance.",
+    ),
+]
+
+# COARSE WITH TIE.
+DEFAULT_PAIRWISE_COMPARISON_OPTIONS = [
+    ("A>B", "Response A is better than Response B"),
+    ("B>A", "Response B is better than Response A"),
+    ("A=B", "Both responses are equally good"),
+]
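Note: the pairwise-comparison templates above are defined but not yet wired into app.py. A sketch of how they could be filled in; the joining of themes and options into strings is an assumption, not something this commit implements:

from prompts import (
    DEFAULT_PAIRWISE_COMPARISON_PROMPT,
    DEFAULT_PAIRWISE_COMPARISON_THEMES_TO_CONSIDER,
    DEFAULT_PAIRWISE_COMPARISON_OPTIONS,
)

# Hypothetical formatting of the template's placeholders.
themes = ", ".join(
    f"{name} ({description})"
    for name, description in DEFAULT_PAIRWISE_COMPARISON_THEMES_TO_CONSIDER
)
options = "\n".join(
    f"{label}: {description}"
    for label, description in DEFAULT_PAIRWISE_COMPARISON_OPTIONS
)

pairwise_prompt = DEFAULT_PAIRWISE_COMPARISON_PROMPT.format(
    prompt="What is the capital of France?",
    first_completion="Paris.",
    second_completion="The capital of France is Paris.",
    themes_to_consider=themes,
    pairwise_comparison_options=options,
)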