from datetime import datetime, timezone
import json
import gradio as gr
import re
import random
from collections import defaultdict
import pandas as pd
import os
from gen_api_answer import get_model_response, parse_model_response
from common import *

# Model and ELO score data
DEFAULT_ELO = 1500  # Starting ELO for new models
K_FACTOR = 32  # Standard chess K-factor, adjust as needed
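# Elo refresher (standard formulas, used by vote() and calculate_elo_change() below):
#   expected score  E_A = 1 / (1 + 10 ** ((R_B - R_A) / 400))
#   rating update   R_A' = R_A + K * (S_A - E_A), with S_A = 1 for a win, 0.5 for a tie, 0 for a loss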
elo_scores = defaultdict(lambda: DEFAULT_ELO)
vote_counts = defaultdict(int)

# Load the model_data from JSONL
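# Each line of data/models.jsonl is expected to be a JSON object with the keys read below, e.g.:
#   {"name": "...", "organization": "...", "license": "...", "api_model": "..."}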
def load_model_data():
    model_data = {}
    try:
        with open('data/models.jsonl', 'r') as f:
            for line in f:
                model = json.loads(line)
                model_data[model['name']] = {
                    'organization': model['organization'],
                    'license': model['license'],
                    'api_model': model['api_model']
                }
    except FileNotFoundError:
        print("Warning: models.jsonl not found")
        return {}
    return model_data

model_data = load_model_data()

current_session_id = 0
voting_data = []

def get_new_session_id():
    global current_session_id
    current_session_id += 1
    return f"user{current_session_id}"
def store_vote_data(prompt, response_a, response_b, model_a, model_b, winner, judge_id):
    vote_entry = {
        "timestamp": datetime.now().isoformat(),
        "prompt": prompt,
        "response_a": response_a,
        "response_b": response_b,
        "model_a": model_a,
        "model_b": model_b,
        "winner": winner,
        "judge_id": judge_id,
    }
    voting_data.append(vote_entry)
    # Save to file after each vote
    with open('voting_data.json', 'w') as f:
        json.dump(voting_data, f, indent=2)
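# Note: the full in-memory voting_data list is rewritten to voting_data.json on every vote,
# so the file always mirrors the current process's vote history.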
def parse_variables(prompt):
    # Extract variables enclosed in double curly braces
    variables = re.findall(r'{{(.*?)}}', prompt)
    # Remove duplicates while preserving order
    seen = set()
    variables = [x.strip() for x in variables if not (x.strip() in seen or seen.add(x.strip()))]
    return variables
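# Example: parse_variables("Score the {{response}} to the {{input}}") -> ["response", "input"]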
def get_final_prompt(eval_prompt, variable_values):
    # Replace variables in the eval prompt with their values
    for var, val in variable_values.items():
        eval_prompt = eval_prompt.replace('{{' + var + '}}', val)
    return eval_prompt
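# Example: get_final_prompt("Judge this: {{response}}", {"response": "Paris"}) -> "Judge this: Paris"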
def submit_prompt(eval_prompt, *variable_values):
    try:
        variables = parse_variables(eval_prompt)
        variable_values_dict = {var: val for var, val in zip(variables, variable_values)}
        final_prompt = get_final_prompt(eval_prompt, variable_values_dict)

        models = list(model_data.keys())
        model1, model2 = random.sample(models, 2)
        model_a, model_b = (model1, model2) if random.random() < 0.5 else (model2, model1)

        response_a = get_model_response(model_a, model_data.get(model_a), final_prompt)
        response_b = get_model_response(model_b, model_data.get(model_b), final_prompt)

        return (
            response_a,
            response_b,
            gr.update(visible=True),
            gr.update(visible=True),
            model_a,
            model_b
        )
    except Exception as e:
        print(f"Error in submit_prompt: {str(e)}")
        return (
            "Error generating response",
            "Error generating response",
            gr.update(visible=False),
            gr.update(visible=False),
            None,
            None
        )

def vote(choice, model_a, model_b, prompt, response_a, response_b, judge_id):
    # Update ELO scores based on user choice
    elo_a = elo_scores[model_a]
    elo_b = elo_scores[model_b]

    # Calculate expected scores
    Ea = 1 / (1 + 10 ** ((elo_b - elo_a) / 400))
    Eb = 1 / (1 + 10 ** ((elo_a - elo_b) / 400))

    # Assign actual scores
    if choice == 'A':
        Sa, Sb = 1, 0
    elif choice == 'B':
        Sa, Sb = 0, 1
    else:
        Sa, Sb = 0.5, 0.5

    # Update scores and vote counts
    elo_scores[model_a] += K_FACTOR * (Sa - Ea)
    elo_scores[model_b] += K_FACTOR * (Sb - Eb)
    vote_counts[model_a] += 1
    vote_counts[model_b] += 1

    # Store the vote data
    store_vote_data(prompt, response_a, response_b, model_a, model_b, choice, judge_id)

    # Return updates for UI components
    return {
        action_buttons_row: gr.update(visible=False),
        model_name_a: gr.update(value=f"*Model: {model_a}*"),
        model_name_b: gr.update(value=f"*Model: {model_b}*"),
        send_btn: gr.update(interactive=True),
        regenerate_button: gr.update(visible=True, interactive=True)
    }
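# Note: the dictionary returned by vote() is keyed by Gradio components (action_buttons_row,
# model_name_a, ...) that are defined further down inside the gr.Blocks UI; the names resolve
# at call time, after the interface has been built.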
def get_leaderboard():
    # Generate leaderboard data
    leaderboard = []
    for model, elo in elo_scores.items():
        votes = vote_counts[model]
        ci = 1.96 * (400 / (votes + 1) ** 0.5)  # Approximate 95% confidence interval
        data = {
            'Model': model,
            'ELO Score': f"{elo:.2f}",
            '95% CI': f"±{ci:.2f}",
            '# Votes': votes,
            'Organization': model_data[model]['organization'],
            'License': model_data[model]['license'],
        }
        leaderboard.append(data)
    # Sort by ELO score
    leaderboard.sort(key=lambda x: float(x['ELO Score']), reverse=True)
    return leaderboard

def regenerate_prompt(model_a, model_b, eval_prompt, *variable_values):
    variables = parse_variables(eval_prompt)
    variable_values_dict = {var: val for var, val in zip(variables, variable_values)}
    final_prompt = get_final_prompt(eval_prompt, variable_values_dict)

    # Get available models excluding the previous ones
    available_models = [m for m in model_data.keys() if m not in (model_a, model_b)]

    # If we have enough models for new pairs
    if len(available_models) >= 2:
        model1, model2 = random.sample(available_models, 2)
    else:
        # Fallback to allowing previous models if necessary
        model1, model2 = random.sample(list(model_data.keys()), 2)

    response_a = get_model_response(model1, model_data.get(model1), final_prompt)
    response_b = get_model_response(model2, model_data.get(model2), final_prompt)

    # Parse the responses
    score_a, critique_a = parse_model_response(response_a)
    score_b, critique_b = parse_model_response(response_b)

    return (
        score_a,                              # score_a textbox
        critique_a,                           # critique_a textbox
        score_b,                              # score_b textbox
        critique_b,                           # critique_b textbox
        gr.update(visible=True),              # action_buttons_row
        gr.update(value="*Model: Unknown*"),  # model_name_a
        gr.update(value="*Model: Unknown*"),  # model_name_b
        model1,                               # model_a_state
        model2                                # model_b_state
    )

def calculate_elo_change(rating_a, rating_b, winner):
    """Calculate ELO rating changes for both players."""
    expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
    expected_b = 1 - expected_a

    if winner == "A":
        score_a, score_b = 1, 0
    elif winner == "B":
        score_a, score_b = 0, 1
    else:  # Handle ties
        score_a, score_b = 0.5, 0.5

    change_a = K_FACTOR * (score_a - expected_a)
    change_b = K_FACTOR * (score_b - expected_b)
    return change_a, change_b
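# Example: two equally rated models (expected score 0.5 each) with K_FACTOR = 32:
#   calculate_elo_change(1500, 1500, "A") -> (16.0, -16.0)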
def update_leaderboard():
    """Calculate current ELO ratings from voting history."""
    ratings = defaultdict(lambda: DEFAULT_ELO)
    matches = defaultdict(int)
    wins = defaultdict(int)

    # Load voting data
    try:
        with open('voting_data.json', 'r') as f:
            voting_data = json.load(f)
    except FileNotFoundError:
        return pd.DataFrame()

    # Process each vote (local name chosen so we don't shadow the vote() handler)
    for vote_entry in voting_data:
        model_a = vote_entry['model_a']
        model_b = vote_entry['model_b']
        winner = vote_entry['winner']

        # Skip if models aren't in current model_data
        if model_a not in model_data or model_b not in model_data:
            continue

        # Update match counts
        matches[model_a] += 1
        matches[model_b] += 1
        if winner == "A":
            wins[model_a] += 1
        elif winner == "B":
            wins[model_b] += 1
        else:  # Handle ties
            wins[model_a] += 0.5
            wins[model_b] += 0.5

        # Update ELO ratings
        change_a, change_b = calculate_elo_change(ratings[model_a], ratings[model_b], winner)
        ratings[model_a] += change_a
        ratings[model_b] += change_b

    # Create leaderboard DataFrame
    leaderboard_data = []
    for model in model_data.keys():  # Only include current models
        win_rate = (wins[model] / matches[model] * 100) if matches[model] > 0 else 0
        ci = 1.96 * (400 / (matches[model] + 1) ** 0.5) if matches[model] > 0 else 0  # Confidence interval
        leaderboard_data.append({
            'Model': model,
            'ELO': round(ratings[model], 1),
            '95% CI': f"±{ci:.1f}",
            'Matches': matches[model],
            'Win Rate': f"{win_rate:.1f}%",
            'Organization': model_data[model]['organization'],
            'License': model_data[model]['license']
        })

    # Sort by ELO rating
    df = pd.DataFrame(leaderboard_data)
    return df.sort_values('ELO', ascending=False).reset_index(drop=True)
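# Note: update_leaderboard() recomputes ratings from voting_data.json and is independent of the
# in-memory elo_scores/vote_counts used by vote() and get_leaderboard(); the two paths can diverge.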
# Build a leaderboard Dataframe component from the voting history
def display_leaderboard():
    df = update_leaderboard()
    return gr.DataFrame(
        value=df,
        headers=['Model', 'ELO', '95% CI', 'Matches', 'Win Rate', 'Organization', 'License'],
        datatype=['str', 'number', 'str', 'number', 'str', 'str', 'str'],
        row_count=(len(df) + 1, 'dynamic'),
    )

# Module-level leaderboard table definition; the Leaderboard tab re-defines leaderboard_table inside the Blocks UI below
leaderboard_table = gr.Dataframe(
    headers=['Model', 'ELO', '95% CI', 'Matches', 'Organization', 'License'],
    datatype=['str', 'number', 'str', 'number', 'str', 'str']
)

def get_leaderboard_stats():
    """Get summary statistics for the leaderboard."""
    try:
        with open('voting_data.json', 'r') as f:
            voting_data = json.load(f)

        total_votes = len(voting_data)
        total_models = len(model_data)
        last_updated = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")

        return f"""
### Leaderboard Stats

- **Total Models**: {total_models}
- **Total Votes**: {total_votes}
- **Last Updated**: {last_updated}
"""
    except FileNotFoundError:
        return "No voting data available"

def initialize_voting_data():
    """Initialize or clear the voting data file."""
    empty_data = []
    with open('voting_data.json', 'w') as f:
        json.dump(empty_data, f)
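# Note: because initialize_voting_data() is called at startup below, the vote history (and the
# file-based leaderboard derived from it) is wiped every time the app is launched directly.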
# Clear the voting data file when the app is run directly, before the Gradio interface is built
if __name__ == "__main__":
    initialize_voting_data()

# Example evaluation metrics data
EXAMPLE_METRICS = {
    "Hallucination": {
        "prompt": DEFAULT_EVAL_PROMPT,  # We'll replace these with actual examples
        "input": DEFAULT_INPUT,
        "response": DEFAULT_RESPONSE
    },
    "Precision": {
        "prompt": DEFAULT_EVAL_PROMPT,
        "input": DEFAULT_INPUT,
        "response": DEFAULT_RESPONSE
    },
    "Recall": {
        "prompt": DEFAULT_EVAL_PROMPT,
        "input": DEFAULT_INPUT,
        "response": DEFAULT_RESPONSE
    },
    "Logical coherence": {
        "prompt": DEFAULT_EVAL_PROMPT,
        "input": DEFAULT_INPUT,
        "response": DEFAULT_RESPONSE
    },
    "Faithfulness": {
        "prompt": DEFAULT_EVAL_PROMPT,
        "input": DEFAULT_INPUT,
        "response": DEFAULT_RESPONSE
    }
}

def set_example_metric(metric_name):
    if metric_name == "Custom":
        return [
            DEFAULT_EVAL_PROMPT,
            DEFAULT_INPUT,
            DEFAULT_RESPONSE
        ]

    metric_data = EXAMPLE_METRICS[metric_name]
    return [
        metric_data["prompt"],
        metric_data["input"],
        metric_data["response"]
    ]

# Select random metric at startup
def get_random_metric():
    metrics = list(EXAMPLE_METRICS.keys())
    return set_example_metric(random.choice(metrics))

with gr.Blocks(theme='default', css=CSS_STYLES) as demo:
    judge_id = gr.State(get_new_session_id())
    gr.Markdown(MAIN_TITLE)
    gr.Markdown(HOW_IT_WORKS)

    with gr.Tabs():
        with gr.TabItem("Judge Arena"):
            with gr.Row():
                with gr.Column():
                    gr.Markdown(BATTLE_RULES)
                    gr.Markdown(EVAL_DESCRIPTION)

            # Add Example Metrics Section
            with gr.Accordion("Example evaluation metrics", open=True):
                with gr.Row():
                    custom_btn = gr.Button("Custom", variant="secondary")
                    hallucination_btn = gr.Button("Hallucination")
                    precision_btn = gr.Button("Precision")
                    recall_btn = gr.Button("Recall")
                    coherence_btn = gr.Button("Logical coherence")
                    faithfulness_btn = gr.Button("Faithfulness")
            # Eval Prompt and Variables side by side
            with gr.Row():
                # Left column - Eval Prompt
                with gr.Column(scale=1):
                    eval_prompt = gr.TextArea(
                        label="Evaluator Prompt",
                        lines=1,
                        value=DEFAULT_EVAL_PROMPT,
                        placeholder="Type your eval prompt here... denote variables in {{curly brackets}} to be populated on the right.",
                        show_label=True
                    )

                # Right column - Variable Mapping
                with gr.Column(scale=1):
                    gr.Markdown("### Sample to test the evaluator")
                    # Create inputs for up to 5 variables, with first two visible by default
                    variable_rows = []
                    for i in range(5):
                        initial_visibility = i < 2
                        with gr.Group(visible=initial_visibility) as var_row:
                            # Variable input with direct label
                            initial_value = DEFAULT_INPUT if i == 0 else DEFAULT_RESPONSE
                            initial_label = "input" if i == 0 else "response" if i == 1 else f"variable_{i+1}"
                            var_input = gr.Textbox(
                                label=initial_label,
                                value=initial_value,
                                container=True
                            )
                        variable_rows.append((var_row, var_input))
            # Send button
            with gr.Row(elem_classes="send-button-row"):
                send_btn = gr.Button(
                    value="Test the evaluators",
                    variant="primary",
                    size="lg",
                    scale=1
                )

            # Add divider heading for model outputs
            gr.Markdown(VOTING_HEADER)

            # Model Responses side-by-side
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### Model A")
                    score_a = gr.Textbox(label="Score", interactive=False)
                    critique_a = gr.TextArea(label="Critique", lines=8, interactive=False)
                    model_name_a = gr.Markdown("*Model: Unknown*")
                with gr.Column():
                    gr.Markdown("### Model B")
                    score_b = gr.Textbox(label="Score", interactive=False)
                    critique_b = gr.TextArea(label="Critique", lines=8, interactive=False)
                    model_name_b = gr.Markdown("*Model: Unknown*")

            # Initially hide vote buttons and regenerate button
            with gr.Row(visible=False) as action_buttons_row:
                vote_a = gr.Button("Choose A", variant="primary")
                vote_tie = gr.Button("Tie", variant="secondary")
                vote_b = gr.Button("Choose B", variant="primary")
            regenerate_button = gr.Button("Regenerate with different models", variant="secondary", visible=False)

            # Add spacing and acknowledgements at the bottom
            gr.Markdown(ACKNOWLEDGEMENTS)
with gr.TabItem("Leaderboard"): | |
refresh_button = gr.Button("Refresh") | |
stats_display = gr.Markdown() | |
leaderboard_table = gr.Dataframe( | |
headers=['Model', 'ELO', '95% CI', 'Matches', 'Organization', 'License'], | |
datatype=['str', 'number', 'str', 'number', 'str', 'str'] | |
) | |
with gr.TabItem("Policy"): | |
gr.Markdown(POLICY_CONTENT) | |
# Define state variables for model tracking | |
model_a_state = gr.State() | |
model_b_state = gr.State() | |
    # Update variable inputs based on the eval prompt
    def update_variables(eval_prompt):
        variables = parse_variables(eval_prompt)
        updates = []
        for i in range(5):
            var_row, var_input = variable_rows[i]
            if i < len(variables):
                # Set default values for 'input' and 'response', otherwise leave empty
                if variables[i] == "input":
                    initial_value = DEFAULT_INPUT
                elif variables[i] == "response":
                    initial_value = DEFAULT_RESPONSE
                else:
                    initial_value = ""  # Empty for new variables
                updates.extend([
                    gr.update(visible=True),  # var_row
                    gr.update(value=initial_value, label=variables[i], visible=True)  # var_input with dynamic label
                ])
            else:
                updates.extend([
                    gr.update(visible=False),  # var_row
                    gr.update(value="", visible=False)  # var_input
                ])
        return updates

    eval_prompt.change(fn=update_variables, inputs=eval_prompt, outputs=[item for sublist in variable_rows for item in sublist])
    # Regenerate button functionality
    regenerate_button.click(
        fn=regenerate_prompt,
        inputs=[model_a_state, model_b_state, eval_prompt] + [var_input for _, var_input in variable_rows],
        outputs=[
            score_a,
            critique_a,
            score_b,
            critique_b,
            action_buttons_row,
            model_name_a,
            model_name_b,
            model_a_state,
            model_b_state
        ]
    )

    # Update model names after responses are generated
    def update_model_names(model_a, model_b):
        return gr.update(value=f"*Model: {model_a}*"), gr.update(value=f"*Model: {model_b}*")

    # Store the last submitted prompt and variables for comparison
    last_submission = gr.State({})
    # Vote button click handlers
    vote_a.click(
        fn=lambda *args: vote('A', *args),
        inputs=[model_a_state, model_b_state, eval_prompt, score_a, score_b, judge_id],
        outputs=[action_buttons_row, model_name_a, model_name_b, send_btn, regenerate_button]
    )
    vote_b.click(
        fn=lambda *args: vote('B', *args),
        inputs=[model_a_state, model_b_state, eval_prompt, score_a, score_b, judge_id],
        outputs=[action_buttons_row, model_name_a, model_name_b, send_btn, regenerate_button]
    )
    vote_tie.click(
        fn=lambda *args: vote('Tie', *args),
        inputs=[model_a_state, model_b_state, eval_prompt, score_a, score_b, judge_id],
        outputs=[action_buttons_row, model_name_a, model_name_b, send_btn, regenerate_button]
    )
    # Send button handler: generate responses and remember the submitted inputs
    def submit_and_store(prompt, *variables):
        # Create a copy of the current submission
        current_submission = {"prompt": prompt, "variables": variables}

        # Get the responses
        response_a, response_b, buttons_visible, regen_visible, model_a, model_b = submit_prompt(prompt, *variables)

        # Parse the responses
        score_a, critique_a = parse_model_response(response_a)
        score_b, critique_b = parse_model_response(response_b)

        # Update the last_submission state with the current values
        # (note: assigning to .value mutates the component's default, which is shared across
        # sessions rather than tracked per session -- acceptable for this demo)
        last_submission.value = current_submission

        return (
            score_a,
            critique_a,
            score_b,
            critique_b,
            buttons_visible,
            gr.update(visible=True, interactive=True),  # Show and enable regenerate button
            model_a,
            model_b,
            gr.update(value="*Model: Unknown*"),
            gr.update(value="*Model: Unknown*")
        )

    send_btn.click(
        fn=submit_and_store,
        inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
        outputs=[
            score_a,
            critique_a,
            score_b,
            critique_b,
            action_buttons_row,
            regenerate_button,
            model_a_state,
            model_b_state,
            model_name_a,  # Model name outputs
            model_name_b
        ]
    )
    # Input change handlers: keep send enabled, disable regenerate once inputs change
    def handle_input_changes(prompt, *variables):
        """Enable send button and manage regenerate button based on input changes"""
        last_inputs = last_submission.value
        current_inputs = {"prompt": prompt, "variables": variables}
        inputs_changed = last_inputs != current_inputs
        return [
            gr.update(interactive=True),               # send button always enabled
            gr.update(interactive=not inputs_changed)  # regenerate button disabled if inputs changed
        ]

    # Register the change handlers for prompt and variables
    eval_prompt.change(
        fn=handle_input_changes,
        inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
        outputs=[send_btn, regenerate_button]
    )
    for _, var_input in variable_rows:
        var_input.change(
            fn=handle_input_changes,
            inputs=[eval_prompt] + [var_input for _, var_input in variable_rows],
            outputs=[send_btn, regenerate_button]
        )
    # Refresh the leaderboard table and stats
    def refresh_leaderboard():
        leaderboard = get_leaderboard()
        data = [
            [
                entry['Model'],
                float(entry['ELO Score']),
                entry['95% CI'],
                entry['# Votes'],
                entry['Organization'],
                entry['License']
            ] for entry in leaderboard
        ]
        stats = get_leaderboard_stats()
        return [gr.update(value=data), gr.update(value=stats)]

    refresh_button.click(
        fn=refresh_leaderboard,
        inputs=None,
        outputs=[leaderboard_table, stats_display]
    )

    # Populate the leaderboard when the app loads
    demo.load(
        fn=refresh_leaderboard,
        inputs=None,
        outputs=[leaderboard_table, stats_display]
    )
    # Click handlers for metric buttons
    custom_btn.click(
        fn=lambda: set_example_metric("Custom"),
        outputs=[eval_prompt, variable_rows[0][1], variable_rows[1][1]]
    )
    hallucination_btn.click(
        fn=lambda: set_example_metric("Hallucination"),
        outputs=[eval_prompt, variable_rows[0][1], variable_rows[1][1]]
    )
    precision_btn.click(
        fn=lambda: set_example_metric("Precision"),
        outputs=[eval_prompt, variable_rows[0][1], variable_rows[1][1]]
    )
    recall_btn.click(
        fn=lambda: set_example_metric("Recall"),
        outputs=[eval_prompt, variable_rows[0][1], variable_rows[1][1]]
    )
    coherence_btn.click(
        fn=lambda: set_example_metric("Logical coherence"),
        outputs=[eval_prompt, variable_rows[0][1], variable_rows[1][1]]
    )
    faithfulness_btn.click(
        fn=lambda: set_example_metric("Faithfulness"),
        outputs=[eval_prompt, variable_rows[0][1], variable_rows[1][1]]
    )

    # Set random metric at startup
    demo.load(
        fn=get_random_metric,
        outputs=[eval_prompt, variable_rows[0][1], variable_rows[1][1]]
    )

demo.launch()