Spaces:
Runtime error
Runtime error
File size: 4,969 Bytes
a8ede2f dc1ba50 a8ede2f dc1ba50 a8ede2f dc1ba50 a8ede2f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.graph_objs import Figure
from src.leaderboard.filter_models import FLAGGED_MODELS
from src.display.utils import AutoEvalColumn, Tasks, Task, BENCHMARK_COLS
from src.leaderboard.read_evals import EvalResult
def create_scores_df(raw_data: list[EvalResult]) -> dict[str, pd.DataFrame]:
    """
    Build, for every benchmark column and for the average, the history of record-setting scores.

    Results are visited in ascending ``date`` order and models listed in ``FLAGGED_MODELS`` are
    ignored. A result is recorded only when its score is strictly greater than the best score seen
    so far for that task (the running best starts at 0). When a new record has the same date as the
    previously recorded entry, it replaces that entry instead of adding a second point for the date.

    :param raw_data: Evaluation results. They are converted with ``pd.DataFrame(raw_data)`` and the
        resulting frame must expose ``date``, ``full_model`` and ``results`` columns, where
        ``results`` maps benchmark names to scores.
    :return: A dict keyed by column name (every entry of ``BENCHMARK_COLS`` plus the average
        column). Each value is a DataFrame built from records with ``model``, ``date`` and
        ``score`` keys; it is empty when no score exceeded 0 for that column.
    """
    # Step 1: Sort the results by 'date'. Dates are compared as stored; no datetime
    # conversion is applied here.
    results_df = pd.DataFrame(raw_data).sort_values(by="date")

    # Step 2: Drop flagged models once. The filter does not depend on the task, so there is
    # no need to re-evaluate it inside the per-task loop.
    candidates = [
        (row["full_model"], row["date"], row["results"])
        for _, row in results_df.iterrows()
        if row["full_model"] not in FLAGGED_MODELS
    ]

    # Step 3: Initialize the scores dictionary
    scores = {k: [] for k in BENCHMARK_COLS + [AutoEvalColumn.average.name]}

    # Step 4: For every task, walk the results chronologically and keep each new maximum
    tasks = [t.value for t in Tasks] + [Task("Average", "avg", AutoEvalColumn.average.name)]
    for task in tasks:
        column = task.col_name
        is_average = task.benchmark == "Average"
        current_max = 0
        last_date = ""
        for current_model, current_date, results in candidates:
            if is_average:
                current_score = np.mean(list(results.values()))
            else:
                current_score = results[task.benchmark]

            # Written as a positive test on purpose: a NaN score compares False and is skipped.
            if current_score > current_max:
                record = {"model": current_model, "date": current_date, "score": current_score}
                if current_date == last_date and scores[column]:
                    # Same date as the last record: overwrite it with the better score.
                    scores[column][-1] = record
                else:
                    scores[column].append(record)
                current_max = current_score
                last_date = current_date

    # Step 5: Return all record lists as DataFrames
    return {k: pd.DataFrame(v) for k, v in scores.items()}
def create_plot_df(scores_df: dict[str, pd.DataFrame]) -> pd.DataFrame:
    """
    Stack the per-column score DataFrames into one long DataFrame suitable for plotting.

    :param scores_df: Mapping from column name to that column's DataFrame of scores. It must
        contain an entry for every name in ``BENCHMARK_COLS`` and for
        ``AutoEvalColumn.average.name``.
    :return: The concatenation of those DataFrames with an added ``task`` column holding the
        name of the column each row came from, sorted by ``date`` and re-indexed from 0.
    """
    # reset_index/assign both return new frames, so the caller's DataFrames are left untouched.
    frames = [
        scores_df[col].reset_index(drop=True).assign(task=col)
        for col in BENCHMARK_COLS + [AutoEvalColumn.average.name]
    ]

    # Concatenate all the tagged DataFrames, then order the rows by 'date'
    concat_df = pd.concat(frames, ignore_index=True)
    return concat_df.sort_values(by="date").reset_index(drop=True)
def create_metric_plot_obj(df: pd.DataFrame, metrics: list[str], title: str) -> Figure:
    """
    Create a Plotly figure object with one line (with markers) per selected metric.

    :param df: The DataFrame containing the metric values, names, and dates. The columns
        ``task``, ``date``, ``score`` and ``model`` are read.
    :param metrics: A list of strings representing the names of the metrics
        to be included in the plot; rows whose ``task`` is not listed are dropped.
    :param title: A string representing the title of the plot.
    :return: A Plotly figure object with lines representing the metrics and the
        y-axis range fixed to [0, 100]. No human-baseline lines are drawn.
    """
    # Filter the DataFrame based on the specified metrics
    selected = df[df["task"].isin(metrics)]

    # Create a line figure using plotly express with specified markers and custom data
    fig = px.line(
        selected,
        x="date",
        y="score",
        color="task",
        markers=True,
        custom_data=["task", "score", "model"],
        title=title,
    )

    # Update hovertemplate for better hover interaction experience;
    # the customdata indices follow the order of the custom_data list above.
    fig.update_traces(
        hovertemplate="<br>".join(
            [
                "Model Name: %{customdata[2]}",
                "Metric Name: %{customdata[0]}",
                "Date: %{x}",
                "Metric Value: %{y}",
            ]
        )
    )

    # Update the range of the y-axis
    fig.update_layout(yaxis_range=[0, 100])

    return fig
# Example Usage:
#   plot_df = create_plot_df(create_scores_df(raw_data))
#   chart = create_metric_plot_obj(plot_df, ["ARC", "HellaSwag", "MMLU", "TruthfulQA"], "Graph Title")
|