"""Helpers for loading Weights & Biases runs into pandas and ranking models."""

import pandas as pd

try:
    import wandb
except ImportError as _exc:
    # wandb is only needed by get_wandb_data(); keep the pure-pandas helper
    # importable without it and report the problem when wandb is actually used.
    wandb = None
    _WANDB_IMPORT_ERROR = _exc
else:
    _WANDB_IMPORT_ERROR = None


def get_wandb_data(entity: str, project: str, api_key: str, job_type: str) -> pd.DataFrame:
    """Fetch the runs of a W&B project as one row per run.

    Args:
        entity: W&B entity that owns the project.
        project: Project name; runs are read from ``"<entity>/<project>"``.
        api_key: API key passed to ``wandb.Api``.
        job_type: Only runs whose ``jobType`` equals this value are returned.

    Returns:
        A DataFrame indexed by run name whose columns are the flattened
        summary values followed by the flattened config values.

    Raises:
        ImportError: If the ``wandb`` package is not installed.
    """
    if wandb is None:
        raise ImportError(
            "The 'wandb' package is required to fetch runs."
        ) from _WANDB_IMPORT_ERROR

    api = wandb.Api(api_key=api_key)

    # The project is addressed as "<entity>/<project>"; the filter restricts
    # the query to runs of the requested job type.
    filter_dict = {"jobType": job_type}
    runs = api.runs(f"{entity}/{project}", filters=filter_dict)

    summary_list, config_list, name_list = [], [], []
    for run in runs:
        # .summary contains the output keys/values for metrics like accuracy.
        # We call ._json_dict to omit large files.
        summary_list.append(run.summary._json_dict)

        # .config contains the hyperparameters.
        # We remove special values that start with _.
        config_list.append(
            {k: v for k, v in run.config.items() if not k.startswith("_")}
        )

        # .name is the human-readable name of the run.
        name_list.append(run.name)

    # Nested dicts are flattened into dotted column names (one level for the
    # summary, two for the config).
    summary_df = pd.json_normalize(summary_list, max_level=1)
    config_df = pd.json_normalize(config_list, max_level=2)

    # Both frames have one row per run in the same order, so a column-wise
    # concat lines them up; the run names then become the index.
    runs_df = pd.concat([summary_df, config_df], axis=1)
    runs_df.index = name_list
    return runs_df


def get_leaderboard(runs_df: pd.DataFrame, metrics: list[str]) -> pd.DataFrame:
    """Count, per metric, how often each model is the best within a group.

    Rows are grouped by the ``unique_id`` column. Within each group the row
    with the smallest value of a metric wins (lower is treated as better) and
    the winning row's ``model`` gets one point for that metric. Ties go to the
    first row in the group.

    Args:
        runs_df: Frame with ``model`` and ``unique_id`` columns plus one
            column per entry in ``metrics``.
        metrics: Names of the metric columns to rank on.

    Returns:
        An integer DataFrame indexed by model with one column per metric,
        sorted in descending order by the metric columns (in the given order).
    """
    # Start from an integer frame of zeros rather than NaN + fillna, which
    # yields object dtype and relies on deprecated downcasting.
    leaderboard = pd.DataFrame(
        0, index=runs_df["model"].unique(), columns=metrics, dtype=int
    )

    for _, group_df in runs_df.groupby("unique_id"):
        for column in leaderboard.columns:
            scores = group_df[column]
            # No run in this group reported the metric: nobody wins.
            if scores.isna().all():
                continue
            # Select by position, not by index label: run names used as the
            # index are not guaranteed to be unique.
            best_model = group_df["model"].iloc[scores.argmin()]
            leaderboard.loc[best_model, column] += 1

    # kind="stable" keeps tie order deterministic when sorting on one column.
    leaderboard = leaderboard.sort_values(
        by=list(leaderboard.columns), ascending=False, kind="stable"
    )
    return leaderboard