File size: 5,196 Bytes
35378f6
 
 
 
18d5ac3
 
35378f6
 
18d5ac3
35378f6
 
18d5ac3
 
 
35378f6
 
18d5ac3
35378f6
 
 
 
 
 
 
 
 
 
 
 
 
 
18d5ac3
35378f6
 
 
 
18d5ac3
 
 
 
 
 
 
35378f6
 
 
 
 
 
 
 
 
 
 
 
18d5ac3
35378f6
 
 
 
 
 
 
 
 
 
 
18d5ac3
35378f6
 
 
 
18d5ac3
35378f6
18d5ac3
 
 
35378f6
 
 
 
 
 
18d5ac3
35378f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18d5ac3
35378f6
18d5ac3
35378f6
 
 
 
 
18d5ac3
 
35378f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18d5ac3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import os
import pandas as pd
import requests, json
from io import StringIO
from datetime import datetime


def get_github_data():
    """
    Get data from csv files on Github
    Args:
        None
    Returns:
        latest_df: singular list containing dataframe of the latest version of the leaderboard with only 4 columns
        all_dfs: list of dataframes for previous versions + latest version including columns for all games
        all_vnames: list of the names for the previous versions + latest version (For Details and Versions Tab Dropdown)
        date: date of the newest listed version, formatted as "DD Mon YYYY"

        None is returned (after printing a message) when the version index
        cannot be read, lists no versions, or no CSV file could be downloaded.
    """
    uname = "clembench"
    repo = "clembench-runs"
    base_url = f"https://raw.githubusercontent.com/{uname}/{repo}/main/"
    json_url = base_url + "benchmark_runs.json"
    resp = requests.get(json_url)
    if resp.status_code != 200:
        print(f"Failed to read JSON file: Status Code : {resp.status_code}")
        return None

    versions = json.loads(resp.text)['versions']
    if not versions:
        print("Failed to read JSON file: no versions listed")
        return None

    # Sort by latest version. The original entries are kept (instead of
    # rebuilding names from floats) so that a name such as "v2" is not turned
    # into "v2.0", which would break both the date lookup and the CSV URL.
    versions = sorted(versions, key=lambda ver: float(ver['version'][1:]), reverse=True)

    # Get date of latest version; should be in YYYY/MM/DD format
    date_obj = datetime.strptime(versions[0]['date'], "%Y/%m/%d")
    date = date_obj.strftime("%d %b %Y")

    all_dfs = []
    all_vnames = []
    for ver in versions:
        version = ver['version']
        # Drop the leading path component of result_file and re-root the rest
        # under the version directory; each version uses its own path.
        csv_path = '/'.join(ver['result_file'].split('/')[1:])
        result_url = base_url + version + '/' + csv_path
        csv_response = requests.get(result_url)
        if csv_response.status_code != 200:
            print(f"Failed to read CSV file for version : {version}. Status Code : {csv_response.status_code}")
            continue
        df = pd.read_csv(StringIO(csv_response.text))
        df = process_df(df)
        df = df.sort_values(by=list(df.columns)[1], ascending=False)  # Sort by clemscore
        # Append name and dataframe together so they stay aligned even when
        # some version failed to download.
        all_dfs.append(df)
        all_vnames.append(version)

    if not all_dfs:
        print("Failed to read CSV files: no version could be loaded")
        return None

    # Only keep relevant columns for the main leaderboard
    newest = all_dfs[0]
    all_columns = list(newest.columns)
    keep_columns = all_columns[0:4]
    newest = newest.drop(columns=[c for c in all_columns if c not in keep_columns])

    latest_df = [newest]
    return latest_df, all_dfs, all_vnames, date


def process_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Process dataframe
    - Remove repetition in model names
    - Convert datatypes to sort by "float" instead of "str" for sorting
    - Update column names
    Args:
        df: Unprocessed Dataframe; the first column holds the model names and
            every other column must be convertible to float
    Returns:
        df: Processed Dataframe (a new object, the input is not modified)
    """
    # Work on a copy so the caller's dataframe is left untouched
    df = df.copy()

    # Change column type to float from str
    list_column_names = list(df.columns)
    model_col_name = list_column_names[0]
    for col in list_column_names:
        if col != model_col_name:
            df[col] = df[col].astype(float)

    # Remove repetition in model names, if any
    models_list = []
    for model_name in df[model_col_name]:
        splits = [part.replace('-t0.0', '') for part in model_name.split('--')]  # Drops the -t0.0 suffix
        # A name without "--" has a single part and is kept as is
        if len(splits) < 2 or splits[0] == splits[1]:
            models_list.append(splits[0])
        else:
            models_list.append(splits[0] + "--" + splits[1])
    df[model_col_name] = models_list

    # Update column names: the first four are fixed, the remaining game
    # columns look like "<game>,<metric>" and become "<Game><metric>"
    update = ['Model', 'Clemscore', '% Played', 'Quality Score']
    for col in list_column_names[4:]:
        splits = col.split(',')
        metric = splits[1] if len(splits) > 1 else ""
        update.append(splits[0].capitalize() + metric)

    # zip stops at the shorter list, so frames with fewer than four columns
    # are renamed as far as they go instead of raising IndexError
    map_cols = dict(zip(list_column_names, update))

    return df.rename(columns=map_cols)


def filter_search(df: pd.DataFrame, query: str) -> pd.DataFrame:
    """
    Filter the dataframe based on the search query
    Args:
        df: Unfiltered dataframe; the first column holds the model names
        query: a string of queries separated by ";"
    Return:
        filtered_df: Dataframe containing the rows whose model name contains
            at least one of the queries (case-insensitive). The unfiltered
            dataframe is returned when the query has no non-blank part.
    """
    # Blank fragments (e.g. from a trailing ";") are dropped; an empty string
    # is a substring of every name and would otherwise match all rows.
    terms = [part.strip().lower() for part in query.split(';')]
    terms = [term for term in terms if term]
    if not terms:
        return df

    model_col = list(df.columns)[0]
    matched_models = {
        model_name
        for model_name in df[model_col]
        if any(term in model_name.lower() for term in terms)
    }
    return df[df[model_col].isin(matched_models)]