### REQUIRED OUTPUT ###
# A list of version names -> v1.6, v1.6_multimodal, v1.6_quantized, v1.5, v0.9, etc.
# A corresponding DataFrame for each version
import requests
import pandas as pd
from datetime import datetime
from io import StringIO

from src.leaderboard_utils import process_df
from src.assets.text_content import REPO, BENCHMARK_FILE
VARIANTS = ['ascii', 'backends', 'quantized'] # Include other variants if added in the main clembench-runs repo
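# For each benchmark version, variant results are looked up at
# "{REPO}{version}_{suffix}/results.csv" (e.g., hypothetically,
# ".../v1.6_quantized/results.csv"); versions without a given variant are skipped.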

def get_version_data():
    """
    Read and process the results CSV files of all available benchmark versions
    hosted on GitHub - https://github.com/clembench/clembench-runs

    Returns:
        version_data (dict): A dictionary with two parallel lists:
            'versions' - metadata dicts (name, plus release date and
            last-updated date where available) for each version and variant.
            'dataframes' - the processed, clemscore-sorted DataFrame for the
            corresponding entry in 'versions'.
        Returns None if the benchmark JSON file cannot be fetched.
    """
    base_repo = REPO
    json_url = base_repo + BENCHMARK_FILE
    response = requests.get(json_url)

    # Check if the JSON file request was successful
    if response.status_code != 200:
        print(f"Failed to read JSON file: Status Code: {response.status_code}")
        return None

    json_data = response.json()
    versions = json_data['versions']
    version_names = sorted(
        [ver['version'] for ver in versions],
        key=lambda v: list(map(int, v[1:].split('_')[0].split('.'))),
        reverse=True
    )
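    # The key above strips the leading 'v' and any variant suffix before
    # comparing numerically: "v1.6_quantized" maps to [1, 6], so a name like
    # "v1.10" (hypothetical) is treated as newer than "v1.9".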
    version_data = {
        'versions': [],
        'dataframes': []
    }
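    # 'versions' and 'dataframes' are parallel lists: the metadata at index i
    # of 'versions' describes the DataFrame at index i of 'dataframes'.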
    for version in version_names:
        base_url = f"{base_repo}{version}/results.csv"
        response = requests.get(base_url)

        if response.status_code == 200:
            df = pd.read_csv(StringIO(response.text))
            df = process_df(df)
            df = df.sort_values(by=df.columns[1], ascending=False)  # Sort by clemscore column
            version_data['dataframes'].append(df)

            metadata = {
                'name': version,
                'last_updated': [datetime.strptime(v['last_updated'], '%Y-%m-%d').strftime("%d %b %Y") for v in versions if v['version'] == version],
                'release_date': [datetime.strptime(v['release_date'], '%Y-%m-%d').strftime("%d %b %Y") for v in versions if v['version'] == version]
            }
            version_data['versions'].append(metadata)
        # Look for variant results files
        version = version.split('_')[0]  # Strip any suffix such as _multimodal before checking variants
        for suffix in VARIANTS:
            base_url = f"{base_repo}{version}_{suffix}/results.csv"
            response = requests.get(base_url)

            if response.status_code == 200:
                df = pd.read_csv(StringIO(response.text))
                df = process_df(df)
                df = df.sort_values(by=df.columns[1], ascending=False)  # Sort by clemscore column
                version_data['dataframes'].append(df)

                metadata = {
                    'name': version + "_" + suffix  # Release date and last updated are skipped; variants are not listed in benchmark_runs.json
                }
                version_data['versions'].append(metadata)

    return version_data

if __name__ == "__main__":
    version_data = get_version_data()
    if version_data is not None:
        print(version_data['versions'])
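        # A minimal consumption sketch (relies only on the structure built by
        # get_version_data): pair each metadata entry with its DataFrame.
        for meta, df in zip(version_data['versions'], version_data['dataframes']):
            print(meta['name'], df.shape)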