Spaces:
Running
Running
File size: 4,707 Bytes
35378f6 923aff9 35378f6 18d5ac3 54f2589 35378f6 18d5ac3 54f2589 923aff9 18d5ac3 923aff9 18d5ac3 54f2589 923aff9 54f2589 923aff9 54f2589 923aff9 fa0181f 923aff9 fa0181f 923aff9 54f2589 fa0181f 923aff9 54f2589 fa0181f 54f2589 fa0181f 54f2589 fa0181f 54f2589 923aff9 35378f6 18d5ac3 35378f6 18d5ac3 923aff9 35378f6 923aff9 35378f6 923aff9 35378f6 18d5ac3 35378f6 923aff9 35378f6 923aff9 35378f6 18d5ac3 923aff9 18d5ac3 923aff9 35378f6 923aff9 18d5ac3 923aff9 35378f6 923aff9 18d5ac3 fa0181f 54f2589 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
import os
import pandas as pd
import requests
import json
from io import StringIO
from datetime import datetime
from src.assets.text_content import REPO, BENCHMARK_FILE
def get_github_data():
    """
    Read and process leaderboard data from CSV files hosted on GitHub.
    - https://github.com/clembench/clembench-runs (REPO)
    Set the path in src/assets/text_content/REPO

    The benchmark index at ``REPO + BENCHMARK_FILE`` lists the available
    versions. For every version, ``{REPO}{version}/results.csv`` is downloaded,
    cleaned with ``process_df`` and sorted by its second column (Clemscore),
    highest first. Versions are visited newest first (by the numeric part of
    their name), so index 0 of each list is the most recent version.

    Returns:
        github_data (dict): Dictionary containing:
            - "text": dict for versions whose name does not contain "multimodal".
            - "multimodal": dict for versions whose name contains "multimodal".
          Each of these dicts holds two parallel lists:
            - "dataframes": one processed DataFrame per version.
            - "version_data": one dict per version with "name", "last_updated"
              and "release_date"; the two dates are lists of strings formatted
              as "DD Mon YYYY" (e.g. "05 Jan 2024").

        If the benchmark index cannot be fetched (non-200 response), a message
        is printed and the tuple ``(None, None, None, None)`` is returned
        instead. NOTE(review): this does not match the success shape; it is
        kept as-is for backward compatibility with existing callers.

    Raises:
        requests.exceptions.RequestException: On network failure or when a
            request exceeds the timeout.
    """
    # Without a timeout, requests waits indefinitely on a stalled connection.
    timeout_seconds = 30

    json_url = REPO + BENCHMARK_FILE
    response = requests.get(json_url, timeout=timeout_seconds)

    # Check if the JSON file request was successful
    if response.status_code != 200:
        print(f"Failed to read JSON file - {BENCHMARK_FILE} in repo {REPO}: Status Code: {response.status_code}")
        return None, None, None, None

    versions = response.json()['versions']

    # Sort the versions in benchmark by latest first. Names look like
    # "v1.6" or "v1.6_multimodal": drop the leading "v" and any "_suffix",
    # then compare the dotted numbers numerically.
    version_names = sorted(
        (ver['version'] for ver in versions),
        key=lambda v: list(map(int, v[1:].split('_')[0].split('.'))),
        reverse=True,
    )

    # Collect DataFrames, split into text and multimodal. Any version whose
    # name does not contain "multimodal" is treated as a text version.
    text_data = {'version_data': [], 'dataframes': []}
    multimodal_data = {'version_data': [], 'dataframes': []}

    for version in version_names:
        results_url = f"{REPO}{version}/results.csv"
        csv_response = requests.get(results_url, timeout=timeout_seconds)
        if csv_response.status_code != 200:
            # Best effort: a missing results file skips this version only.
            print(f"Failed to read results for version {version}: Status Code: {csv_response.status_code}")
            continue

        df = process_df(pd.read_csv(StringIO(csv_response.text)))
        df = df.sort_values(by=df.columns[1], ascending=False)  # Sort by Clemscore

        # Index entries describing this version (normally exactly one).
        entries = [v for v in versions if v['version'] == version]
        version_data = {
            'name': version,
            'last_updated': [
                datetime.strptime(v['last_updated'], '%Y-%m-%d').strftime("%d %b %Y")
                for v in entries
            ],
            'release_date': [
                datetime.strptime(v['release_date'], '%Y-%m-%d').strftime("%d %b %Y")
                for v in entries
            ],
        }

        target = multimodal_data if 'multimodal' in version else text_data
        target['dataframes'].append(df)
        target['version_data'].append(version_data)

    return {
        'text': text_data,
        'multimodal': multimodal_data,
    }
def process_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean a raw results dataframe for display.

    - Convert every column except the first to numeric, so sorting is done on
      numbers instead of strings; values that cannot be parsed become NaN.
    - Clean the model names in the first column: strip the literal "-t0.0"
      marker and collapse repeated "--"-separated segments
      (e.g. "gpt-4-t0.0--gpt-4-t0.0" -> "gpt-4"), keeping first-seen order.
    - Rename the columns: the first four become 'Model', 'Clemscore',
      '% Played' and 'Quality Score'; every later "game, metric" header
      becomes "Game metric".

    Args:
        df: Unprocessed dataframe with the model names in the first column and
            at least four columns. It is modified in place.

    Returns:
        df: The same dataframe object, processed.
    """
    # Convert column values to float, apart from the model names column
    for col in df.columns[1:]:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Remove repetition in model names.
    model_col = df.columns[0]
    # regex=False: "-t0.0" is a literal marker; as a regex the dots would
    # match any character and could eat parts of unrelated names.
    df[model_col] = df[model_col].str.replace('-t0.0', '', regex=False)
    # dict.fromkeys de-duplicates while preserving order; a set would make the
    # order of distinct segments vary between runs.
    df[model_col] = df[model_col].apply(
        lambda name: '--'.join(dict.fromkeys(name.split('--')))
    )

    # Update column names
    custom_column_names = ['Model', 'Clemscore', '% Played', 'Quality Score']
    for col in df.columns[4:]:  # Start capitalizing from the 5th column
        parts = [part.strip() for part in col.split(',')]
        if len(parts) < 2:
            # No "game, metric" structure: keep the header as it is.
            custom_column_names.append(parts[0])
        else:
            custom_column_names.append(f"{parts[0].capitalize()} {parts[1]}")

    # Rename columns
    df.columns = custom_column_names
    return df
def query_search(df: pd.DataFrame, query: str) -> pd.DataFrame:
    """
    Filter the dataframe based on the search query.

    Matching is a case-insensitive, literal substring search on the 'Model'
    column. The query text is never interpreted as a regular expression, so
    characters such as "(", "+" or "." are matched as typed. A row is kept if
    its model name contains at least one of the queries.

    Args:
        df (pd.DataFrame): Unfiltered dataframe with a 'Model' column.
        query (str): A string of queries separated by ";".

    Returns:
        pd.DataFrame: Filtered dataframe containing searched queries in the
        'Model' column. The input dataframe is returned unchanged when the
        query is empty or contains only separators/whitespace.
    """
    if not query or not query.strip():  # Reset Dataframe if empty query is passed
        return df

    # Normalize and split queries, dropping empty pieces such as in "a;;b"
    queries = [q.strip().lower() for q in query.split(';') if q.strip()]
    if not queries:  # e.g. query was just ";"
        return df

    model_names = df['Model'].str.lower()

    # OR together one literal-substring mask per query. na=False makes rows
    # with a missing model name count as "no match".
    mask = model_names.str.contains(queries[0], regex=False, na=False)
    for term in queries[1:]:
        mask = mask | model_names.str.contains(term, regex=False, na=False)

    return df[mask]
if __name__ == '__main__':
    # Manual smoke test: fetch the leaderboard data and show the version info.
    data = get_github_data()
    # get_github_data returns a non-dict value when the benchmark index could
    # not be fetched; subscripting that would fail with an opaque TypeError.
    if not isinstance(data, dict):
        raise SystemExit("Could not load leaderboard data from GitHub.")
    print(data['text']['version_data'])
    print(data['multimodal']['version_data'])
|