import fnmatch
import json
import os
import re  # referenced only by the commented-out legacy download_file below

from urllib.parse import quote

import pandas as pd
import requests


class DetailsDataProcessor:
    """Finds local results files and downloads the matching details files from
    the open-llm-leaderboard/details dataset on the Hugging Face Hub.

    Example details URL:
    https://huggingface.co/datasets/open-llm-leaderboard/details/resolve/main/64bits/LexPodLM-13B/details_harness%7ChendrycksTest-moral_scenarios%7C5_2023-07-25T13%3A41%3A51.227672.json
    """

    def __init__(self, directory='results', pattern='results*.json'):
        self.directory = directory
        self.pattern = pattern

    def _find_files(self, directory='results', pattern='results*.json'):
        # Walk `directory` recursively and collect every filename that matches
        # the glob-style `pattern`.
        matching_files = []
        for root, _dirs, files in os.walk(directory):
            for basename in files:
                if fnmatch.fnmatch(basename, pattern):
                    matching_files.append(os.path.join(root, basename))
        return matching_files
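
    # With the default settings, this returns paths such as
    # results/64bits/LexPodLM-13B/results_2023-07-25T13:41:51.227672.json
    # (illustrative, derived from the example URL above, not an actual listing).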

    # Legacy variant, kept for reference: it built the filename by sanitizing
    # the whole URL tail rather than reading individual path segments.
    # @staticmethod
    # def download_file(url, directory='details_data'):
# # Define the prefix to be removed from the URL
# url_prefix = "https://huggingface.co/datasets/open-llm-leaderboard/details/resolve/main/"
# # Remove the prefix from the URL
# file_name_part = url.replace(url_prefix, '')
# # Replace characters that don't play nice with file systems
# safe_file_name = re.sub(r'[<>:"/\\|?*]', '_', file_name_part) # Replace with '_'
# save_file_path = os.path.join(directory, safe_file_name)
# error_count = 0
# success_count = 0
# try:
# # Sending a GET request
# r = requests.get(url, allow_redirects=True)
# r.raise_for_status()
# # Writing the content to the specified file
# with open(save_file_path, 'wb') as file:
# file.write(r.content)
# success_count += 1
# except requests.ConnectionError as e:
# error_count += 1
# except requests.HTTPError as e:
# error_count += 1
# except FileNotFoundError as e:
# error_count += 1
# except Exception as e:
# error_count += 1
# return error_count, success_count

    @staticmethod
    def download_file(url, directory='details_data'):
        # Build a filesystem-safe filename from the URL's path segments:
        # .../{organization}/{model_name}/{details file}
        segments = url.split('/')
        organization = segments[-3]
        model_name = segments[-2]
        # For details URLs the last segment begins with the literal prefix
        # 'details', so this picks up 'details' rather than a task name; the
        # filename is therefore unique per organization/model only.
        task = segments[-1].split('_')[0]
        safe_file_name = f"{organization}_{model_name}_{task}.json"
        save_file_path = os.path.join(directory, safe_file_name)
        os.makedirs(directory, exist_ok=True)  # the target directory may not exist yet
        error_count = 0
        success_count = 0
        try:
            r = requests.get(url, allow_redirects=True)
            r.raise_for_status()
            with open(save_file_path, 'wb') as file:
                file.write(r.content)
            print(save_file_path)
            success_count += 1
        except (requests.RequestException, OSError):
            # RequestException covers ConnectionError and HTTPError;
            # OSError covers filesystem errors such as FileNotFoundError.
            error_count += 1
        # The save path is returned as well so callers know where the file landed.
        return error_count, success_count, save_file_path
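
    # Illustrative outcome for the example URL in the class docstring: the
    # details file would be written to details_data/64bits_LexPodLM-13B_details.json.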

    @staticmethod
    def single_file_pipeline(url, directory='details_data'):
        # download_file picks the filename itself, so read the file back from
        # the path it returns.
        _, _, file_path = DetailsDataProcessor.download_file(url, directory)
        with open(file_path) as f:
            data = json.load(f)
        # Convert the parsed JSON to a dataframe.
        return pd.DataFrame(data)

    @staticmethod
    def build_url(file_path):
        # Expects paths shaped like results/<bits>/<model_name>/results_<timestamp>.json;
        # the '.json' suffix travels with the timestamp and completes the URL.
        segments = file_path.split('/')
        bits = segments[1]
        model_name = segments[2]
        try:
            timestamp = segments[3].split('_')[1]
        except IndexError:
            print(f"Error: unexpected file path format: {file_path}")
            return None
        # The task is currently fixed to hendrycksTest-moral_scenarios.
        url = f'https://huggingface.co/datasets/open-llm-leaderboard/details/resolve/main/{bits}/{model_name}/details_harness%7ChendrycksTest-moral_scenarios%7C5_{quote(timestamp, safe="")}'
        return url
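
    # Worked example (the path is illustrative, derived from the example URL above):
    #   results/64bits/LexPodLM-13B/results_2023-07-25T13:41:51.227672.json
    #   -> https://huggingface.co/datasets/open-llm-leaderboard/details/resolve/main/64bits/LexPodLM-13B/details_harness%7ChendrycksTest-moral_scenarios%7C5_2023-07-25T13%3A41%3A51.227672.json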

    def pipeline(self):
        dataframes = []
        file_paths = self._find_files(self.directory, self.pattern)
        for file_path in file_paths:
            print(file_path)
            url = self.build_url(file_path)
            if url is None:  # skip paths build_url could not parse
                continue
            df = self.single_file_pipeline(url)
            dataframes.append(df)
        return dataframes
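

if __name__ == "__main__":
    # Minimal usage sketch, assuming a local `results/` tree shaped like
    # results/<bits>/<model_name>/results_<timestamp>.json (the layout that
    # build_url expects). Adjust directory/pattern for other checkouts.
    processor = DetailsDataProcessor(directory='results', pattern='results*.json')
    dfs = processor.pipeline()
    print(f"Downloaded and parsed {len(dfs)} details files")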