Spaces:
Sleeping
Sleeping
File size: 1,591 Bytes
dc70c7b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
import requests
import duckdb
# Base URL of the Hugging Face dataset viewer API; note that it ends with "/".
DATASET_VIEWER_API_URL = "https://datasets-server.huggingface.co/"
# Module-level HTTP session used by fetch_json to issue its GET requests.
session = requests.Session()
def fetch_json(url, params=None, timeout=20):
    """Fetch ``url`` through the module-level session and return its JSON body.

    Args:
        url: Endpoint to query.
        params: Optional query-string parameters forwarded to ``session.get``.
        timeout: Timeout forwarded to ``session.get`` (defaults to 20).

    Returns:
        The decoded JSON payload of the response.

    Raises:
        requests.HTTPError: Propagated from ``raise_for_status`` when the
            response carries an HTTP error status.
        Exception: If the payload is a JSON object containing an ``"error"``
            key; the message embeds that error value.
    """
    response = session.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    data = response.json()
    # Only a JSON object can carry an "error" field. Without the type check a
    # list or string payload that merely contains "error" would reach
    # data['error'] and fail with an unrelated TypeError.
    if isinstance(data, dict) and "error" in data:
        raise Exception(f"Error fetching data: {data['error']}")
    return data
def get_split_rows(dataset, config, split):
    """Return the row count of ``split`` as reported by the ``/size`` endpoint.

    Args:
        dataset: Dataset identifier sent as the ``dataset`` query parameter.
        config: Configuration name sent as the ``config`` query parameter.
        split: Name of the split to look up in the response.

    Returns:
        The ``num_rows`` value of the matching split entry.

    Raises:
        Exception: If no entry in ``size.splits`` matches ``split``, or if
            ``fetch_json`` reports an error.
    """
    # The base URL may end with "/"; strip it so the request goes to
    # ".../size" rather than "...//size".
    url = f"{DATASET_VIEWER_API_URL.rstrip('/')}/size"
    params = {"dataset": dataset, "config": config}
    config_size = fetch_json(url, params)
    # The endpoint is queried per config, so pick the requested split out of
    # the list of splits it returns.
    split_size = next(
        (s for s in config_size["size"]["splits"] if s["split"] == split), None
    )
    if split_size is None:
        raise Exception(f"Error fetching split {split} in config {config}")
    return split_size["num_rows"]
def get_parquet_urls(dataset, config, split):
    """Return the parquet file URLs of a split as a SQL-ready list body.

    Args:
        dataset: Dataset identifier sent as the ``dataset`` query parameter.
        config: Configuration name sent as the ``config`` query parameter.
        split: Split name sent as the ``split`` query parameter.

    Returns:
        A single string of comma-separated, single-quoted URLs
        (``'url1','url2'``), suitable for placing between the brackets of a
        SQL list literal.

    Raises:
        Exception: If ``fetch_json`` reports an error.
    """
    # The base URL may end with "/"; strip it so the request goes to
    # ".../parquet" rather than "...//parquet".
    url = f"{DATASET_VIEWER_API_URL.rstrip('/')}/parquet"
    params = {"dataset": dataset, "config": config, "split": split}
    parquet_files = fetch_json(url, params)
    parquet_urls = [file["url"] for file in parquet_files["parquet_files"]]
    # Each URL becomes a SQL string literal, so double any embedded single
    # quote to keep the literal well-formed.
    return ",".join("'" + u.replace("'", "''") + "'" for u in parquet_urls)
def get_docs_from_parquet(parquet_urls, column, offset, limit):
    """Read one column slice from a set of parquet files via DuckDB.

    Args:
        parquet_urls: Comma-separated, already single-quoted URLs that are
            placed verbatim inside the ``read_parquet([...])`` list.
        column: Name of the column to select.
        offset: Number of rows to skip.
        limit: Maximum number of rows to return.

    Returns:
        A list with the values of ``column`` for the selected rows.
    """
    # Quote the identifier (doubling embedded double quotes) so that column
    # names with spaces, dashes or reserved words work and cannot inject SQL.
    quoted_column = '"' + str(column).replace('"', '""') + '"'
    # Coerce the paging values to int so only numeric literals reach the query.
    sql_query = (
        f"SELECT {quoted_column} FROM read_parquet([{parquet_urls}]) "
        f"LIMIT {int(limit)} OFFSET {int(offset)};"
    )
    df = duckdb.sql(sql_query).to_df()
    return df[column].tolist()
def get_info(dataset):
    """Return the ``dataset_info`` section of the ``/info`` endpoint response.

    Args:
        dataset: Dataset identifier sent as the ``dataset`` query parameter.

    Returns:
        The value stored under the ``"dataset_info"`` key of the response.

    Raises:
        Exception: If ``fetch_json`` reports an error.
    """
    # The base URL may end with "/"; strip it so the request goes to
    # ".../info" rather than "...//info".
    url = f"{DATASET_VIEWER_API_URL.rstrip('/')}/info"
    params = {"dataset": dataset}
    info_resp = fetch_json(url, params)
    return info_resp["dataset_info"]
|