import requests
import duckdb

# No trailing slash: endpoint paths ("/size", "/parquet", "/info") are appended below.
DATASET_VIEWER_API_URL = "https://datasets-server.huggingface.co"
session = requests.Session()


def fetch_json(url, params=None, timeout=20):
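    """GET a JSON payload from the dataset viewer API, raising on HTTP or API-reported errors."""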
    response = session.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    data = response.json()
    if "error" in data:
        raise Exception(f"Error fetching data: {data['error']}")
    return data


def get_split_rows(dataset, config, split):
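    """Return the number of rows in `split`, as reported by the /size endpoint."""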
    url = f"{DATASET_VIEWER_API_URL}/size"
    params = {"dataset": dataset, "config": config}
    config_size = fetch_json(url, params)

    split_size = next(
        (s for s in config_size["size"]["splits"] if s["split"] == split), None
    )
    if split_size is None:
        raise Exception(f"Error fetching split {split} in config {config}")

    return split_size["num_rows"]


def get_parquet_urls(dataset, config, split):
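    """Return the split's parquet file URLs as a quoted, comma-separated string ready for SQL interpolation."""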
    url = f"{DATASET_VIEWER_API_URL}/parquet"
    params = {"dataset": dataset, "config": config, "split": split}
    parquet_files = fetch_json(url, params)

    parquet_urls = [file["url"] for file in parquet_files["parquet_files"]]
    return ",".join(f"'{url}'" for url in parquet_urls)


def get_docs_from_parquet(parquet_urls, column, offset, limit):
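    """Read one page of `column` values from the parquet files with DuckDB.

    DuckDB streams the files over HTTP; recent DuckDB versions auto-load the
    httpfs extension when given https:// paths.
    """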
    sql_query = f"SELECT {column} FROM read_parquet([{parquet_urls}]) LIMIT {limit} OFFSET {offset};"
    df = duckdb.sql(sql_query).to_df()
    return df[column].tolist()


def get_info(dataset):
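    """Return the dataset metadata from the /info endpoint."""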
    url = f"{DATASET_VIEWER_API_URL}/info"
    params = {"dataset": dataset}
    info_resp = fetch_json(url, params)

    return info_resp["dataset_info"]
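

# Example usage: a minimal sketch, not part of the original script. The
# dataset, config, split, and column names below are illustrative assumptions;
# substitute your own values.
if __name__ == "__main__":
    dataset, config, split, column = "stanfordnlp/imdb", "plain_text", "train", "text"

    print(f"{split} split has {get_split_rows(dataset, config, split)} rows")

    # Fetch the first 10 documents; page through the split by advancing `offset`.
    parquet_urls = get_parquet_urls(dataset, config, split)
    docs = get_docs_from_parquet(parquet_urls, column, offset=0, limit=10)
    print(docs[0][:100])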