hf-public-data-insights

Sleeping

File size: 3,987 Bytes

import os
import requests
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import duckdb
import random
import argparse
import yaml

# Make sure the "public" output folder is present before anything is written into it
os.makedirs("public", exist_ok=True)

# Download URLs for the models, datasets and spaces Parquet files of the
# cfahlgren1/hub-stats dataset; all three share the same URL template.
urls = [
    f"https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/{name}.parquet?download=true"
    for name in ("models", "datasets", "spaces")
]

def download_file(url, overwrite=True):
    """Download *url* into the "public" folder, showing a progress bar.

    The local file name is the last path segment of the URL with any query
    string removed.

    Args:
        url: HTTP(S) URL of the file to fetch.
        overwrite: When False and the target file already exists, the
            download is skipped.

    Returns:
        None.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    filename = os.path.join("public", url.split("/")[-1].split("?")[0])

    if not overwrite and os.path.exists(filename):
        print(f"File already exists: {filename}. Skipping download.")
        return

    block_size = 1024 * 1024  # 1 MiB; 1 KB chunks cause needless per-chunk overhead
    # Stream into a sibling ".part" file so an interrupted download never
    # leaves a truncated file under the final name (which a later run with
    # overwrite=False would otherwise accept as complete).
    partial_name = f"{filename}.part"

    try:
        # (connect timeout, read timeout) in seconds: without one a stalled
        # connection would block this worker forever.
        with requests.get(url, stream=True, timeout=(10, 60)) as response:
            # Fail loudly instead of saving an HTML error page as a .parquet file.
            response.raise_for_status()
            total_size = int(response.headers.get("Content-Length", 0))

            with open(partial_name, "wb") as file, tqdm(
                desc=filename,
                total=total_size,
                unit="iB",
                unit_scale=True,
                unit_divisor=1024,
                bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]"
            ) as progress_bar:
                for data in response.iter_content(block_size):
                    size = file.write(data)
                    progress_bar.update(size)

        # Only a fully written file is moved into place.
        os.replace(partial_name, filename)
    finally:
        # After a successful os.replace the partial file is gone; this only
        # removes leftovers from a failed or interrupted download.
        if os.path.exists(partial_name):
            os.remove(partial_name)

    print(f"Downloaded: {filename}")

def _to_yaml_value(value):
    """Convert a query result value into something ``yaml.safe_dump`` accepts."""
    plain_scalars = (str, bool, int, float)
    if value is None or type(value) in plain_scalars:
        return value
    if isinstance(value, dict):
        return {
            (key if type(key) in plain_scalars else str(key)): _to_yaml_value(item)
            for key, item in value.items()
        }
    if isinstance(value, (list, tuple)):
        return [_to_yaml_value(item) for item in value]
    # Every other type is rendered through str(), which is what plain
    # f-string interpolation would produce for it.
    return str(value)


def _inspect_parquet(filename, table_name, sample_size=10):
    """Return ``(table_structure, random_items)`` for one Parquet file.

    ``table_structure`` is the row list produced by DuckDB's DESCRIBE and
    ``random_items`` holds up to *sample_size* randomly ordered rows.
    """
    con = duckdb.connect(database=':memory:')
    try:
        con.execute(f"CREATE VIEW {table_name} AS SELECT * FROM parquet_scan('{filename}')")
        table_structure = con.execute(f"DESCRIBE {table_name}").fetchall()
        random_items = con.execute(
            f"SELECT * FROM {table_name} ORDER BY RANDOM() LIMIT {int(sample_size)}"
        ).fetchall()
    finally:
        # One connection is opened per file, so release it as soon as the
        # results have been fetched.
        con.close()
    return table_structure, random_items


def main(overwrite):
    """Download every file in ``urls`` and write a YAML example file for each.

    For each downloaded Parquet file a ``public/<table>.example.yaml`` file is
    written containing the column names/types and 10 random rows.

    Args:
        overwrite: Passed through to ``download_file``; when False, files
            that already exist are not downloaded again.

    Raises:
        Any exception raised by a download task is re-raised here via
        ``future.result()``.
    """
    # One worker per file; max(1, ...) because the executor rejects 0 workers.
    with ThreadPoolExecutor(max_workers=max(1, len(urls))) as executor:
        futures = [executor.submit(download_file, url, overwrite) for url in urls]

        # Wait for all tasks to complete (and surface their exceptions)
        for future in futures:
            future.result()

    print("All files downloaded successfully.")

    # Process each downloaded Parquet file
    for url in urls:
        filename = os.path.join("public", url.split("/")[-1].split("?")[0])
        table_name = os.path.splitext(os.path.basename(filename))[0]

        table_structure, random_items = _inspect_parquet(filename, table_name)
        columns = [row[0] for row in table_structure]

        document = {
            table_name: {
                "table_structure": [
                    # DESCRIBE rows carry more fields; only name and type are kept.
                    {"column": row[0], "type": str(row[1])}
                    for row in table_structure
                ],
                "random_items": [
                    {column: _to_yaml_value(value) for column, value in zip(columns, item)}
                    for item in random_items
                ],
            }
        }

        # Save the YAML content to a file in the "public" folder. The YAML
        # emitter handles quoting/escaping, so values containing ':' '#' or
        # newlines no longer produce an invalid document.
        yaml_file = os.path.join("public", f"{table_name}.example.yaml")
        with open(yaml_file, "w", encoding="utf-8") as file:
            yaml.safe_dump(
                document,
                file,
                sort_keys=False,  # keep the column order of the table
                allow_unicode=True,
                default_flow_style=False,
            )

        print(f"Generated: {yaml_file}")

    print("Example files generated successfully.")

if __name__ == "__main__":
    # Command-line entry point: downloads overwrite existing files unless
    # --no-overwrite is given.
    cli = argparse.ArgumentParser(description="Download and process Parquet files.")
    cli.add_argument(
        "--no-overwrite",
        action="store_true",
        help="Skip downloading files that already exist.",
    )
    options = cli.parse_args()

    should_overwrite = not options.no_overwrite
    main(overwrite=should_overwrite)