Plonk / scripts /download-dataset.py
nicolas-dufour's picture
squash: merge all unpushed commits
c4c7cee
raw
history blame
868 Bytes
import os, zipfile
from huggingface_hub import snapshot_download
# Datasets live under "<current working directory>/datasets".
base_dir = os.path.join(os.getcwd(), 'datasets')

# Target directory that receives the osv5m dataset snapshot.
dataset_dir = os.path.join(base_dir, "osv5m")


def extract_zip_archives(directory):
    """Extract every ``.zip`` file found under *directory*, then delete it.

    Each archive is unpacked into the folder that contains it. The archive
    is removed only after extraction has finished and its file handle has
    been closed; if extraction raises, the archive is left on disk.

    Args:
        directory: Root of the tree to scan with ``os.walk``.

    Returns:
        The number of archives that were extracted and removed.
    """
    extracted = 0
    for root, _dirs, files in os.walk(directory):
        for file_name in files:
            if not file_name.endswith(".zip"):
                continue
            archive_path = os.path.join(root, file_name)
            with zipfile.ZipFile(archive_path, 'r') as zip_ref:
                zip_ref.extractall(root)
            # Delete after the ``with`` block so the handle is closed first;
            # removing a file that is still open fails on Windows.
            os.remove(archive_path)
            extracted += 1
    return extracted


def main():
    """Download the osv5m dataset into ``dataset_dir`` and unpack its archives."""
    # makedirs creates the parent "datasets" directory as well, and
    # exist_ok=True avoids the check-then-create race of
    # ``os.path.exists`` followed by ``os.mkdir``.
    os.makedirs(dataset_dir, exist_ok=True)

    # Download the dataset
    snapshot_download(repo_id="osv5m/osv5m", local_dir=dataset_dir, repo_type='dataset')

    # Extract zip files and remove them after extraction
    extract_zip_archives(dataset_dir)


if __name__ == "__main__":
    main()