Spaces:
Running
Running
import webdataset as wds | |
from pathlib import Path | |
import json | |
import numpy as np | |
from PIL import Image | |
def main(
    src_json,
    dest_folder,
    num_samples_per_tar=10000,
    number_of_jobs=10,
    job_offset=0,
):
    """Write one job's share of a JSON-described image set as WebDataset shards.

    The JSON file must contain an ``"images"`` list whose entries provide at
    least ``id``, ``file_name``, ``latitude`` and ``longitude``. When an
    ``"annotations"`` list is present, each image is left-joined to it on
    ``id`` to obtain ``category_id``, which is then left-joined to the
    ``"categories"`` list to attach the taxonomy columns. Rows with a missing
    latitude or longitude are dropped.

    The remaining rows are split into shards of ``num_samples_per_tar``
    samples. Every job except the last handles
    ``num_total_tar // number_of_jobs`` consecutive shards; the last job
    (``job_offset == number_of_jobs - 1``) takes everything that is left.
    Images are read from ``<json dir>/images/<file_name>`` and embeddings from
    ``<json dir>/embeddings/dinov2/<file_name with .jpg -> .npy>``.

    Args:
        src_json: Path to the source JSON metadata file.
        dest_folder: Existing folder that receives the ``%04d.tar`` shards.
        num_samples_per_tar: Maximum number of samples per shard.
        number_of_jobs: Total number of jobs the work is split across.
        job_offset: Zero-based index of the job to run.

    Raises:
        ValueError: If ``num_samples_per_tar`` or ``number_of_jobs`` is not
            positive, or ``job_offset`` is outside ``[0, number_of_jobs)``.
    """
    # Reject inconsistent job parameters up front: a zero job count would
    # divide by zero, and an out-of-range offset would select rows (and shard
    # numbers) that overlap the ones written by the last job.
    if num_samples_per_tar < 1:
        raise ValueError(
            f"num_samples_per_tar must be >= 1, got {num_samples_per_tar}"
        )
    if number_of_jobs < 1:
        raise ValueError(f"number_of_jobs must be >= 1, got {number_of_jobs}")
    if not 0 <= job_offset < number_of_jobs:
        raise ValueError(
            f"job_offset must be in [0, {number_of_jobs}), got {job_offset}"
        )

    import pandas as pd

    with open(src_json, "r", encoding="utf-8") as f:
        data = json.load(f)
    root_path = Path(src_json).parent

    # Convert images list to pandas dataframe
    data_df = pd.DataFrame(data["images"])
    if "annotations" in data:
        annotations_df = pd.DataFrame(data["annotations"])
        # Join the dataframes on id to get category_id from annotations
        data_df = data_df.merge(
            annotations_df[["id", "category_id"]],
            on="id",
            how="left",
        )
        categories_df = pd.DataFrame(data["categories"])
        taxonomy_columns = [
            "id",
            "name",
            "common_name",
            "supercategory",
            "kingdom",
            "phylum",
            "class",
            "order",
            "family",
            "genus",
            "specific_epithet",
        ]
        data_df = data_df.merge(
            categories_df[taxonomy_columns],
            left_on="category_id",
            right_on="id",
            how="left",
        )
        # Both sides carry an "id" column, so pandas suffixes them: keep the
        # image id under its original name and drop the duplicate category id
        # (it is already available as "category_id").
        data_df = data_df.rename(columns={"id_x": "id"}).drop(columns="id_y")

    data_df = data_df[data_df["latitude"].notna() & data_df["longitude"].notna()]

    num_samples = len(data_df)
    # Ceiling division: a trailing partial shard still counts as one tar.
    num_total_tar = (num_samples + num_samples_per_tar - 1) // num_samples_per_tar
    number_of_tar_per_job = num_total_tar // number_of_jobs
    start_shard = number_of_tar_per_job * job_offset
    start_row = start_shard * num_samples_per_tar
    if job_offset == number_of_jobs - 1:
        # The last job also absorbs the shards left over by the floor division.
        stop_row = None
    else:
        stop_row = start_row + number_of_tar_per_job * num_samples_per_tar
    data_df = data_df.iloc[start_row:stop_row]

    print(f"Processing job {job_offset} with {len(data_df)} / {num_samples} samples")
    print(f"Number of tar: {number_of_tar_per_job} / {num_total_tar}")
    print(f"Start shard: {start_shard}")

    if data_df.empty:
        # Opening the writer creates its first shard file right away. A job
        # with nothing to write must not do that, otherwise it would create or
        # truncate a shard (typically 0000.tar) that belongs to another job.
        print(f"No samples assigned to job {job_offset}, nothing to write")
        return

    images_dir = root_path / "images"
    embeddings_dir = root_path / "embeddings" / "dinov2"
    with wds.ShardWriter(
        str(Path(dest_folder) / "%04d.tar"),
        maxcount=num_samples_per_tar,
        start_shard=start_shard,
    ) as sink:
        for _, row in data_df.iterrows():
            file_name = row["file_name"]
            dinov2_embedding_path = embeddings_dir / file_name.replace(".jpg", ".npy")
            # convert() returns an independent image, so the source file can
            # be closed deterministically before the sample is written.
            with Image.open(images_dir / file_name) as source_image:
                rgb_image = source_image.convert("RGB")
            sample = {
                "__key__": str(row["id"]),
                "jpg": rgb_image,
                "dinov2_vitl14_registers.npy": np.load(dinov2_embedding_path),
                "json": row.to_dict(),
            }
            sink.write(sample)
if __name__ == "__main__":
    # Command-line entry point: parse the job description, make sure the
    # destination folder exists, then write this job's shards.
    import argparse

    parser = argparse.ArgumentParser()
    # Both paths are mandatory; without them the script would otherwise fail
    # later with an unrelated TypeError instead of a usage message.
    parser.add_argument(
        "--src_json",
        help="path to the source JSON metadata file",
        required=True,
    )
    parser.add_argument(
        "--dest",
        help="path to the destination folder for the tar shards",
        required=True,
    )
    parser.add_argument(
        "--num_samples_per_tar",
        help="number of samples per tar",
        type=int,
        default=10000,
    )
    parser.add_argument("--number_of_jobs", help="number of jobs", type=int, default=10)
    parser.add_argument("--job_offset", help="job offset", type=int, default=0)
    args = parser.parse_args()

    # The shard writer expects the destination folder to exist already.
    Path(args.dest).mkdir(exist_ok=True, parents=True)
    main(
        src_json=args.src_json,
        dest_folder=args.dest,
        num_samples_per_tar=args.num_samples_per_tar,
        number_of_jobs=args.number_of_jobs,
        job_offset=args.job_offset,
    )