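# NOTE(review): "Spaces: Running / Running" appears to be hosting-page status
# text captured together with this file; it is not part of the script.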
"""Build GPS-tagged YFCC100M train/test CSV splits with Dask.

Pipeline, in order:

1. Read the tab-separated metadata dump (no header row; column names are
   supplied below) and keep only the columns that are written out.
2. Keep rows that have both a longitude and a latitude and whose
   ``media_type`` equals 0.
3. Left-join the per-photo hash on ``photo_id``.
4. Rows whose ``photo_id`` is listed in ``yfcc_4k_ids.txt`` go to a single
   test CSV; the remaining rows with ``accuracy >= 12`` go to a directory of
   CSV shards.
"""

import dask
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

# Input / output locations (relative to the current working directory).
METADATA_PATH = "../datasets/YFCC100M/yfcc100m_dataset"
HASH_PATH = "../datasets/YFCC100M/yfcc100m_hash"
TEST_IDS_PATH = "../datasets/YFCC100M/yfcc_4k_ids.txt"
TEST_OUTPUT_PATH = "../datasets/YFCC100M/yfcc_4k_dataset_with_gps.csv"
TRAIN_OUTPUT_GLOB = "../datasets/YFCC100M/yfcc100m_dataset_with_gps_train/*.csv"

# Every column of the metadata file, in file order (the file has no header).
METADATA_COLUMNS = [
    "photo_id",
    "user_nsid",
    "user_nickname",
    "date_taken",
    "date_uploaded",
    "capture_device",
    "title",
    "description",
    "user_tags",
    "machine_tags",
    "longitude",
    "latitude",
    "accuracy",
    "page_url",
    "download_url",
    "license_name",
    "license_url",
    "server_id",
    "farm_id",
    "secret",
    "secret_original",
    "extension",
    "media_type",
]

# Columns that survive into the output, in output order.
KEPT_COLUMNS = [
    "photo_id",
    "longitude",
    "latitude",
    "accuracy",
    "extension",
    "download_url",
    "media_type",
]

# Explicit dtypes for the kept columns. "download_url" is deliberately left
# to dtype inference, as it was before.
KEPT_DTYPES = {
    "photo_id": str,
    "longitude": float,
    "latitude": float,
    "accuracy": float,
    "extension": str,
    "media_type": float,
}

# Training rows must have at least this "accuracy" value.
# NOTE(review): presumably the geotag accuracy level of the source data --
# confirm the scale before changing it.
MIN_TRAIN_ACCURACY = 12

# Row count used to derive the number of training partitions / CSV shards.
ROWS_PER_TRAIN_SHARD = 100000

with ProgressBar():
    # Only parse the columns that are actually kept. This avoids reading and
    # dtype-inferring the 16 discarded columns on every partition.
    ddf = dd.read_csv(
        METADATA_PATH,
        names=METADATA_COLUMNS,
        usecols=KEPT_COLUMNS,
        dtype=KEPT_DTYPES,
        sep="\t",
    )
    # `usecols` yields columns in file order; re-select so the output column
    # order stays exactly as listed in KEPT_COLUMNS.
    ddf = ddf[KEPT_COLUMNS]

    # NOTE(review): presumably media_type 0 marks photos -- confirm against
    # the dataset documentation.
    filtered_ddf = ddf[
        ddf["longitude"].notnull()
        & ddf["latitude"].notnull()
        & (ddf["media_type"] == 0)
    ]
    # NOTE(review): the script used to run `del ddf["media_type"]` here. That
    # only altered `ddf`, which is not used again; `filtered_ddf` was already
    # derived and kept the column, so "media_type" is still written to both
    # outputs. The dead statement was removed without changing the output
    # schema. If the column should be dropped, do it on `filtered_ddf`.

    hash_ddf = dd.read_csv(
        HASH_PATH,
        names=["photo_id", "hash"],
        dtype={"photo_id": str, "hash": str},
        sep="\t",
    )
    # Left join: photos without a hash entry are kept with a missing "hash".
    filtered_ddf = filtered_ddf.merge(hash_ddf, on="photo_id", how="left")

    # Read the 4k photo IDs (one ID per line).
    with open(TEST_IDS_PATH, "r", encoding="utf-8") as f:
        test_photo_ids = set(f.read().splitlines())

    # Split the dataframe based on whether photo_id is in the test set.
    is_test = filtered_ddf["photo_id"].isin(test_photo_ids)
    test_ddf = filtered_ddf[is_test]
    train_ddf = filtered_ddf[~is_test]
    # The accuracy threshold is applied to the training split only.
    train_ddf = train_ddf[train_ddf["accuracy"] >= MIN_TRAIN_ACCURACY]

    # Save the split dataframes.
    test_ddf.to_csv(
        TEST_OUTPUT_PATH,
        sep="\t",
        index=False,
        single_file=True,
    )

    # len() executes the whole pipeline once just to count rows, and each
    # to_csv() call executes it again.
    train_row_count = len(train_ddf)
    train_ddf = train_ddf.repartition(
        npartitions=train_row_count // ROWS_PER_TRAIN_SHARD + 1
    )
    train_ddf.to_csv(
        TRAIN_OUTPUT_GLOB,
        sep="\t",
        index=False,
        single_file=False,
    )