Plonk/data/to_webdataset/process_yfcc_metadata.py
nicolas-dufour's picture
squash: merge all unpushed commits
c4c7cee
raw
history blame
2.75 kB
import dask
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
# Build the geotagged YFCC100M train / test metadata tables.
#
# Steps:
#   1. Read the tab-separated metadata dump, keeping only the columns that are
#      written out.
#   2. Keep rows that have both coordinates and media_type == 0.
#   3. Left-join the per-photo hash table on photo_id.
#   4. Rows whose photo_id is listed in yfcc_4k_ids.txt form the test split
#      (one CSV); the remaining rows with accuracy >= 12 form the train split
#      (one CSV per partition).

# Directory holding both the inputs and the outputs of this script.
DATA_DIR = "../datasets/YFCC100M"

# Column layout of `yfcc100m_dataset`. Names are supplied here, so the file is
# read as having no header row.
RAW_COLUMNS = [
    "photo_id",
    "user_nsid",
    "user_nickname",
    "date_taken",
    "date_uploaded",
    "capture_device",
    "title",
    "description",
    "user_tags",
    "machine_tags",
    "longitude",
    "latitude",
    "accuracy",
    "page_url",
    "download_url",
    "license_name",
    "license_url",
    "server_id",
    "farm_id",
    "secret",
    "secret_original",
    "extension",
    "media_type",
]

# Columns carried through the pipeline, in the order they are written out.
KEPT_COLUMNS = [
    "photo_id",
    "longitude",
    "latitude",
    "accuracy",
    "extension",
    "download_url",
    "media_type",
]

# Explicit dtypes for the columns that are read. `download_url` is left to
# dask's dtype inference, as before.
KEPT_DTYPES = {
    "photo_id": str,
    "longitude": float,
    "latitude": float,
    "accuracy": float,
    "extension": str,
    "media_type": float,
}

# Target number of rows per train CSV shard.
TRAIN_ROWS_PER_FILE = 100000

# Everything runs inside the progress bar context so that each dask
# computation below (the row count and both CSV writes) reports progress.
with ProgressBar():
    ddf = dd.read_csv(
        f"{DATA_DIR}/yfcc100m_dataset",
        names=RAW_COLUMNS,
        # Parse only the columns we keep instead of all 23.
        usecols=KEPT_COLUMNS,
        dtype=KEPT_DTYPES,
        sep="\t",
    )
    # `usecols` returns columns in file order; re-select to pin output order.
    ddf = ddf[KEPT_COLUMNS]

    # Keep geotagged rows with media_type == 0.
    # NOTE(review): presumably 0 marks photos (vs. videos) - confirm against
    # the dataset documentation.
    filtered_ddf = ddf[
        ddf["longitude"].notnull()
        & ddf["latitude"].notnull()
        & (ddf["media_type"] == 0)
    ]
    # NOTE(review): the previous `del ddf["media_type"]` here only modified
    # `ddf`, which is not used again, so `media_type` has always been written
    # to the output files. The output schema is kept unchanged; drop the column
    # from `filtered_ddf` explicitly if it is not wanted downstream.

    hash_ddf = dd.read_csv(
        f"{DATA_DIR}/yfcc100m_hash",
        names=["photo_id", "hash"],
        dtype={"photo_id": str, "hash": str},
        sep="\t",
    )
    # Left join: rows without a matching hash are kept with a missing `hash`.
    filtered_ddf = filtered_ddf.merge(hash_ddf, on="photo_id", how="left")

    # Read the 4k photo IDs (one ID per line).
    with open(f"{DATA_DIR}/yfcc_4k_ids.txt", "r", encoding="utf-8") as f:
        test_photo_ids = set(f.read().splitlines())

    # Split the dataframe based on whether photo_id is in the test set.
    is_test = filtered_ddf["photo_id"].isin(test_photo_ids)
    test_ddf = filtered_ddf[is_test]
    train_ddf = filtered_ddf[~is_test]
    # The accuracy threshold applies to the train split only.
    # NOTE(review): presumably a geotag accuracy level - confirm the scale.
    train_ddf = train_ddf[train_ddf["accuracy"] >= 12]

    # Save the split dataframes.
    test_ddf.to_csv(
        f"{DATA_DIR}/yfcc_4k_dataset_with_gps.csv",
        sep="\t",
        index=False,
        single_file=True,
    )

    # `len` triggers a full pass over the pipeline to count the train rows;
    # the count sets the number of output shards.
    n_train_rows = len(train_ddf)
    train_ddf = train_ddf.repartition(
        npartitions=n_train_rows // TRAIN_ROWS_PER_FILE + 1
    )
    train_ddf.to_csv(
        f"{DATA_DIR}/yfcc100m_dataset_with_gps_train/*.csv",
        sep="\t",
        index=False,
        single_file=False,
    )