Plonk / scripts /preprocessing /preprocess.py
nicolas-dufour's picture
squash: merge all unpushed commits
c4c7cee
raw
history blame
13.2 kB
import pandas as pd
import torch
import numpy as np
from os.path import join
import matplotlib.pyplot as plt
import hydra
class QuadTree(object):
def __init__(self, data, mins=None, maxs=None, id="", depth=3, do_split=1000):
self.id = id
self.data = data
if mins is None:
mins = data[["latitude", "longitude"]].to_numpy().min(0)
if maxs is None:
maxs = data[["latitude", "longitude"]].to_numpy().max(0)
self.mins = np.asarray(mins)
self.maxs = np.asarray(maxs)
self.sizes = self.maxs - self.mins
self.children = []
mids = 0.5 * (self.mins + self.maxs)
xmin, ymin = self.mins
xmax, ymax = self.maxs
xmid, ymid = mids
if (depth > 0) and (len(self.data) >= do_split):
# split the data into four quadrants
data_q1 = data[(data["latitude"] < mids[0]) & (data["longitude"] < mids[1])]
data_q2 = data[
(data["latitude"] < mids[0]) & (data["longitude"] >= mids[1])
]
data_q3 = data[
(data["latitude"] >= mids[0]) & (data["longitude"] < mids[1])
]
data_q4 = data[
(data["latitude"] >= mids[0]) & (data["longitude"] >= mids[1])
]
# recursively build a quad tree on each quadrant which has data
if data_q1.shape[0] > 0:
self.children.append(
QuadTree(
data_q1,
[xmin, ymin],
[xmid, ymid],
id + "0",
depth - 1,
do_split=do_split,
)
)
if data_q2.shape[0] > 0:
self.children.append(
QuadTree(
data_q2,
[xmin, ymid],
[xmid, ymax],
id + "1",
depth - 1,
do_split=do_split,
)
)
if data_q3.shape[0] > 0:
self.children.append(
QuadTree(
data_q3,
[xmid, ymin],
[xmax, ymid],
id + "2",
depth - 1,
do_split=do_split,
)
)
if data_q4.shape[0] > 0:
self.children.append(
QuadTree(
data_q4,
[xmid, ymid],
[xmax, ymax],
id + "3",
depth - 1,
do_split=do_split,
)
)
def unwrap(self):
if len(self.children) == 0:
return {self.id: [self.mins, self.maxs, self.data.copy()]}
else:
d = dict()
for child in self.children:
d.update(child.unwrap())
return d
def extract(qt, name_new_column):
cluster = qt.unwrap()
boundaries, data = {}, []
id_to_quad = np.array(list(cluster.keys()))
for i, (id, vs) in zip(np.arange(len(cluster)), cluster.items()):
(min_lat, min_lon), (max_lat, max_lon), points = vs
points[name_new_column] = int(i)
data.append(points)
boundaries[i] = (
float(min_lat),
float(min_lon),
float(max_lat),
float(max_lon),
points["latitude"].mean(),
points["longitude"].mean(),
)
data = pd.concat(data)
return boundaries, data, id_to_quad
def vizu(name_new_column, df_train, boundaries, save_path):
plt.hist(df_train[name_new_column], bins=len(boundaries))
plt.xlabel("Cluster ID")
plt.ylabel("Number of images")
plt.title("Cluster distribution")
plt.yscale("log")
plt.savefig(join(save_path, f"{name_new_column}_distrib.png"))
plt.clf()
plt.scatter(
df_train["longitude"].to_numpy(),
df_train["latitude"].to_numpy(),
c=np.random.permutation(len(boundaries))[df_train[name_new_column].to_numpy()],
cmap="tab20",
s=0.1,
alpha=0.5,
)
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.title("Quadtree map")
plt.savefig(join(save_path, f"{name_new_column}_map.png"))
@hydra.main(
config_path="../../configs/scripts",
config_name="preprocess",
version_base=None,
)
def main(cfg):
data_path = join(cfg.data_dir, "osv5m")
save_path = cfg.data_dir
name_new_column = f"quadtree_{cfg.depth}_{cfg.do_split}"
# Create clusters from train images
train_fp = join(data_path, f"train.csv")
df_train = pd.read_csv(train_fp, low_memory=False)
qt = QuadTree(df_train, depth=cfg.depth, do_split=cfg.do_split)
boundaries, df_train, id_to_quad = extract(qt, name_new_column)
vizu(name_new_column, df_train, boundaries, save_path)
# Save clusters
boundaries = pd.DataFrame.from_dict(
boundaries,
orient="index",
columns=["min_lat", "min_lon", "max_lat", "max_lon", "mean_lat", "mean_lon"],
)
boundaries.to_csv(
join(save_path, f"{name_new_column}.csv"), index_label="cluster_id"
)
# Assign test images to clusters
test_fp = join(data_path, f"test.csv")
df_test = pd.read_csv(test_fp)
above_lat = np.expand_dims(df_test["latitude"].to_numpy(), -1) > np.expand_dims(
boundaries["min_lat"].to_numpy(), 0
)
below_lat = np.expand_dims(df_test["latitude"].to_numpy(), -1) < np.expand_dims(
boundaries["max_lat"].to_numpy(), 0
)
above_lon = np.expand_dims(df_test["longitude"].to_numpy(), -1) > np.expand_dims(
boundaries["min_lon"].to_numpy(), 0
)
below_lon = np.expand_dims(df_test["longitude"].to_numpy(), -1) < np.expand_dims(
boundaries["max_lon"].to_numpy(), 0
)
mask = np.logical_and(
np.logical_and(above_lat, below_lat), np.logical_and(above_lon, below_lon)
)
df_test[name_new_column] = np.argmax(mask, axis=1)
# save index_to_gps_quadtree file
lat = torch.tensor(boundaries["mean_lat"])
lon = torch.tensor(boundaries["mean_lon"])
coord = torch.stack([lat, lon], dim=-1)
torch.save(
coord, join(save_path, f"index_to_gps_quadtree_{cfg.depth}_{cfg.do_split}.pt")
)
torch.save(id_to_quad, join(save_path, f"id_to_quad_{cfg.depth}_{cfg.do_split}.pt"))
# Overwrite test.csv and train.csv
if cfg.overwrite_csv:
df_train.to_csv(train_fp, index=False)
df_test.to_csv(test_fp, index=False)
df = pd.read_csv(join(data_path, "train.csv"), low_memory=False).fillna("NaN")
# Compute the average location for each unique country
country_avg = (
df.groupby("unique_country")[["latitude", "longitude"]].mean().reset_index()
)
country_avg.to_csv(
join(save_path, "country_center.csv"),
columns=["unique_country", "latitude", "longitude"],
index=False,
)
# Compute the average location for each unique admin1 (region)
region_avg = (
df.groupby(["unique_region"])[["latitude", "longitude"]].mean().reset_index()
)
region_avg.to_csv(
join(save_path, "region_center.csv"),
columns=["unique_region", "latitude", "longitude"],
index=False,
)
# Compute the average location for each unique admin2 (area)
area_avg = (
df.groupby(["unique_sub-region"])[["latitude", "longitude"]]
.mean()
.reset_index()
)
area_avg.to_csv(
join(save_path, "sub-region_center.csv"),
columns=["unique_sub-region", "latitude", "longitude"],
index=False,
)
# Compute the average location for each unique city
city_avg = (
df.groupby(["unique_city"])[["latitude", "longitude"]].mean().reset_index()
)
city_avg.to_csv(
join(save_path, "city_center.csv"),
columns=["unique_city", "latitude", "longitude"],
index=False,
)
for class_name in [
"unique_country",
"unique_sub-region",
"unique_region",
"unique_city",
]:
# Load CSV data into a Pandas DataFrame
csv_file = class_name.split("_")[-1] + "_center.csv"
df = pd.read_csv(join(save_path, csv_file), low_memory=False)
splits = ["train"]
categories = sorted(
pd.concat(
[
pd.read_csv(
join(data_path, f"{split}.csv"), low_memory=False
)[class_name]
for split in splits
]
)
.fillna("NaN")
.unique()
.tolist()
)
if "NaN" in categories:
categories.remove("NaN")
# compute the total number of categories - this name is fixed and will be used as a lookup during init
num_classes = len(categories)
# create a mapping from category to index
category_to_index = {category: i for i, category in enumerate(categories)}
dictionary = torch.zeros((num_classes, 2))
for index, row in df.iterrows():
key = row.iloc[0]
value = [row.iloc[1], row.iloc[2]]
if key in categories:
(
dictionary[category_to_index[key], 0],
dictionary[category_to_index[key], 1],
) = np.radians(row.iloc[1]), np.radians(row.iloc[2])
# Save the PyTorch tensor to a .pt file
output_file = join(save_path, "index_to_gps_" + class_name + ".pt")
torch.save(dictionary, output_file)
train = pd.read_csv(join(data_path, "train.csv"), low_memory=False).fillna(
"NaN"
)
u = train.groupby("unique_city").sample(n=1)
country_df = (
u.pivot(index="unique_city", columns="unique_country", values="unique_city")
.notna()
.astype(int)
.fillna(0)
)
country_to_idx = {
category: i for i, category in enumerate(list(country_df.columns))
}
city_country_matrix = torch.tensor(country_df.values) / 1.0
region_df = (
u.pivot(index="unique_city", columns="unique_region", values="unique_city")
.notna()
.astype(int)
.fillna(0)
)
region_to_idx = {category: i for i, category in enumerate(list(region_df.columns))}
city_region_matrix = torch.tensor(region_df.values) / 1.0
country_df = (
u.pivot(index="unique_city", columns="unique_country", values="unique_city")
.notna()
.astype(int)
.fillna(0)
)
country_to_idx = {
category: i for i, category in enumerate(list(country_df.columns))
}
city_country_matrix = torch.tensor(country_df.values) / 1.0
output_file = join(save_path, "city_to_country.pt")
torch.save(city_country_matrix, output_file)
output_file = join(save_path, "country_to_idx.pt")
torch.save(country_to_idx, output_file)
region_df = (
u.pivot(index="unique_city", columns="unique_region", values="unique_city")
.notna()
.astype(int)
.fillna(0)
)
region_to_idx = {category: i for i, category in enumerate(list(region_df.columns))}
city_region_matrix = torch.tensor(region_df.values) / 1.0
output_file = join(save_path, "city_to_region.pt")
torch.save(city_region_matrix, output_file)
output_file = join(save_path, "region_to_idx.pt")
torch.save(region_to_idx, output_file)
area_df = (
u.pivot(index="unique_city", columns="unique_sub-region", values="unique_city")
.notna()
.astype(int)
.fillna(0)
)
area_to_idx = {category: i for i, category in enumerate(list(area_df.columns))}
city_area_matrix = torch.tensor(area_df.values) / 1.0
output_file = join(save_path, "city_to_area.pt")
torch.save(city_area_matrix, output_file)
output_file = join(save_path, "area_to_idx.pt")
torch.save(area_to_idx, output_file)
gt = torch.load(join(save_path, f"id_to_quad_{cfg.depth}_{cfg.do_split}.pt"))
matrixes = []
dicts = []
for i in range(1, cfg.depth):
# Step 2: Truncate strings to size cfg.depth - 1
l = [s[: cfg.depth - i] if len(s) >= cfg.depth + 1 - i else s for s in gt]
# Step 3: Get unique values in the modified list l
h = list(set(l))
# Step 4: Create a dictionary to map unique values to their index
h_dict = {value: index for index, value in enumerate(h)}
dicts.append(h_dict)
# Step 5: Initialize a torch matrix with zeros
matrix = torch.zeros((len(gt), len(h)))
# Step 6: Fill in the matrix with 1s based on the mapping
for h in range(len(gt)):
j = h_dict[l[h]]
matrix[h, j] = 1
matrixes.append(matrix)
output_file = join(save_path, "quadtree_matrixes.pt")
torch.save(matrixes, output_file)
output_file = join(save_path, "quadtree_dicts.pt")
torch.save(dicts, output_file)
if __name__ == "__main__":
main()