# Preprocessing script: builds quadtree GPS clusters and admin-center lookup
# tables for the osv5m dataset (reads <data_dir>/osv5m/{train,test}.csv and
# writes CSV/PT artifacts into <data_dir>).
import pandas as pd | |
import torch | |
import numpy as np | |
from os.path import join | |
import matplotlib.pyplot as plt | |
import hydra | |
class QuadTree(object):
    """Quadtree over the (latitude, longitude) columns of a DataFrame.

    Each node keeps the rows inside its bounding box and recursively splits
    them into up to four children while the depth budget allows and the node
    holds at least ``do_split`` rows.  Leaf ids are strings over {0,1,2,3}
    encoding the path from the root.
    """

    def __init__(self, data, mins=None, maxs=None, id="", depth=3, do_split=1000):
        self.id = id
        self.data = data
        # Infer the bounding box from the data when not supplied (root node).
        if mins is None:
            mins = data[["latitude", "longitude"]].to_numpy().min(0)
        if maxs is None:
            maxs = data[["latitude", "longitude"]].to_numpy().max(0)
        self.mins = np.asarray(mins)
        self.maxs = np.asarray(maxs)
        self.sizes = self.maxs - self.mins
        self.children = []

        mids = 0.5 * (self.mins + self.maxs)
        lat_lo, lon_lo = self.mins
        lat_hi, lon_hi = self.maxs
        lat_mid, lon_mid = mids

        # Stop splitting once the depth budget is spent or the node is small.
        if depth <= 0 or len(self.data) < do_split:
            return

        # Explicit < / >= masks (kept separate so NaN rows fall in no quadrant,
        # exactly as in the original comparisons).
        lo_lat = data["latitude"] < lat_mid
        hi_lat = data["latitude"] >= lat_mid
        lo_lon = data["longitude"] < lon_mid
        hi_lon = data["longitude"] >= lon_mid

        # (subset, child mins, child maxs, id suffix) for the four quadrants.
        quadrants = (
            (data[lo_lat & lo_lon], [lat_lo, lon_lo], [lat_mid, lon_mid], "0"),
            (data[lo_lat & hi_lon], [lat_lo, lon_mid], [lat_mid, lon_hi], "1"),
            (data[hi_lat & lo_lon], [lat_mid, lon_lo], [lat_hi, lon_mid], "2"),
            (data[hi_lat & hi_lon], [lat_mid, lon_mid], [lat_hi, lon_hi], "3"),
        )
        for subset, child_mins, child_maxs, suffix in quadrants:
            # Only quadrants that actually contain rows become children.
            if subset.shape[0] > 0:
                self.children.append(
                    QuadTree(
                        subset,
                        child_mins,
                        child_maxs,
                        id + suffix,
                        depth - 1,
                        do_split=do_split,
                    )
                )

    def unwrap(self):
        """Return {leaf_id: [mins, maxs, data copy]} for every leaf below."""
        if not self.children:
            return {self.id: [self.mins, self.maxs, self.data.copy()]}
        leaves = dict()
        for child in self.children:
            leaves.update(child.unwrap())
        return leaves
def extract(qt, name_new_column):
    """Flatten a QuadTree into per-cluster boundaries and labeled data.

    Returns (boundaries, data, id_to_quad) where boundaries maps the integer
    cluster index to (min_lat, min_lon, max_lat, max_lon, mean_lat, mean_lon),
    data is the concatenation of all leaf DataFrames with the cluster index
    written into ``name_new_column``, and id_to_quad is the array of leaf ids.
    """
    cluster = qt.unwrap()
    id_to_quad = np.array(list(cluster.keys()))
    boundaries = {}
    frames = []
    for idx, ((min_lat, min_lon), (max_lat, max_lon), points) in enumerate(
        cluster.values()
    ):
        points[name_new_column] = int(idx)
        frames.append(points)
        boundaries[idx] = (
            float(min_lat),
            float(min_lon),
            float(max_lat),
            float(max_lon),
            points["latitude"].mean(),
            points["longitude"].mean(),
        )
    return boundaries, pd.concat(frames), id_to_quad
def vizu(name_new_column, df_train, boundaries, save_path):
    """Save two diagnostic plots: a cluster-size histogram and a cluster map."""
    n_clusters = len(boundaries)

    # Histogram of images per cluster (log scale: sizes span orders of magnitude).
    plt.hist(df_train[name_new_column], bins=n_clusters)
    plt.xlabel("Cluster ID")
    plt.ylabel("Number of images")
    plt.title("Cluster distribution")
    plt.yscale("log")
    plt.savefig(join(save_path, f"{name_new_column}_distrib.png"))
    plt.clf()

    # Scatter map; shuffle the palette so adjacent cluster ids get distinct colors.
    colors = np.random.permutation(n_clusters)[df_train[name_new_column].to_numpy()]
    plt.scatter(
        df_train["longitude"].to_numpy(),
        df_train["latitude"].to_numpy(),
        c=colors,
        cmap="tab20",
        s=0.1,
        alpha=0.5,
    )
    plt.xlabel("Longitude")
    plt.ylabel("Latitude")
    plt.title("Quadtree map")
    plt.savefig(join(save_path, f"{name_new_column}_map.png"))
def _assign_clusters_to_test(df_test, boundaries, name_new_column):
    """Label each test row with the first cluster whose box contains it (in place).

    Containment is strict (> min and < max).  NOTE(review): a row that falls
    inside no box gets cluster 0, because np.argmax over an all-False mask
    returns index 0 — confirm this fallback is intended.
    """
    test_lat = np.expand_dims(df_test["latitude"].to_numpy(), -1)
    test_lon = np.expand_dims(df_test["longitude"].to_numpy(), -1)
    above_lat = test_lat > np.expand_dims(boundaries["min_lat"].to_numpy(), 0)
    below_lat = test_lat < np.expand_dims(boundaries["max_lat"].to_numpy(), 0)
    above_lon = test_lon > np.expand_dims(boundaries["min_lon"].to_numpy(), 0)
    below_lon = test_lon < np.expand_dims(boundaries["max_lon"].to_numpy(), 0)
    mask = np.logical_and(
        np.logical_and(above_lat, below_lat), np.logical_and(above_lon, below_lon)
    )
    df_test[name_new_column] = np.argmax(mask, axis=1)


def _save_admin_centers(df, save_path):
    """Save the mean GPS location of every unique admin entity to CSV."""
    specs = [
        ("unique_country", "country_center.csv"),
        ("unique_region", "region_center.csv"),
        ("unique_sub-region", "sub-region_center.csv"),
        ("unique_city", "city_center.csv"),
    ]
    for column, filename in specs:
        centers = df.groupby(column)[["latitude", "longitude"]].mean().reset_index()
        centers.to_csv(
            join(save_path, filename),
            columns=[column, "latitude", "longitude"],
            index=False,
        )


def _save_class_center_tensors(data_path, save_path):
    """For each admin class, save a (num_classes, 2) tensor of GPS centers in radians.

    Class indices come from the sorted unique values of the class column in
    train.csv (with the "NaN" placeholder dropped); this name/ordering is fixed
    and is used as a lookup at init time elsewhere in the project.
    """
    for class_name in [
        "unique_country",
        "unique_sub-region",
        "unique_region",
        "unique_city",
    ]:
        csv_file = class_name.split("_")[-1] + "_center.csv"
        df = pd.read_csv(join(save_path, csv_file), low_memory=False)
        splits = ["train"]
        categories = sorted(
            pd.concat(
                [
                    pd.read_csv(
                        join(data_path, f"{split}.csv"), low_memory=False
                    )[class_name]
                    for split in splits
                ]
            )
            .fillna("NaN")
            .unique()
            .tolist()
        )
        if "NaN" in categories:
            categories.remove("NaN")
        num_classes = len(categories)
        category_to_index = {category: i for i, category in enumerate(categories)}
        dictionary = torch.zeros((num_classes, 2))
        for _, row in df.iterrows():
            key = row.iloc[0]
            # Dict membership instead of an O(n) list scan per row.
            if key in category_to_index:
                idx = category_to_index[key]
                dictionary[idx, 0] = np.radians(row.iloc[1])
                dictionary[idx, 1] = np.radians(row.iloc[2])
        torch.save(dictionary, join(save_path, "index_to_gps_" + class_name + ".pt"))


def _save_city_membership(u, column, matrix_name, idx_name, save_path):
    """Save a binary city x <column> membership matrix and the column -> index map."""
    pivot = (
        u.pivot(index="unique_city", columns=column, values="unique_city")
        .notna()
        .astype(int)
        .fillna(0)
    )
    to_idx = {category: i for i, category in enumerate(list(pivot.columns))}
    matrix = torch.tensor(pivot.values) / 1.0
    torch.save(matrix, join(save_path, matrix_name))
    torch.save(to_idx, join(save_path, idx_name))


def _save_quadtree_hierarchy(save_path, depth, do_split):
    """Build coarse leaf->ancestor assignment matrices from the saved quad ids."""
    gt = torch.load(join(save_path, f"id_to_quad_{depth}_{do_split}.pt"))
    matrixes = []
    dicts = []
    for i in range(1, depth):
        # Truncate each leaf id to its ancestor at level depth - i.
        truncated = [s[: depth - i] if len(s) >= depth + 1 - i else s for s in gt]
        ancestors = list(set(truncated))
        ancestor_to_idx = {value: index for index, value in enumerate(ancestors)}
        dicts.append(ancestor_to_idx)
        # One-hot leaf -> ancestor assignment matrix.
        matrix = torch.zeros((len(gt), len(ancestors)))
        for leaf in range(len(gt)):
            matrix[leaf, ancestor_to_idx[truncated[leaf]]] = 1
        matrixes.append(matrix)
    torch.save(matrixes, join(save_path, "quadtree_matrixes.pt"))
    torch.save(dicts, join(save_path, "quadtree_dicts.pt"))


def main(cfg):
    """Run the full osv5m preprocessing pipeline.

    Expects a config (presumably Hydra) with: data_dir, depth, do_split,
    overwrite_csv.  Reads <data_dir>/osv5m/{train,test}.csv and writes all
    cluster/center/hierarchy artifacts into <data_dir>.
    """
    data_path = join(cfg.data_dir, "osv5m")
    save_path = cfg.data_dir
    name_new_column = f"quadtree_{cfg.depth}_{cfg.do_split}"

    # Create clusters from train images.
    train_fp = join(data_path, "train.csv")
    df_train = pd.read_csv(train_fp, low_memory=False)
    qt = QuadTree(df_train, depth=cfg.depth, do_split=cfg.do_split)
    boundaries, df_train, id_to_quad = extract(qt, name_new_column)
    vizu(name_new_column, df_train, boundaries, save_path)

    # Save cluster boundaries.
    boundaries = pd.DataFrame.from_dict(
        boundaries,
        orient="index",
        columns=["min_lat", "min_lon", "max_lat", "max_lon", "mean_lat", "mean_lon"],
    )
    boundaries.to_csv(
        join(save_path, f"{name_new_column}.csv"), index_label="cluster_id"
    )

    # Assign test images to clusters.
    test_fp = join(data_path, "test.csv")
    df_test = pd.read_csv(test_fp)
    _assign_clusters_to_test(df_test, boundaries, name_new_column)

    # Save index -> GPS lookup for the quadtree cells, and the leaf-id array.
    lat = torch.tensor(boundaries["mean_lat"])
    lon = torch.tensor(boundaries["mean_lon"])
    coord = torch.stack([lat, lon], dim=-1)
    torch.save(
        coord, join(save_path, f"index_to_gps_quadtree_{cfg.depth}_{cfg.do_split}.pt")
    )
    torch.save(id_to_quad, join(save_path, f"id_to_quad_{cfg.depth}_{cfg.do_split}.pt"))

    # Overwrite test.csv and train.csv with the new cluster column.
    if cfg.overwrite_csv:
        df_train.to_csv(train_fp, index=False)
        df_test.to_csv(test_fp, index=False)

    # Average location per admin entity, then per-class GPS-center tensors.
    df = pd.read_csv(join(data_path, "train.csv"), low_memory=False).fillna("NaN")
    _save_admin_centers(df, save_path)
    _save_class_center_tensors(data_path, save_path)

    # City -> {country, region, area} membership matrices.
    # (The original computed the country/region pivots twice; the first copies
    # were dead code and are removed here.)
    train = pd.read_csv(join(data_path, "train.csv"), low_memory=False).fillna("NaN")
    # One representative row per city is enough to read its admin labels.
    u = train.groupby("unique_city").sample(n=1)
    _save_city_membership(
        u, "unique_country", "city_to_country.pt", "country_to_idx.pt", save_path
    )
    _save_city_membership(
        u, "unique_region", "city_to_region.pt", "region_to_idx.pt", save_path
    )
    _save_city_membership(
        u, "unique_sub-region", "city_to_area.pt", "area_to_idx.pt", save_path
    )

    # Quadtree coarse-to-fine hierarchy tensors.
    _save_quadtree_hierarchy(save_path, cfg.depth, cfg.do_split)
# NOTE(review): `main` requires a `cfg` argument, yet it is called bare here.
# hydra is imported at the top of the file but no @hydra.main decorator is
# visible in this view — presumably it was lost; as written, `main()` would
# raise TypeError. Confirm against the original project before running.
if __name__ == "__main__":
    main()