Spaces:

nicolas-dufour
/

Plonk

Running

File size: 1,694 Bytes

c4c7cee

from os.path import join, dirname
import numpy as np
import pandas as pd

# City names whose rows get their "country" value overwritten.
NAMIBIAN_CITIES = (
    "Walvis Bay",
    "Keetmanshoop",
    "Warmbad",
    "Rundu",
    "Outapi",
    "Karibib",
    "Otjimbingwe",
    "Ondangwa",
    "Oranjemund",
    "Maltahohe",
    "Otavi",
    "Outjo",
    "Swakopmund",
    "Gobabis",
    "Karasburg",
    "Opuwo",
    "Hentiesbaai",
    "Katima Mulilo",
    "Oshikango",
    "Bethanie",
    "Ongandjera",
    "Mariental",
    "Bagani",
    "Nkurenkuru",
    "Usakos",
    "Rehoboth",
    "Aranos",
    "Omaruru",
    "Arandis",
    "Windhoek",
    "Khorixas",
    "Okahandja",
    "Grootfontein",
    "Tsumeb",
)

# Value written into the "country" column for the cities above.
# NOTE(review): presumably a stand-in for the ISO code "NA", which pandas'
# default CSV parsing reads as a missing value -- confirm with the data owner.
NAMIBIA_COUNTRY_CODE = "NMB"

# Force these columns to be read as text.
CSV_DTYPE = {"category": str, "country": str, "city": str}

# Rows missing any of these values are removed before saving.
REQUIRED_COLUMNS = ["id", "latitude", "longitude"]

# Dataset splits patched by the script; each maps to "<split>.csv".
SPLITS = ("train", "test")


def relabel_namibian_rows(
    df, cities=NAMIBIAN_CITIES, country_code=NAMIBIA_COUNTRY_CODE
):
    """Overwrite the country of rows in the listed cities and drop bad rows.

    The frame is modified in place, in three steps:

    1. Every row whose "city" value is in ``cities`` gets ``country_code``
       written into its "country" column.
    2. The "country" column is checked: every distinct value must be a
       ``str`` (a missing value shows up as a non-string).
    3. Rows with a missing "id", "latitude" or "longitude" are dropped.

    Args:
        df: DataFrame with at least the columns "city", "country", "id",
            "latitude" and "longitude".
        cities: City names to relabel.
        country_code: Value stored in "country" for the matching rows.

    Raises:
        ValueError: If, after relabelling, "country" still contains a value
            that is not a string.
    """
    # NOTE(review): the match is on the city name alone, so a same-named city
    # in another country would be relabelled too -- confirm this is intended.
    in_listed_city = df["city"].isin(list(cities))
    df.loc[in_listed_city, "country"] = country_code

    # An explicit raise (rather than ``assert``) keeps this check active when
    # Python runs with -O, so bad rows are never written back silently.
    invalid = [
        value
        for value in df["country"].unique().tolist()
        if not isinstance(value, str)
    ]
    if invalid:
        raise ValueError(
            f"'country' column contains non-string values: {invalid!r}"
        )

    # Drop the rows (not columns) that lack an id or a coordinate.
    df.dropna(subset=REQUIRED_COLUMNS, inplace=True)


def main():
    """Patch the osv5m train/test CSV files in place."""
    from os.path import abspath

    # Anchor on the absolute script path so the dataset directory resolves
    # correctly even when ``__file__`` is a relative path.
    project_root = dirname(dirname(abspath(__file__)))

    for split in SPLITS:
        fp = join(project_root, "datasets", "osv5m", f"{split}.csv")

        # Read the CSV file into a pandas DataFrame
        df = pd.read_csv(fp, dtype=CSV_DTYPE)

        relabel_namibian_rows(df)

        # Save the modified DataFrame back to the same CSV file
        df.to_csv(fp, index=False)


if __name__ == "__main__":
    main()