Spaces:
Sleeping
Sleeping
""" | |
Preliminary preprocessing on the data, such as: | |
- correcting column names | |
- encoding the target column | |
""" | |
import pandas as pd | |
from sklearn import preprocessing | |
# Files location | |
TRAINING_FILE_NAME = "./data/Training.csv" | |
TESTING_FILE_NAME = "./data/Testing.csv" | |
# Columns processing | |
TARGET_COLUMN = "prognosis" | |
DROP_COLUMNS = ["Unnamed: 133"] | |
RENAME_COLUMNS = { | |
"scurring": "scurving", | |
"dischromic _patches": "dischromic_patches", | |
"spotting_ urination": "spotting_urination", | |
"foul_smell_of urine": "foul_smell_of_urine", | |
} | |
RENAME_VALUES = { | |
"(vertigo) Paroymsal Positional Vertigo": "Paroymsal Positional Vertigo", | |
"Dimorphic hemmorhoids(piles)": "Dimorphic hemmorhoids (piles)", | |
"Peptic ulcer diseae": "Peptic Ulcer", | |
} | |
if __name__ == "__main__": | |
# Load data | |
df_train = pd.read_csv(TRAINING_FILE_NAME) | |
df_test = pd.read_csv(TESTING_FILE_NAME) | |
# Remove unseless columns | |
df_train.drop(columns=DROP_COLUMNS, axis=1, errors="ignore", inplace=True) | |
df_test.drop(columns=DROP_COLUMNS, axis=1, errors="ignore", inplace=True) | |
# Correct some typos in some columns name | |
df_train.rename(columns=RENAME_COLUMNS, inplace=True) | |
df_test.rename(columns=RENAME_COLUMNS, inplace=True) | |
df_train[TARGET_COLUMN].replace(RENAME_VALUES.keys(), RENAME_VALUES.values(), inplace=True) | |
df_train[TARGET_COLUMN] = df_train[TARGET_COLUMN].apply(str.title) | |
df_test[TARGET_COLUMN].replace(RENAME_VALUES.keys(), RENAME_VALUES.values(), inplace=True) | |
df_test[TARGET_COLUMN] = df_test[TARGET_COLUMN].apply(str.title) | |
# Convert the `TARGET_COLUMN` to a numeric label | |
label_encoder = preprocessing.LabelEncoder() | |
label_encoder.fit(df_train[[TARGET_COLUMN]].values.flatten()) | |
df_train[f"{TARGET_COLUMN}_encoded"] = label_encoder.transform( | |
df_train[[TARGET_COLUMN]].values.flatten() | |
) | |
df_test[f"{TARGET_COLUMN}_encoded"] = label_encoder.transform( | |
df_test[[TARGET_COLUMN]].values.flatten() | |
) | |
# Cast X features from int64 to float32 | |
float_columns = df_train.columns.drop([TARGET_COLUMN]) | |
df_train[float_columns] = df_train[float_columns].astype("float32") | |
df_test[float_columns] = df_test[float_columns].astype("float32") | |
# Save preprocessed data | |
df_train.to_csv(path_or_buf="./data/Training_preprocessed.csv", index=False) | |
df_test.to_csv(path_or_buf="./data/Testing_preprocessed.csv", index=False) | |