""" |
Preliminary preprocessing on the data, such as: |
- correcting column names |
- encoding the target column |
""" |
import pandas as pd |
from sklearn import preprocessing |
# Columns removed from both data sets before any other processing; the drop
# is done with errors="ignore", so a missing column is not an error.
COLUMNS_TO_DROP = ["Unnamed: 133"]

# Label column, kept as a one-element list because it is used both for
# DataFrame selection and for Index.drop().
TARGET_COLUMN = ["prognosis"]

# Raw column name -> column name used in the preprocessed files. Passed to
# DataFrame.rename(columns=...); several raw names contain stray spaces.
RENAME_COLUMNS = {
    "scurring": "scurving",
    "dischromic _patches": "dischromic_patches",
    "spotting_ urination": "spotting_urination",
    "foul_smell_of urine": "foul_smell_of_urine",
}
def pretty_print(input):
    """
    Prettify the input.

    Underscores are replaced by spaces and every word is title-cased,
    e.g. ``"skin_rash"`` becomes ``"Skin Rash"``.

    Args:
        input: Can be a list of symptoms (lists nested inside it are
            flattened one level) or a single disease given as a string.

    Returns:
        list: Sorted and prettified input.
    """
    # NOTE: the parameter shadows the ``input`` builtin; the name is kept
    # because callers may pass it by keyword.

    # A bare string is one name, not an iterable of characters. Without this
    # wrap the loop below would produce one entry per character.
    if isinstance(input, str):
        input = [input]

    flattened = []
    for item in input:
        if isinstance(item, list):
            flattened.extend(item)
        else:
            flattened.append(item)

    return sorted(item.replace("_", " ").title() for item in flattened)
def map_prediction(target_columns=("y", "prognosis"), *, label=1):
    """
    Look up the value stored alongside an encoded label.

    Reads ``Training_preprocessed.csv`` from the current working directory,
    keeps the distinct combinations of ``target_columns`` and returns the
    entry of the second column for the first row whose ``y`` equals
    ``label``.

    Args:
        target_columns: Two column names to select; the second one holds
            the value that is returned.
        label: Value of the ``y`` column to look up. Defaults to 1.

    Returns:
        The first non-null entry of ``target_columns[1]`` among the rows
        whose ``y`` equals ``label``.

    Raises:
        IndexError: If no row matches ``label``.
    """
    # Selecting with a tuple would be treated as a single key, so always
    # hand pandas a list.
    columns = list(target_columns)

    df = pd.read_csv("Training_preprocessed.csv")
    unique_pairs = df[columns].drop_duplicates()

    # The mask is taken from df["y"] rather than from unique_pairs so the
    # lookup does not depend on "y" being one of target_columns.
    mask = df.loc[unique_pairs.index, "y"] == label
    matches = unique_pairs.loc[mask, columns[1]].dropna()
    if matches.empty:
        raise IndexError(f"no {columns[1]!r} entry found for y == {label!r}")
    return matches.values[0]
if __name__ == "__main__":
    # Load the raw training and testing sets from the working directory.
    df_train = pd.read_csv("Training.csv")
    df_test = pd.read_csv("Testing.csv")

    # Clean both frames in place: discard unwanted columns (absent ones are
    # ignored) and apply the column-name corrections.
    for frame in (df_train, df_test):
        frame.drop(columns=COLUMNS_TO_DROP, axis=1, errors="ignore", inplace=True)
        frame.rename(columns=RENAME_COLUMNS, inplace=True)

    # Fit the encoder on the training labels only, then encode both sets
    # into a new "y" column.
    label_encoder = preprocessing.LabelEncoder()
    train_labels = df_train[TARGET_COLUMN].values.flatten()
    label_encoder.fit(train_labels)
    df_train["y"] = label_encoder.transform(train_labels)
    df_test["y"] = label_encoder.transform(df_test[TARGET_COLUMN].values.flatten())

    # Every column except the target (this includes the new "y") is stored
    # as float32.
    float_columns = df_train.columns.drop(TARGET_COLUMN)
    for frame in (df_train, df_test):
        frame[float_columns] = frame[float_columns].astype("float32")

    # Write the results only after both frames were converted successfully.
    outputs = (
        (df_train, "Training_preprocessed.csv"),
        (df_test, "Testing_preprocessed.csv"),
    )
    for frame, destination in outputs:
        frame.to_csv(path_or_buf=destination, index=False)