|
""" |
|
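Preliminary preprocessing of the raw training and testing data, such as:

- dropping unused columns
- correcting misspelled column names
- encoding the target column as integer labels
- casting the non-target columns to float32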
|
""" |
|
|
|
import pandas as pd |
|
from sklearn import preprocessing |
|
|
|
# Columns removed from both raw CSVs; the script drops them with
# errors="ignore", so a missing column is not an error.
COLUMNS_TO_DROP = ["Unnamed: 133"]

# Name of the textual label column. Kept as a one-element list because it is
# used directly for DataFrame column selection and for Index.drop().
TARGET_COLUMN = ["prognosis"]

# Mapping of raw column name -> corrected column name (typos and stray
# spaces in the raw headers).
RENAME_COLUMNS = {
    "scurring": "scurving",
    "dischromic _patches": "dischromic_patches",
    "spotting_ urination": "spotting_urination",
    "foul_smell_of urine": "foul_smell_of_urine",
}
|
|
|
|
|
def pretty_print(input):
    """
    Prettify symptom or disease names for display.

    Underscores are replaced by spaces, every word is title-cased and the
    resulting names are returned in alphabetical order.

    Args:
        input: Either a single name (``str``, e.g. a disease) or an iterable
            of names (e.g. a list of symptoms). Items that are themselves
            lists are flattened one level.

    Returns:
        list: Sorted and prettified names. A single string input yields a
        one-element list.
    """
    # NOTE: the parameter shadows the builtin ``input``; the name is kept so
    # that existing keyword callers keep working.

    # A bare string is ONE name, not a sequence of characters. Without this
    # guard the loop below would iterate over the individual letters.
    names = [input] if isinstance(input, str) else input

    # Flatten one level of nesting so [["a", "b"], "c"] is handled.
    flattened = []
    for entry in names:
        if isinstance(entry, list):
            flattened.extend(entry)
        else:
            flattened.append(entry)

    return sorted(name.replace("_", " ").title() for name in flattened)
|
|
|
|
|
def map_prediction(
    target_columns=("y", "prognosis"),
    prediction=1,
    csv_path="Training_preprocessed.csv",
):
    """
    Map an encoded label back to the name stored alongside it.

    Reads the preprocessed training CSV, keeps the distinct
    (encoded label, name) pairs and returns the name paired with
    ``prediction``.

    Args:
        target_columns: Two column names, ``(encoded_column, name_column)``.
            Any sequence is accepted (list or tuple).
        prediction: Encoded label to look up. Defaults to ``1``, the value
            this function previously had hard-coded.
        csv_path: Path of the preprocessed training CSV.

    Returns:
        The ``name_column`` value of the first row whose ``encoded_column``
        equals ``prediction``.

    Raises:
        ValueError: If no row carries the requested encoded label.
    """
    encoded_column, name_column = target_columns[0], target_columns[1]

    df = pd.read_csv(csv_path)

    # list(...) matters: pandas treats a tuple as ONE column key, whereas a
    # list selects several columns.
    label_pairs = df[list(target_columns)].drop_duplicates()

    # Boolean-mask selection replaces the former `.relevent_df.where(...)`
    # chain, which raised AttributeError on every call.
    matches = label_pairs.loc[
        label_pairs[encoded_column] == prediction, name_column
    ].dropna()

    if matches.empty:
        raise ValueError(
            f"No entry with {encoded_column} == {prediction!r} in {csv_path}"
        )

    return matches.iloc[0]
|
|
|
|
|
if __name__ == "__main__":
    # Load the raw train/test splits from the current working directory.
    df_train = pd.read_csv("Training.csv")
    df_test = pd.read_csv("Testing.csv")

    # Apply the same clean-up to both splits: remove the unwanted columns
    # (silently skipped when absent) and correct the misspelled headers.
    for frame in (df_train, df_test):
        frame.drop(columns=COLUMNS_TO_DROP, axis=1, errors="ignore", inplace=True)
        frame.rename(columns=RENAME_COLUMNS, inplace=True)

    # Fit the encoder on the training labels only, then reuse the fitted
    # encoder for the test labels so both splits share one encoding.
    train_labels = df_train[TARGET_COLUMN].values.flatten()
    test_labels = df_test[TARGET_COLUMN].values.flatten()

    label_encoder = preprocessing.LabelEncoder()
    df_train["y"] = label_encoder.fit(train_labels).transform(train_labels)
    df_test["y"] = label_encoder.transform(test_labels)

    # Everything except the textual target column is cast to float32. The
    # column list is computed after "y" was added, so "y" is cast as well.
    float_columns = df_train.columns.drop(TARGET_COLUMN)
    for frame in (df_train, df_test):
        frame[float_columns] = frame[float_columns].astype("float32")

    # Write the results only after both splits were converted successfully.
    outputs = (
        (df_train, "Training_preprocessed.csv"),
        (df_test, "Testing_preprocessed.csv"),
    )
    for frame, out_path in outputs:
        frame.to_csv(path_or_buf=out_path, index=False)
|
|