|
|
|
|
|
# Integer codes for the categorical ``sex`` column.
sex_dict = {"female": 1, "male": 0}

# Integer codes for the categorical ``embarked`` column (port letter).
embarked_dict = {"S": 0, "C": 1, "Q": 2}

# Nested mapping in the form DataFrame.replace expects:
# {column name: {raw value: integer code}}.
# Keys are the *lower-case* column names, because feat_eng lower-cases the
# column names before it applies this mapping.  "N" is the placeholder deck
# letter feat_eng assigns to passengers without a recorded cabin.
# (The identifier keeps its historical spelling so existing references to it
# continue to work.)
cleanup_catergories = {
    "sex": dict(sex_dict),
    "embarked": dict(embarked_dict),
    "cabin": {
        "N": 0,
        "C": 1,
        "E": 2,
        "G": 3,
        "D": 4,
        "A": 5,
        "B": 6,
        "F": 7,
        "T": 8,
    },
}
|
|
|
|
|
""" |
|
title_dict = { |
|
0: ["Mr"], |
|
1: ["Miss"], |
|
2: ["Mrs"], |
|
3: ["Master"], |
|
# Rare titles, not worth individual categories |
|
4: [ |
|
"Dr", |
|
"Rev", |
|
"Mlle", |
|
"Major", |
|
"Col", |
|
"Countess", |
|
"Capt", |
|
"Ms", |
|
"Sir", |
|
"Lady", |
|
"Nme", |
|
"Don", |
|
"Jonkheer", |
|
], |
|
} |
|
""" |
|
|
|
|
|
def _draw_ages(known_ages, size):
    """Draw ``size`` ages from the empirical distribution of ``known_ages``."""
    import numpy as np

    # Relative frequency of every observed age, ordered by age so the sampling
    # order is the same as for a ``groupby("Age")`` count.
    freq = known_ages.value_counts(normalize=True).sort_index()
    return np.random.choice(freq.index.to_numpy(), size=size, p=freq.to_numpy())


def _impute_missing_ages(df):
    """Fill missing ``Age`` values of ``df`` in place by sampling observed ages."""
    missing = df["Age"].isna()
    if not missing.any():
        return

    observed = df.loc[~missing, "Age"]

    if "Title" in df.columns:
        # Preferred source: the age distribution of passengers sharing the
        # same title (titles are visited in order of first appearance).
        for title in df["Title"].unique():
            same_title = df["Title"] == title
            targets = same_title & missing
            pool = df.loc[same_title & ~missing, "Age"]
            n_targets = int(targets.sum())
            if n_targets and not pool.empty:
                df.loc[targets, "Age"] = _draw_ages(pool, n_targets)

    # Fallback for rows whose title is missing or has no passenger with a
    # known age: sample from all originally observed ages instead of failing.
    leftover = df["Age"].isna()
    if leftover.any():
        if observed.empty:
            raise ValueError("cannot impute 'Age': no passenger has a known age")
        df.loc[leftover, "Age"] = _draw_ages(observed, int(leftover.sum()))


def feat_eng(df):
    """
    Main function containing the feature engineering part
    of the pipeline.

    Steps, in order:

    1. Drop the ``Ticket`` column and every row without an ``Embarked`` value.
    2. If a ``Name`` column is present, extract the title (the word directly
       before the first ``.``) into ``Title`` and drop ``Name``.
    3. Fill missing ``Age`` values by sampling from the observed ages of
       passengers with the same title, falling back to all observed ages
       when a title offers none; then cast ``Age`` to ``int`` (fractional
       ages are truncated).
    4. Reduce ``Cabin`` to its first character, using ``"N"`` when missing.
    5. Lower-case all column names.
    6. Replace categorical values with the integer codes from the
       module-level ``cleanup_catergories`` mapping.

    Sampling uses NumPy's global random state (``np.random.choice``); seed it
    with ``np.random.seed`` for reproducible output.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger data.  Must contain the columns ``Ticket``,
        ``Embarked``, ``Age`` and ``Cabin``.  The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the engineered features.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If ages are missing but no passenger has a known age.
    """
    # Work on a private copy: the caller's frame stays untouched and the
    # column assignments below never write into a slice of another frame.
    df = df.drop(columns=["Ticket"])
    df = df[df["Embarked"].notna()].copy()

    if "Name" in df.columns:
        df["Title"] = df["Name"].str.extract(r"([A-Za-z]+)\.", expand=False)
        df = df.drop(columns=["Name"])

    _impute_missing_ages(df)
    df["Age"] = df["Age"].astype("int")

    # Keep only the first character of the cabin string; "N" marks a missing
    # cabin.  The object cast keeps ``.str`` usable when every cabin is missing
    # and the column therefore has a float dtype.
    df["Cabin"] = df["Cabin"].astype(object).str.slice(0, 1).fillna("N")

    df.columns = df.columns.str.lower()

    # The column names are lower-case now, so the mapping keys must be too;
    # a key such as "Cabin" would otherwise silently match nothing.
    encodings = {
        str(column).lower(): codes
        for column, codes in cleanup_catergories.items()
    }
    return df.replace(encodings)
|
|
|
|
|
|