File size: 2,655 Bytes
33e9cd7 7a25fef 33e9cd7 5a45d93 33e9cd7 331fa48 33e9cd7 331fa48 33e9cd7 331fa48 33e9cd7 331fa48 33e9cd7 331fa48 33e9cd7 331fa48 33e9cd7 331fa48 33e9cd7 ab3b679 33e9cd7 ab3b679 33e9cd7 ab3b679 33e9cd7 7a25fef 33e9cd7 7a25fef 33e9cd7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
# ------------------------------------------------------------------
# Integer encodings for the categorical columns
# ------------------------------------------------------------------
# Disabled cabin (deck letter) encoding, kept for reference only:
# cabin_dict= "Cabin": {"N": 0, "C": 1, "E": 2, "G": 3, "D":4, "A": 5, "B": 6, "F": 7, "T": 8}

# Per-column lookup tables: category label -> integer code.
sex_dict = {
    "female": 1,
    "male": 0,
}
embarked_dict = {
    "S": 0,
    "C": 1,
    "Q": 2,
}

# Nested mapping {column name: {label: code}}; feat_eng passes it to
# DataFrame.replace. Built from copies so the three dicts stay independent.
cleanup_catergories = {
    "sex": dict(sex_dict),
    "embarked": dict(embarked_dict),
}
# Reversed layout: integer code -> list of the titles that share that code.
# NOTE: the mapping below is wrapped in a bare triple-quoted string, so
# `title_dict` is never bound to a name and the block has no runtime effect.
"""
title_dict = {
0: ["Mr"],
1: ["Miss"],
2: ["Mrs"],
3: ["Master"],
# Rare titles, not worth individual categorys
4: [
"Dr",
"Rev",
"Mlle",
"Major",
"Col",
"Countess",
"Capt",
"Ms",
"Sir",
"Lady",
"Nme",
"Don",
"Jonkheer",
],
}
"""
#####################################################
def feat_eng(df):
    """
    Main function containing the feature engineering part
    of the pipeline.

    Steps, in order:
      1. Drop the "Ticket", "Cabin", "Fare", "PassengerId" and "Title" columns.
      2. Keep only the rows whose "Embarked" value is not missing.
      3. Cast "Age" to integers (fractional ages are truncated).
      4. Lower-case every column name.
      5. Replace category labels with integer codes using the module-level
         ``cleanup_catergories`` mapping.

    The input frame is left untouched; a new DataFrame is returned.

    Args:
        df: pandas DataFrame containing at least the columns "Ticket",
            "Cabin", "Fare", "PassengerId", "Title", "Embarked" and "Age".

    Returns:
        A new DataFrame with the remaining columns, lower-cased names and
        encoded categories.

    Raises:
        KeyError: if any of the columns listed above is absent.
        ValueError: if "Age" still contains missing values in the kept rows,
            since those cannot be cast to int.
    """
    # NOTE: title extraction, title-based age imputation and the age / fare /
    # sibsp / cabin binning steps are disabled here, so "Age" must arrive
    # without missing values.
    dropped_columns = ["Ticket", "Cabin", "Fare", "PassengerId", "Title"]

    # drop() without inplace returns a new frame, so the caller's df is not
    # mutated.
    features = df.drop(columns=dropped_columns)

    # Drop the rows with no port of embarkation.
    features = features.loc[features["Embarked"].notna()]

    # assign() builds a fresh frame instead of writing into a filtered slice,
    # which avoids pandas' chained-assignment warning.
    features = features.assign(Age=features["Age"].astype("int"))

    # Lower-case column names (noted in the original as a fix for hopsworks).
    features.columns = features.columns.str.lower()

    # Final encoding: {column: {label: code}} applied column by column.
    return features.replace(cleanup_catergories)
|