removed title
Browse files- feature_engineering.py +15 -15
feature_engineering.py
CHANGED
@@ -48,30 +48,30 @@ def feat_eng(df):
|
|
48 |
# )
|
49 |
|
50 |
# Drop features and NaNs
|
51 |
-
df.drop(["Ticket", "Cabin", "Fare", "PassengerId"], axis=1, inplace=True)
|
52 |
df = df[df["Embarked"].notna()]
|
53 |
|
54 |
# Feature engineering
|
55 |
# Create a title feature
|
56 |
-
if "Name" in df.columns:
|
57 |
-
|
58 |
-
|
59 |
|
60 |
-
# Interpolate missing ages
|
61 |
-
for title in df["Title"].unique():
|
62 |
-
|
63 |
-
|
64 |
|
65 |
-
|
66 |
-
|
67 |
|
68 |
-
|
69 |
-
|
70 |
|
71 |
-
|
72 |
-
|
73 |
|
74 |
-
|
75 |
|
76 |
# Cast age to int
|
77 |
df["Age"] = df["Age"].astype("int")
|
|
|
48 |
# )
|
49 |
|
50 |
# Drop features and NaNs
|
51 |
+
df.drop(["Ticket", "Cabin", "Fare", "PassengerId", "Title"], axis=1, inplace=True)
|
52 |
df = df[df["Embarked"].notna()]
|
53 |
|
54 |
# Feature engineering
|
55 |
# Create a title feature
|
56 |
+
# if "Name" in df.columns:
|
57 |
+
# df["Title"] = df.Name.str.extract("([A-Za-z]+)\\.")
|
58 |
+
# df.drop("Name", axis=1, inplace=True)
|
59 |
|
60 |
+
# # Interpolate missing ages
|
61 |
+
# for title in df["Title"].unique():
|
62 |
+
# # This should be optimized
|
63 |
+
# mask = (df["Title"] == title) & df["Age"].isna()
|
64 |
|
65 |
+
# # Get suitable candidates for age sampling
|
66 |
+
# candidates = df.loc[(df["Title"] == title) & df["Age"].notna()]
|
67 |
|
68 |
+
# g = candidates.groupby("Age", dropna=True)["Age"].count()
|
69 |
+
# g = g.apply(lambda x: x / g.sum())
|
70 |
|
71 |
+
# weights = g.to_numpy()
|
72 |
+
# ages = g.index
|
73 |
|
74 |
+
# df.update(df["Age"][mask].apply(lambda x: np.random.choice(ages, p=weights)))
|
75 |
|
76 |
# Cast age to int
|
77 |
df["Age"] = df["Age"].astype("int")
|