nirajandhakal committed on
Commit
3f702ae
1 Parent(s): fe24083

Create data_preprocessing.py

Files changed (1)
  1. data_preprocessing.py +121 -0
data_preprocessing.py ADDED
@@ -0,0 +1,121 @@
+ import pandas as pd
+ import numpy as np
+
+ # Function Definitions
+
+ # Load a CSV file into a DataFrame
+ def load_data(file):
+     return pd.read_csv(file, index_col=False)
+
+ # Handle duplicate rows (count them before dropping, so the report is accurate)
+ def remove_duplicate_rows(df):
+     n_duplicates = df.duplicated().sum()
+     df = df.drop_duplicates()
+     print("Number of removed duplicate rows:", n_duplicates)
+     return df
+
+ # One-hot encode categorical columns
+ def onehot_encoder(df, cols):
+     encoded_cols = []
+     for col in cols:
+         encoder = pd.get_dummies(df[col])
+         encoded_cols += list(encoder.columns)
+         df = df.join(encoder)
+         del df[col]
+
+     return df, encoded_cols
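+
+ # Note: pd.get_dummies(df, columns=cols) encodes the same columns in a
+ # single call (with prefixed dummy names); the explicit loop here also
+ # collects the generated column names so they can be returned to the caller.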
+
+ # Deal with NaN values in specified columns
+ def fillna_values(df, cols, strategy='mean'):
+     for col in cols:
+         if strategy == 'median':
+             df[col] = df[col].fillna(df[col].median())
+         elif strategy == 'mean':
+             df[col] = df[col].fillna(df[col].mean())
+         else:
+             raise ValueError('Invalid filling strategy')
+
+     return df
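+
+ # Example: fill missing ratings with the column median instead of the mean:
+ #   books = fillna_values(books, ['average_rating'], strategy='median')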
+
+ # Preprocess books dataset
+ def preprocess_books(books):
+     # Drop duplicates
+     books = remove_duplicate_rows(books)
+
+     # Get categorical columns
+     cat_cols = ['language_code']
+
+     # One-hot encode categoricals
+     books, _ = onehot_encoder(books, cat_cols)
+
+     # Fill NAs
+     fillna_cols = ['average_rating', 'ratings_count', 'work_ratings_count', 'work_text_reviews_count']
+     books = fillna_values(books, fillna_cols, strategy='mean')
+
+     return books
+
+ # Preprocess tags dataset
+ def preprocess_tags(tags):
+     return tags
+
+ # Preprocess book_tags dataset
+ def preprocess_book_tags(book_tags, tags):
+     # Map tag_id to tag_name using the tags lookup table instead of
+     # dropping the column (book_tags itself carries only tag_id)
+     tag_mapping = dict(zip(tags["tag_id"], tags["tag_name"]))
+     book_tags["tag_name"] = book_tags["tag_id"].map(tag_mapping)
+
+     # Groupby aggregate
+     agg_funcs = {'count': 'sum'}  # Sum or other functions according to requirement
+     book_tags = book_tags.groupby(['goodreads_book_id'], as_index=False).agg(agg_funcs)
+
+     return book_tags
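+
+ # Note: with agg_funcs = {'count': 'sum'}, the grouped result keeps only
+ # goodreads_book_id and the summed count; add e.g. 'tag_name': 'first' to
+ # agg_funcs if the tag labels should survive the aggregation.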
+
+ # Preprocess goodbooks-10k dataset
+ def preprocess_goodbooks(goodbooks):
+     # Soften extreme ratings: compress values above the threshold toward it
+     # with a 0.5 factor, so scaled ratings never exceed the original scale
+     scaling_threshold = 4.5
+     goodbooks['scaled_rating'] = np.where(
+         goodbooks['rating'] > scaling_threshold,
+         scaling_threshold + 0.5 * (goodbooks['rating'] - scaling_threshold),
+         goodbooks['rating'],
+     )
+
+     return goodbooks
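+
+ # Worked example: with the 0.5 factor, a raw rating of 4.8 becomes
+ # 4.5 + 0.5 * 0.3 = 4.65 and 5.0 becomes 4.75; ratings at or below the
+ # 4.5 threshold pass through unchanged.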
+
+ # Merge and save dataset
+ def merge_and_save_dataset():
+     # Read files
+     files = {
+         'books': '../data/books.csv',
+         'book_tags': '../data/book_tags.csv',
+         'goodbooks': '../data/goodbooks-10k.csv',
+         'ratings': '../data/ratings.csv',
+         'tags': '../data/tags.csv',
+         'to_read': '../data/to_read.csv'
+     }
+     data = {name: load_data(path) for name, path in files.items()}
+
+     # Preprocess the individual tables
+     # (ratings and to_read are loaded for completeness but not merged below)
+     preprocessed_books = preprocess_books(data['books'])
+     preprocessed_tags = preprocess_tags(data['tags'])
+     preprocessed_book_tags = preprocess_book_tags(data['book_tags'], preprocessed_tags)
+     preprocessed_goodbooks = preprocess_goodbooks(data['goodbooks'])
+
+     # Join books with their aggregated tags, then with the goodbooks ratings
+     merged_dataset = pd.merge(
+         left=pd.merge(preprocessed_books, preprocessed_book_tags,
+                       left_index=True, right_on="goodreads_book_id"),
+         right=preprocessed_goodbooks,
+         left_index=True, right_index=True
+     )
+
+     # Drop any duplicated columns introduced by the merges
+     merged_dataset = merged_dataset.loc[:, ~merged_dataset.columns.duplicated()]
+
+     # Save the final dataset
+     merged_dataset.to_csv("../data/final_dataset.csv", index=False)
+
+
+ merge_and_save_dataset()
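+
+ # Note: the script assumes the goodbooks-10k CSVs live under ../data/;
+ # adjust the paths in `files` above if your layout differs.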