import pandas as pd
import numpy as np

# Function Definitions

# Load a CSV file into a DataFrame
def load_data(file):
    return pd.read_csv(file, index_col=False)

# Handle duplicate rows
def remove_duplicate_rows(df):
    n_before = len(df)
    df = df.drop_duplicates()
    print("Number of removed duplicate rows:", n_before - len(df))
    return df

# One hot encode categorical columns
def onehot_encoder(df, cols):
    encoded_cols = []
    for col in cols:
        # Prefix dummy columns with the source column name to avoid collisions
        dummies = pd.get_dummies(df[col], prefix=col)
        encoded_cols += list(dummies.columns)
        df = df.join(dummies)
        df = df.drop(columns=col)

    return df, encoded_cols
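
# A minimal usage sketch (hypothetical toy data), illustrating the return values:
# >>> toy = pd.DataFrame({'language_code': ['eng', 'fre', 'eng'], 'pages': [100, 200, 150]})
# >>> encoded, new_cols = onehot_encoder(toy, ['language_code'])
# >>> new_cols
# ['language_code_eng', 'language_code_fre']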

# Deal with NaN values in specified columns
def fillna_values(df, cols, strategy='mean'):
    for col in cols:
        if strategy == 'median':
            df[col] = df[col].fillna(df[col].median())
        elif strategy == 'mean':
            df[col] = df[col].fillna(df[col].mean())
        else:
            raise ValueError('Invalid filling strategy')

    return df
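
# A minimal usage sketch (hypothetical toy data), assuming numeric columns:
# >>> toy = pd.DataFrame({'average_rating': [4.0, np.nan, 3.0]})
# >>> fillna_values(toy, ['average_rating'], strategy='median')['average_rating'].tolist()
# [4.0, 3.5, 3.0]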

# Preprocess books dataset
def preprocess_books(books):
    # Drop duplicates
    books = remove_duplicate_rows(books)
    
    # Get categorical columns
    cat_cols = ['language_code']

    # One-hot encode categoricals
    books, _ = onehot_encoder(books, cat_cols)

    # Fill NAs
    fillna_cols = ['average_rating', 'ratings_count', 'work_ratings_count', 'work_text_reviews_count']
    books = fillna_values(books, fillna_cols, strategy='mean')

    return books

# Preprocess tags dataset
def preprocess_tags(tags):
    return tags

# Preprocess book_tags dataset; tag names live in the separate tags dataset,
# so it is passed in to resolve tag_id -> tag_name
def preprocess_book_tags(book_tags, tags):
    # Map tag_id to tag_name instead of dropping the column
    tag_mapping = dict(zip(tags["tag_id"], tags["tag_name"]))
    book_tags["tag_name"] = book_tags["tag_id"].map(tag_mapping)

    # Group by book and aggregate tag counts
    agg_funcs = {'count': 'sum'}   # Sum, or other functions according to requirement
    book_tags = book_tags.groupby(['goodreads_book_id'], as_index=False).agg(agg_funcs)

    return book_tags
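
# A minimal usage sketch (hypothetical toy data):
# >>> tags = pd.DataFrame({'tag_id': [1, 2], 'tag_name': ['fiction', 'sci-fi']})
# >>> bt = pd.DataFrame({'goodreads_book_id': [10, 10], 'tag_id': [1, 2], 'count': [5, 3]})
# >>> preprocess_book_tags(bt, tags)
#    goodreads_book_id  count
# 0                 10      8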

# Preprocess goodbooks-10k dataset
def preprocess_goodbooks(goodbooks):
    # Soften extreme ratings: compress the portion of a rating above the
    # threshold by a factor of 0.5 (an assumed constant; tune as needed)
    scaling_threshold = 4.5
    goodbooks['scaled_rating'] = np.where(
        goodbooks['rating'] > scaling_threshold,
        scaling_threshold + 0.5 * (goodbooks['rating'] - scaling_threshold),
        goodbooks['rating'])

    return goodbooks
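
# A quick check of the compression (with the assumed factor of 0.5 above),
# on hypothetical toy data; ratings at or below the threshold are unchanged:
# >>> toy = pd.DataFrame({'rating': [3.0, 4.8, 5.0]})
# >>> preprocess_goodbooks(toy)['scaled_rating'].round(2).tolist()
# [3.0, 4.65, 4.75]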

# Merge and save dataset
def merge_and_save_dataset():
    # Input file paths
    files = {
        'books': '../data/books.csv',
        'book_tags': '../data/book_tags.csv',
        'goodbooks': '../data/goodbooks-10k.csv',
        'ratings': '../data/ratings.csv',
        'tags': '../data/tags.csv',
        'to_read': '../data/to_read.csv'
    }

    # Load and preprocess each dataset used in the merge
    preprocessed_books = preprocess_books(load_data(files['books']))
    tags = preprocess_tags(load_data(files['tags']))
    preprocessed_book_tags = preprocess_book_tags(load_data(files['book_tags']), tags)
    preprocessed_goodbooks = preprocess_goodbooks(load_data(files['goodbooks']))

    # Join books with the aggregated tag counts on goodreads_book_id, then
    # attach the goodbooks ratings aligned by row index
    merged_dataset = pd.merge(preprocessed_books, preprocessed_book_tags, on="goodreads_book_id")
    merged_dataset = pd.merge(merged_dataset, preprocessed_goodbooks, left_index=True, right_index=True)

    # Drop columns duplicated by the merges
    merged_dataset = merged_dataset.loc[:, ~merged_dataset.columns.duplicated()]

    # Save the final dataset
    merged_dataset.to_csv("../data/final_dataset.csv", index=False)


if __name__ == "__main__":
    merge_and_save_dataset()