"""Split data into train-test-val based on proportions."""
from modules.transformations import logReturn, scaleStandard, createLag

import pandas as pd
import numpy as np
import click
import mlflow


@click.command(
    help="Transforms the data based on customizations "
    "in an mlflow artifact called 'ratings-parquet-dir'"
)
@click.option("--data-parquet")
@click.option(
    "--test-size", default=0.1, help="Proportion of data for test set"
)
@click.option(
    "--val-size", default=0.1, help="Proportion of data for validation set"
)
def split(data_parquet, test_size=0.1, val_size=0.1,
          log_return=None, standard_scale=None):
    """Split data into train-test-validation sets and apply transformations.

    Input:
        :param str data_parquet: Path to a parquet file holding the dataset
        :param float test_size: Proportion of data for the test set
        :param float val_size: Proportion of data for the validation set
        :param list log_return: Column names to transform with ``logReturn``
        :param list standard_scale: Column names to transform with
            ``scaleStandard``

    Output:
        The (transformed) full dataset as a ``pd.DataFrame``
    """
    # Guard against the mutable-default-argument pitfall: a shared list
    # would leak state across calls.
    log_return = [] if log_return is None else log_return
    standard_scale = [] if standard_scale is None else standard_scale

    with mlflow.start_run():
        y_log_ret = False
        y_std_scale = False
        data = pd.read_parquet(data_parquet)

        # Split chronologically (no shuffling) into train / test / val.
        train_size = int(len(data) * (1 - test_size - val_size))
        test_size = int(len(data) * test_size)
        val_size = int(len(data) * val_size)

        # BUG FIX: the original called sort_values(..., inplace=True) on each
        # slice; inplace=True returns None, so train/test/val were all None.
        # Sort the slices and keep the returned frames instead.
        train = data[:train_size].sort_values(by='ds')
        test = data[train_size:train_size + test_size].sort_values(by='ds')
        val = data[train_size + test_size:
                   train_size + test_size + val_size].sort_values(by='ds')
        # NOTE(review): train/test/val are neither returned nor logged as
        # mlflow artifacts here — confirm whether they should be persisted.

        # Apply the log-return transform to the requested columns.
        # Best effort: a failing column is reported and skipped so one bad
        # column does not abort the whole run.
        for col in log_return:
            try:
                data[col] = logReturn(data, col)
            except Exception as e:
                print(e)
        if 'y' in log_return:
            y_log_ret = True

        # Standard-scale the requested columns. 'y' is handled separately
        # below so its fitted scaler is retained and the column is not
        # scaled twice (the original scaled 'y' in the loop and then again,
        # leaving yScaler fitted on already-standardized data).
        for col in standard_scale:
            if col == 'y':
                continue
            try:
                data[col], _ = scaleStandard(data[[col]])
            except Exception as e:
                print(e)
        if 'y' in standard_scale:
            data['y'], yScaler = scaleStandard(data[['y']])
            y_std_scale = True

        return data


if __name__ == "__main__":
    split()