import datetime
import os

import gdown
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import torch


def crude_poc(src_link='https://drive.google.com/drive/folders/10DnogdypxvAarscAlwUVnXPtwxzfiAlB?usp=sharing',
              output_path='/data/crude_oil'):
    """Download the crude oil dataset from the shared Google Drive folder."""
    # The link points to a Drive folder, so use download_folder rather than download.
    gdown.download_folder(src_link, output=output_path, quiet=False)


def convert_date(date):
    """Parse a date string against several common formats; return pd.NaT on failure."""
    date_formats = ['%m/%d/%Y', '%Y-%m-%d', '%d-%m-%Y']  # Add more formats as needed
    for fmt in date_formats:
        try:
            return datetime.datetime.strptime(date, fmt).date()
        except ValueError:
            continue
    print(f"Error converting date: {date}")
    return pd.NaT


def processCSV(data_folder):
    """
    Processes all CSV files in the specified folder, performing various cleaning and
    transformation steps, and merges them into a single DataFrame.

    Args:
        data_folder (str): The path to the folder containing the CSV files to be processed.

    Returns:
        pd.DataFrame: A merged DataFrame containing the processed data from all valid CSV
        files. If no valid files are processed, returns an empty DataFrame.
    """
    processed_dfs = []

    # Iterate through all CSV files in the folder
    for file_name in os.listdir(data_folder):
        if not file_name.endswith('.csv'):
            continue
        file_path = os.path.join(data_folder, file_name)
        try:
            df = pd.read_csv(file_path)

            # Drop columns that are not needed, ignoring files that lack them
            df = df.drop(columns=['Open', 'Volume', 'Unnamed: 0'], errors='ignore')

            if 'Close/Last' in df.columns:
                df = df.rename({'Close/Last': 'Close'}, axis=1)

            if 'Date' in df.columns:
                df['Date'] = pd.to_datetime(df['Date'].apply(convert_date), errors='coerce')

            # Rename columns (except 'Date') by adding the file name prefix
            file_label = os.path.splitext(file_name)[0]
            df = df.rename(
                {col: f"{file_label}_{col}" for col in df.columns if col != 'Date'},
                axis=1
            )

            # Keep the DataFrame only if it contains 'Date' and at least one other column
            if 'Date' in df.columns and any(col != 'Date' for col in df.columns):
                processed_dfs.append(df)
            else:
                print(f"Skipped {file_name} due to missing 'Date' or valid data columns after processing.")
        except Exception as e:
            print(f"Error processing {file_name}: {e}")

    if not processed_dfs:
        print("No valid DataFrames to merge.")
        return pd.DataFrame()  # Return an empty DataFrame if no valid files were processed

    # Merge all DataFrames on 'Date'; an inner join keeps only dates present in every file
    merged_df = processed_dfs[0]
    for df in processed_dfs[1:]:
        try:
            merged_df = merged_df.merge(df, on='Date', how='inner')
        except Exception as e:
            print(f"Error merging DataFrame: {e}")

    return merged_df
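

# Minimal usage sketch of the download-then-merge pipeline. Assumptions: gdown can
# reach the shared folder, the CSVs land directly in DATA_FOLDER, and merged price
# columns end in '_Close' (from the file-name prefix plus the 'Close' rename above).
# The folder path simply reuses the default from crude_poc; adjust as needed.
if __name__ == '__main__':
    DATA_FOLDER = '/data/crude_oil'

    # Download the raw CSVs, then clean and merge them into one table keyed by 'Date'.
    crude_poc(output_path=DATA_FOLDER)
    merged = processCSV(DATA_FOLDER)

    print(merged.head())
    print(f"Merged shape: {merged.shape}")

    # Optional sanity-check plot of the first price column, if one exists.
    price_cols = [col for col in merged.columns if col.endswith('_Close')]
    if price_cols:
        merged.sort_values('Date').plot(x='Date', y=price_cols[0], figsize=(10, 4))
        plt.show()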