import random

import numpy as np
import pandas as pd
import torch
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import Dataset


def _to_float_tensor(values):
    """Return a float32 tensor holding a copy of the array-like *values*."""
    # Going through NumPy avoids relying on torch's handling of pandas
    # objects; torch.tensor() copies, so the result never aliases the frame.
    return torch.tensor(np.asarray(values, dtype=np.float32))


def _split_frame(frame, target, excluded):
    """Split *frame* into (features, target) float tensors.

    Args:
        frame: DataFrame holding both the feature and the target columns.
        target: label of the column used as regression target.
        excluded: column labels removed from the feature matrix (the
            target itself must be listed when it should not be a feature).

    Returns:
        Tuple ``(X, y)`` where ``X`` has one row per sample and ``y`` has
        shape ``(n_samples, 1)``.
    """
    features = frame.drop(list(excluded), axis=1)
    X = _to_float_tensor(features.values)
    y = _to_float_tensor(frame[target]).reshape(-1, 1)
    return X, y


def _load_boston():
    """Download and parse the Boston housing data from the CMU StatLib page."""
    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    # Raw string: "\s" is not a valid escape sequence in a normal literal.
    raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
    # Each record is stored on two consecutive rows of the file: even rows
    # plus the first two values of odd rows are features, the third value
    # of the odd rows is the target.
    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    target = raw_df.values[1::2, 2]
    return _to_float_tensor(data), _to_float_tensor(target).reshape(-1, 1)


def _load_airfoil():
    """Load the airfoil self-noise data from the local ``datasets`` folder."""
    columns_names = [
        "Frequency",
        "Angle of attack",
        "Chord length",
        "Free-stream velocity",
        "Suction side displacement thickness",
        "sound pressure level",
    ]
    airfoil = pd.read_csv(
        'datasets/airfoil_self_noise.dat', sep='\t', names=columns_names
    )
    return _split_frame(
        airfoil, "sound pressure level", ["sound pressure level"]
    )


def _load_energy(target):
    """Load the energy-efficiency data with *target* ("Y1" or "Y2") as label."""
    energy = pd.read_csv('datasets/energy efficiency.csv')
    # Both target columns are always removed from the features.
    return _split_frame(energy, target, ["Y2", "Y1"])


def _load_yacht():
    """Load the yacht hydrodynamics data; column 6 is the target."""
    yacht = pd.read_csv(
        'datasets/yacht_hydrodynamics.data', sep=' ', header=None
    )
    return _split_frame(yacht, 6, [6])


def _load_concrete(target):
    """Load the concrete slump-test data with *target* as label."""
    concrete = pd.read_csv('datasets/slump_test.data', sep=',')
    # The row counter and all three possible targets are never features.
    excluded = [
        "No",
        "SLUMP(cm)",
        "FLOW(cm)",
        "Compressive Strength (28-day)(Mpa)",
    ]
    return _split_frame(concrete, target, excluded)


def _load_x_squared(n_samples=100):
    """Generate a noisy ``y = x**2`` toy problem with x drawn in [0, 1)."""
    inputs = torch.tensor([random.random() for _ in range(n_samples)])
    noise = torch.tensor(
        [np.random.normal(loc=0, scale=0.05) for _ in range(n_samples)]
    )
    targets = inputs ** 2 + noise
    return inputs.view(n_samples, 1), targets.view(n_samples, 1)


def _load_news_popularity():
    """Load the online news popularity data; " shares" is the target."""
    news = pd.read_csv('datasets/OnlineNewsPopularity/OnlineNewsPopularity.csv')
    # Column labels in this CSV carry a leading space.
    return _split_frame(news, " shares", [" shares", "url", " timedelta"])


# Maps every supported dataset name to a zero-argument loader.
_LOADERS = {
    "boston": _load_boston,
    "airfoil": _load_airfoil,
    "energy1": lambda: _load_energy("Y1"),
    "energy2": lambda: _load_energy("Y2"),
    "yacht": _load_yacht,
    "concrete_slump": lambda: _load_concrete("SLUMP(cm)"),
    "concrete_flow": lambda: _load_concrete("FLOW(cm)"),
    "concrete_compressive": lambda: _load_concrete(
        "Compressive Strength (28-day)(Mpa)"
    ),
    "x_squared": _load_x_squared,
    "news_popularity": _load_news_popularity,
}


def import_dataset(name):
    """Load one of the supported regression datasets.

    Args:
        name: one of "boston", "airfoil", "energy1", "energy2", "yacht",
            "concrete_slump", "concrete_flow", "concrete_compressive",
            "x_squared" or "news_popularity".

    Returns:
        Tuple ``(X, y)`` of float tensors; ``y`` has shape ``(n_samples, 1)``.

    Raises:
        ValueError: if *name* is not a supported dataset. (Previously the
            function fell through and returned ``None``, which only failed
            later when the caller tried to unpack the result.)
    """
    try:
        loader = _LOADERS[name]
    except (KeyError, TypeError):
        raise ValueError(
            f"Unknown dataset {name!r}; expected one of {sorted(_LOADERS)}"
        ) from None
    return loader()


def get_dataset(proportion=0.2, dataset="boston", random_state=None):
    """Load a dataset, split it into train/test and min-max scale the features.

    The scaler is fitted on the training features only and then applied to
    the test features, so no test-set statistics leak into training. As a
    consequence training features lie in [0, 1] while test features may
    fall slightly outside that range.

    Args:
        proportion: fraction of the samples placed in the test set.
        dataset: dataset name understood by ``import_dataset``.
        random_state: optional seed forwarded to ``train_test_split`` for a
            reproducible split; ``None`` keeps the split random.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    X, y = import_dataset(dataset)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=proportion, random_state=random_state
    )

    scaler = MinMaxScaler()
    # np.asarray accepts both tensors and arrays, whatever the split returns.
    X_train = torch.Tensor(scaler.fit_transform(np.asarray(X_train)))
    X_test = torch.Tensor(scaler.transform(np.asarray(X_test)))

    print(f"Shape of the training set: {X_train.shape}")
    return X_train, X_test, y_train, y_test


class myData(Dataset):
    """Minimal map-style dataset pairing feature rows with target rows."""

    def __init__(self, x, y):
        """Store the tensors and cache the number of samples.

        Args:
            x: feature tensor, indexed along its first dimension.
            y: target container indexed with the same indices as *x*.
        """
        self.x = x
        self.y = y
        # Despite its name this is the sample count (size of dim 0 of x),
        # not a full shape tuple.
        self.shape = x.size(0)

    def __getitem__(self, index):
        """Return the ``(x, y)`` pair stored at *index*."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Return the number of samples."""
        return self.shape