|
import torch |
|
from torch.utils.data import Dataset |
|
import random |
|
import numpy as np |
|
import pandas as pd |
|
|
|
from sklearn.model_selection import train_test_split |
|
from sklearn.decomposition import PCA |
|
from sklearn.preprocessing import MinMaxScaler |
|
from sklearn import datasets |
|
|
|
|
|
def import_dataset(name):
    """Load a regression dataset by name as torch tensors.

    Parameters
    ----------
    name : str
        One of "boston", "airfoil", "energy1", "energy2", "yacht",
        "concrete_slump", "concrete_flow", "concrete_compressive",
        "x_squared" (synthetic) or "news_popularity".

    Returns
    -------
    (torch.Tensor, torch.Tensor)
        Float feature matrix X of shape (n, d) and target column vector y
        of shape (n, 1).

    Raises
    ------
    ValueError
        If ``name`` does not match any known dataset.
    """

    def _to_tensors(X, y):
        # Shared conversion used by every branch: X -> float matrix,
        # y -> float column vector.
        y_t = torch.Tensor(np.asarray(y, dtype=np.float64)).view(-1, 1).float()
        X_t = torch.Tensor(np.asarray(X, dtype=np.float64)).float()
        return X_t, y_t

    if name == "boston":
        # Reconstruct the Boston housing data from the original CMU archive,
        # where each record spans two physical rows (sklearn removed it).
        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
        X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        y = raw_df.values[1::2, 2]
        return _to_tensors(X, y)

    if name == "airfoil":
        columns_names = ["Frequency", "Angle of attack", "Chord length",
                         "Free-stream velocity",
                         "Suction side displacement thickness",
                         "sound pressure level"]
        airfoil = pd.read_csv('datasets/airfoil_self_noise.dat', sep='\t',
                              names=columns_names)
        y = airfoil["sound pressure level"]
        X = airfoil.drop("sound pressure level", axis=1)
        return _to_tensors(X.values, y)

    if name in ("energy1", "energy2"):
        # Same file, two alternative targets (Y1 for "energy1", Y2 for "energy2");
        # both target columns are always dropped from the features.
        energy = pd.read_csv('datasets/energy efficiency.csv')
        target = "Y1" if name == "energy1" else "Y2"
        y = energy[target]
        X = energy.drop(["Y2", "Y1"], axis=1)
        return _to_tensors(X.values, y)

    if name == "yacht":
        yacht = pd.read_csv('datasets/yacht_hydrodynamics.data', sep=' ',
                            header=None)
        y = yacht[6]
        X = yacht.drop([6], axis=1)
        return _to_tensors(X.values, y)

    if name in ("concrete_slump", "concrete_flow", "concrete_compressive"):
        # One file, three alternative targets; the row index "No" and all
        # three candidate targets are always removed from the features.
        concrete = pd.read_csv('datasets/slump_test.data', sep=',')
        target = {"concrete_slump": "SLUMP(cm)",
                  "concrete_flow": "FLOW(cm)",
                  "concrete_compressive":
                      "Compressive Strength (28-day)(Mpa)"}[name]
        y = concrete[target]
        X = concrete.drop(["No", "SLUMP(cm)", "FLOW(cm)",
                           "Compressive Strength (28-day)(Mpa)"], axis=1)
        return _to_tensors(X.values, y)

    if name == "x_squared":
        # Synthetic benchmark: y = x^2 + N(0, 0.05) noise on 100 uniform
        # samples in [0, 1).
        n_samples = 100
        x = torch.tensor([random.random() for _ in range(n_samples)])
        noise = torch.tensor([np.random.normal(loc=0, scale=0.05)
                              for _ in range(n_samples)])
        X = x.view(-1, 1)
        y = (x ** 2 + noise).view(-1, 1)
        return X, y

    if name == "news_popularity":
        news = pd.read_csv(
            'datasets/OnlineNewsPopularity/OnlineNewsPopularity.csv')
        # NOTE: column names in this CSV carry a leading space.
        y = news[" shares"]
        X = news.drop([" shares", "url", " timedelta"], axis=1)
        return _to_tensors(X.values, y)

    # Previously an unknown name silently returned None; fail loudly instead.
    raise ValueError(f"Unknown dataset name: {name!r}")
|
|
|
def get_dataset(proportion=0.2, dataset="boston", random_state=None):
    """Load a dataset, min-max scale its features and split train/test.

    Parameters
    ----------
    proportion : float
        Fraction of samples held out for the test set.
    dataset : str
        Dataset name understood by ``import_dataset``.
    random_state : int or None
        Optional seed forwarded to ``train_test_split`` so splits are
        reproducible; ``None`` keeps the previous (random) behaviour.

    Returns
    -------
    (X_train, X_test, y_train, y_test) : tuple of torch.Tensor
    """
    scaler = MinMaxScaler()
    X, y = import_dataset(dataset)
    # Scale every feature column to [0, 1]; the targets are left untouched.
    X = torch.Tensor(scaler.fit_transform(X))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=proportion, random_state=random_state)
    print(f"Shape of the training set: {X_train.shape}")
    return X_train, X_test, y_train, y_test
|
|
|
|
|
|
|
class myData(Dataset):
    """Minimal map-style dataset pairing a feature tensor with a target tensor.

    Both tensors must share the same first (sample) dimension; indexing
    returns the matching (feature, target) pair.
    """

    def __init__(self, x, y):
        self.x = x
        self.y = y
        # Sample count along dim 0; attribute name kept for compatibility
        # with existing callers even though "shape" is a misnomer.
        self.shape = x.size(0)

    def __getitem__(self, index):
        features = self.x[index]
        target = self.y[index]
        return features, target

    def __len__(self):
        return self.shape
|
|
|
|
|
|
|
|
|
|