# Import necessary libraries
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv('diabetes_prediction_dataset.csv')  # Ensure this file is uploaded to the root directory

# Define the target column (e.g., 'hypertension') and create binary labels
# Replace 'hypertension' with your actual target column if needed
threshold_value = 0
df['label'] = (df['hypertension'] > threshold_value).astype(int)

# Split the dataset into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Load the tokenizer and model from Hugging Face
model_name = "bert-base-uncased"  # You can replace this with another compatible model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Define a tokenization function
def preprocess_function(examples):
    # Convert each feature to a string and concatenate them
    inputs = [f"{age} {bmi} {hba1c}" for age, bmi, hba1c in zip(examples["age"], examples["bmi"], examples["HbA1c_level"])]
    return tokenizer(inputs, padding="max_length", truncation=True, max_length=32)

# Apply the tokenization function to the datasets
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

# Train and evaluate the model
trainer.train()
trainer.evaluate()
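
# ------------------------------------------------------------------
# Optional (illustrative sketch, not part of the original script):
# the Trainer above only reports the evaluation loss. To also report
# accuracy, a metrics function like the one below could be defined
# and passed to the Trainer via `compute_metrics=compute_metrics`
# before calling trainer.train().
# ------------------------------------------------------------------
import numpy as np

def compute_metrics(eval_pred):
    # EvalPrediction unpacks into (predictions, label_ids)
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float((predictions == labels).mean())}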