# Spaces:
# Runtime error
# Runtime error
# Import necessary libraries
import dataclasses

import pandas as pd
from datasets import Dataset, load_dataset
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
# Load the dataset.
# Make sure you have the correct path to the CSV file.
df = pd.read_csv('diabetes_data.csv')

# Define the target column: rows whose `hypertension` value is strictly above
# the threshold get label 1, every other row gets label 0.
threshold_value = 0  # Set threshold if needed
df['label'] = (df['hypertension'] > threshold_value).astype(int)

# Split the dataset into train and test sets (80/20). The fixed random_state
# keeps the split reproducible between runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# The split shuffles rows, leaving each frame with a non-contiguous index.
# preserve_index=False stops that index from being stored as an extra
# `__index_level_0__` column in the resulting datasets.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Load tokenizer and model. num_labels=2 matches the binary `label` column
# built above.
model_name = "bert-base-uncased"  # Replace with any compatible model from Hugging Face Model Hub
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Tokenization function: joins age, bmi and HbA1c_level into one text per row and tokenizes it
def preprocess_function(examples):
    """Turn a batch of rows into tokenized model inputs.

    Intended for ``Dataset.map(..., batched=True)``, where each column of
    ``examples`` is a sequence holding one value per row. Such a batch yields
    plain Python lists, which have no ``.astype`` method, so the text is
    built row by row instead of with pandas-style vectorised string
    operations.

    Args:
        examples: Mapping with at least the columns ``"age"``, ``"bmi"`` and
            ``"HbA1c_level"``, each an equal-length iterable of values.

    Returns:
        The output of the module-level ``tokenizer`` applied to one
        ``"<age> <bmi> <HbA1c_level>"`` string per row, padded/truncated to
        32 tokens.
    """
    # Concatenate the relevant columns to form one input text per row.
    inputs = [
        f"{age} {bmi} {hba1c}"
        for age, bmi, hba1c in zip(
            examples["age"], examples["bmi"], examples["HbA1c_level"]
        )
    ]
    # Fixed-length padding lets the default collator stack rows into tensors.
    return tokenizer(inputs, padding="max_length", truncation=True, max_length=32)
# Apply tokenization to the datasets (batched, so preprocess_function receives
# many rows per call).
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

# Set up Trainer with training arguments.
# The "evaluate every epoch" option is called `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. Use whichever
# field the installed TrainingArguments dataclass actually defines, so the
# script runs on both.
training_arg_names = {f.name for f in dataclasses.fields(TrainingArguments)}
eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in training_arg_names else "evaluation_strategy"
)
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    **{eval_strategy_key: "epoch"},
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

# Train and evaluate. Keep and print the final metrics instead of discarding
# the return value of evaluate().
trainer.train()
eval_metrics = trainer.evaluate()
print(eval_metrics)