# Import necessary libraries
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Load the dataset
file_path = 'diabetes_prediction_dataset.csv'  # Ensure the dataset file is present in the same directory
df = pd.read_csv(file_path)
# Define the target column and create binary labels
target_column = 'hypertension'  # Replace with your target column name
if target_column not in df.columns:
    raise ValueError(f"Target column '{target_column}' not found in the dataset.")
threshold_value = 0
df['label'] = (df[target_column] > threshold_value).astype(int)
# Ensure necessary feature columns exist
feature_columns = ['age', 'bmi', 'HbA1c_level']  # Replace with your dataset's feature names
for col in feature_columns:
    if col not in df.columns:
        raise ValueError(f"Feature column '{col}' not found in the dataset.")

# Handle missing values (optional: drop or fill)
df = df.dropna(subset=feature_columns + [target_column])
# Split the dataset into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))
# Load the tokenizer and model
model_name = "bert-base-uncased"  # Replace with a suitable model for your task
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
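# Optional: give the two label ids human-readable names so saved checkpoints and
# pipeline outputs are self-describing. The names below are assumptions for this
# binary hypertension target; rename them to match your task.
model.config.id2label = {0: "no_hypertension", 1: "hypertension"}
model.config.label2id = {"no_hypertension": 0, "hypertension": 1}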
# Define a tokenization function that serializes the numeric features into text,
# since BERT expects string inputs rather than raw tabular columns
def preprocess_function(examples):
    # Combine features into a single string representation
    inputs = [
        f"age: {age}, bmi: {bmi}, HbA1c: {hba1c}"
        for age, bmi, hba1c in zip(examples["age"], examples["bmi"], examples["HbA1c_level"])
    ]
    return tokenizer(inputs, padding="max_length", truncation=True, max_length=32)
# Apply the tokenization function
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)
# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # renamed to `eval_strategy` in recent transformers releases
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # requires a compute_metrics function that reports "accuracy"
)
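# metric_for_best_model="accuracy" needs an eval metric with that name, so define a
# small compute_metrics helper and pass it to the Trainer below. This is a minimal
# sketch using plain NumPy; the `evaluate` library's accuracy metric works equally well.
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float((predictions == labels).mean())}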
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,  # Ensure the tokenizer is passed
    compute_metrics=compute_metrics,  # report accuracy so the best checkpoint can be selected
)
# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)
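# Optional: a minimal inference sketch for a single record, assuming the same
# "age: ..., bmi: ..., HbA1c: ..." serialization used in preprocess_function.
# The feature values below are made-up examples, not taken from the dataset.
import torch

sample_text = "age: 54.0, bmi: 27.3, HbA1c: 6.6"
encoded = tokenizer(sample_text, return_tensors="pt", padding="max_length", truncation=True, max_length=32)
encoded = {k: v.to(model.device) for k, v in encoded.items()}
with torch.no_grad():
    logits = model(**encoded).logits
print("Predicted label:", logits.argmax(dim=-1).item())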