# Import necessary libraries
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
# Load the dataset
file_path = 'diabetes_prediction_dataset.csv' # Ensure the dataset file is present in the same directory
df = pd.read_csv(file_path)
# Define the target column and create binary labels
target_column = 'hypertension' # Replace with your target column name
if target_column not in df.columns:
    raise ValueError(f"Target column '{target_column}' not found in the dataset.")
threshold_value = 0
df['label'] = (df[target_column] > threshold_value).astype(int)
# Ensure necessary feature columns exist
feature_columns = ['age', 'bmi', 'HbA1c_level'] # Replace with your dataset's feature names
for col in feature_columns:
    if col not in df.columns:
        raise ValueError(f"Feature column '{col}' not found in the dataset.")
# Handle missing values (optional: drop or fill)
df = df.dropna(subset=feature_columns + [target_column])
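# Optional sanity check (an added illustrative step, not part of the original
# pipeline): report the class balance after cleaning, which helps judge
# whether accuracy is a meaningful metric for this target
print(df["label"].value_counts(normalize=True))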
# Split the dataset into train and test sets, stratifying on the label
# to preserve the class balance in both splits
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])
# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))
# Load the tokenizer and model
model_name = "bert-base-uncased" # Replace with a suitable model for your task
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Define a tokenization function
def preprocess_function(examples):
    # Serialize the numeric features into a single text string per row,
    # since a text model like BERT expects string input
    inputs = [
        f"age: {age}, bmi: {bmi}, HbA1c: {hba1c}"
        for age, bmi, hba1c in zip(examples["age"], examples["bmi"], examples["HbA1c_level"])
    ]
    return tokenizer(inputs, padding="max_length", truncation=True, max_length=32)
# Apply the tokenization function
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)
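# Optional sanity check (an added illustrative step): decode one tokenized
# example to confirm the serialized tabular features look as expected
print(tokenizer.decode(tokenized_train[0]["input_ids"], skip_special_tokens=True))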
# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)
# Define an accuracy metric; without this, metric_for_best_model="accuracy"
# would fail, since the Trainer only reports eval_loss by default
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {"accuracy": float((predictions == labels).mean())}
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,  # lets the Trainer save the tokenizer alongside checkpoints
    compute_metrics=compute_metrics,
)
# Train the model
trainer.train()
# Evaluate the model
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)