Sujatha committed on
Commit 6575b06 · verified · 1 Parent(s): 8ec97a6

Update app.py

Files changed (1)
  1. app.py +39 -13
app.py CHANGED
@@ -5,30 +5,48 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trai
 from sklearn.model_selection import train_test_split
 
 # Load the dataset
-df = pd.read_csv('diabetes_prediction_dataset.csv')  # Ensure this file is uploaded to the root directory
+file_path = 'diabetes_prediction_dataset.csv'  # Ensure the dataset file is present in the same directory
+df = pd.read_csv(file_path)
+
+# Define the target column and create binary labels
+target_column = 'hypertension'  # Replace with your target column name
+if target_column not in df.columns:
+    raise ValueError(f"Target column '{target_column}' not found in the dataset.")
 
-# Define the target column (e.g., 'hypertension') and create binary labels
-# Replace 'hypertension' with your actual target column if needed
 threshold_value = 0
-df['label'] = (df['hypertension'] > threshold_value).astype(int)
+df['label'] = (df[target_column] > threshold_value).astype(int)
+
+# Ensure necessary feature columns exist
+feature_columns = ['age', 'bmi', 'HbA1c_level']  # Replace with your dataset's feature names
+for col in feature_columns:
+    if col not in df.columns:
+        raise ValueError(f"Feature column '{col}' not found in the dataset.")
+
+# Handle missing values (optional: drop or fill)
+df = df.dropna(subset=feature_columns + [target_column])
 
 # Split the dataset into train and test sets
 train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
-train_dataset = Dataset.from_pandas(train_df)
-test_dataset = Dataset.from_pandas(test_df)
 
-# Load the tokenizer and model from Hugging Face
-model_name = "bert-base-uncased"  # You can replace this with another compatible model
+# Convert to Hugging Face Dataset
+train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
+test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))
+
+# Load the tokenizer and model
+model_name = "bert-base-uncased"  # Replace with a suitable model for your task
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
 
 # Define a tokenization function
 def preprocess_function(examples):
-    # Convert each feature to a string and concatenate them
-    inputs = [f"{age} {bmi} {hba1c}" for age, bmi, hba1c in zip(examples["age"], examples["bmi"], examples["HbA1c_level"])]
+    # Combine features into a single string representation
+    inputs = [
+        f"age: {age}, bmi: {bmi}, HbA1c: {hba1c}"
+        for age, bmi, hba1c in zip(examples["age"], examples["bmi"], examples["HbA1c_level"])
+    ]
     return tokenizer(inputs, padding="max_length", truncation=True, max_length=32)
 
-# Apply the tokenization function to the datasets
+# Apply the tokenization function
 tokenized_train = train_dataset.map(preprocess_function, batched=True)
 tokenized_test = test_dataset.map(preprocess_function, batched=True)
 
@@ -36,10 +54,15 @@ tokenized_test = test_dataset.map(preprocess_function, batched=True)
 training_args = TrainingArguments(
     output_dir="./results",
     evaluation_strategy="epoch",
+    save_strategy="epoch",
     per_device_train_batch_size=16,
     per_device_eval_batch_size=16,
     num_train_epochs=3,
     weight_decay=0.01,
+    logging_dir='./logs',
+    logging_steps=10,
+    load_best_model_at_end=True,
+    metric_for_best_model="accuracy",
 )
 
 # Initialize the Trainer
@@ -48,9 +71,12 @@ trainer = Trainer(
     args=training_args,
     train_dataset=tokenized_train,
     eval_dataset=tokenized_test,
+    tokenizer=tokenizer,  # Ensure the tokenizer is passed
 )
 
-# Train and evaluate the model
+# Train the model
 trainer.train()
-trainer.evaluate()
 
+# Evaluate the model
+eval_results = trainer.evaluate()
+print("Evaluation Results:", eval_results)