Sujatha committed on
Commit b62adba · verified · 1 Parent(s): fde29d9

Update app.py

Files changed (1): app.py +14 -14
app.py CHANGED
@@ -1,38 +1,38 @@
  # Import necessary libraries
  import pandas as pd
- from datasets import load_dataset, Dataset
+ from datasets import Dataset
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
  from sklearn.model_selection import train_test_split

  # Load the dataset
- # Make sure you have the correct path to the CSV file
- df = pd.read_csv('diabetes_data.csv')
+ df = pd.read_csv('diabetes_prediction_dataset.csv') # Ensure this file is uploaded to the root directory

- # Define target column and preprocess
- threshold_value = 0 # Set threshold if needed
- df['label'] = (df['hypertension'] > threshold_value).astype(int) # Binary classification based on hypertension
+ # Define the target column (e.g., 'hypertension') and create binary labels
+ # Replace 'hypertension' with your actual target column if needed
+ threshold_value = 0
+ df['label'] = (df['hypertension'] > threshold_value).astype(int)

  # Split the dataset into train and test sets
  train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
  train_dataset = Dataset.from_pandas(train_df)
  test_dataset = Dataset.from_pandas(test_df)

- # Load tokenizer and model
- model_name = "bert-base-uncased" # Replace with any compatible model from Hugging Face Model Hub
+ # Load the tokenizer and model from Hugging Face
+ model_name = "bert-base-uncased" # You can replace this with another compatible model
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

- # Tokenization function
+ # Define a tokenization function
  def preprocess_function(examples):
-     # Concatenate relevant columns to form the input text if needed
+     # Concatenate relevant columns into a single input string for tokenization
      inputs = examples["age"].astype(str) + " " + examples["bmi"].astype(str) + " " + examples["HbA1c_level"].astype(str)
      return tokenizer(inputs, padding="max_length", truncation=True, max_length=32)

- # Apply tokenization to the datasets
+ # Apply the tokenization function to the datasets
  tokenized_train = train_dataset.map(preprocess_function, batched=True)
  tokenized_test = test_dataset.map(preprocess_function, batched=True)

- # Set up Trainer with training arguments
+ # Set up training arguments
  training_args = TrainingArguments(
      output_dir="./results",
      evaluation_strategy="epoch",
@@ -42,7 +42,7 @@ training_args = TrainingArguments(
      weight_decay=0.01,
  )

- # Initialize Trainer
+ # Initialize the Trainer
  trainer = Trainer(
      model=model,
      args=training_args,
@@ -50,7 +50,7 @@ trainer = Trainer(
      eval_dataset=tokenized_test,
  )

- # Train and evaluate
+ # Train and evaluate the model
  trainer.train()
  trainer.evaluate()
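
One caveat about the tokenization step as committed: under the datasets library's default formatting, Dataset.map(batched=True) passes each column of examples as a plain Python list rather than a pandas Series, so examples["age"].astype(str) raises AttributeError. A minimal list-friendly sketch, assuming the same column names and the tokenizer defined above:

def preprocess_function(examples):
    # Under batched=True each column arrives as a Python list, so build one
    # whitespace-joined input string per row with plain str() conversion.
    inputs = [
        f"{age} {bmi} {hba1c}"
        for age, bmi, hba1c in zip(examples["age"], examples["bmi"], examples["HbA1c_level"])
    ]
    return tokenizer(inputs, padding="max_length", truncation=True, max_length=32)

The Trainer then trains against the integer 'label' column created earlier. Note also that recent transformers releases rename the evaluation_strategy argument to eval_strategy, so the pinned transformers version matters for the TrainingArguments call above.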