Spaces:

Sujatha
/

TabularClassification

Runtime error

App Files Files Community

Sujatha committed on Nov 16, 2024

Commit

fde29d9

verified ·

1 Parent(s): 8554933

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -19

app.py CHANGED Viewed

@@ -1,32 +1,38 @@
 # Import necessary libraries
-from datasets import load_dataset
-from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
 import pandas as pd
 from sklearn.model_selection import train_test_split
-# Convert PDF to DataFrame (assuming it's already loaded as df in CSV or DataFrame format)
-df = pd.read_csv('diabetes_prediction_dataset.csv')  # Replace with the path to your CSV
-df['label'] = (df['target_column'] > threshold_value).astype(int)  # Adjust target column for binary classification
-# Split the dataset
-train_df, test_df = train_test_split(df, test_size=0.2)
-train_df.to_csv("train.csv", index=False)
-test_df.to_csv("test.csv", index=False)
-# Load dataset with Hugging Face Datasets
-dataset = load_dataset('csv', data_files={'train': 'train.csv', 'test': 'test.csv'})
 # Load tokenizer and model
-tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
-model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
-# Tokenize the dataset
 def preprocess_function(examples):
-    return tokenizer(examples['text_column'], padding="max_length", truncation=True)
-tokenized_dataset = dataset.map(preprocess_function, batched=True)
-# Set training arguments
 training_args = TrainingArguments(
     output_dir="./results",
     evaluation_strategy="epoch",
@@ -40,8 +46,8 @@ training_args = TrainingArguments(
 trainer = Trainer(
     model=model,
     args=training_args,
-    train_dataset=tokenized_dataset['train'],
-    eval_dataset=tokenized_dataset['test'],
 )
 # Train and evaluate

 # Import necessary libraries
 import pandas as pd
+from datasets import load_dataset, Dataset
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
 from sklearn.model_selection import train_test_split
+# Load the dataset
+# Make sure you have the correct path to the CSV file
+df = pd.read_csv('diabetes_data.csv')
+# Define target column and preprocess
+threshold_value = 0  # Set threshold if needed
+df['label'] = (df['hypertension'] > threshold_value).astype(int)  # Binary classification based on hypertension
+# Split the dataset into train and test sets
+train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
+train_dataset = Dataset.from_pandas(train_df)
+test_dataset = Dataset.from_pandas(test_df)
 # Load tokenizer and model
+model_name = "bert-base-uncased"  # Replace with any compatible model from Hugging Face Model Hub
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
+# Tokenization function
 def preprocess_function(examples):
+    # Concatenate relevant columns to form the input text if needed
+    inputs = examples["age"].astype(str) + " " + examples["bmi"].astype(str) + " " + examples["HbA1c_level"].astype(str)
+    return tokenizer(inputs, padding="max_length", truncation=True, max_length=32)
+# Apply tokenization to the datasets
+tokenized_train = train_dataset.map(preprocess_function, batched=True)
+tokenized_test = test_dataset.map(preprocess_function, batched=True)
+# Set up Trainer with training arguments
 training_args = TrainingArguments(
     output_dir="./results",
     evaluation_strategy="epoch",
 trainer = Trainer(
     model=model,
     args=training_args,
+    train_dataset=tokenized_train,
+    eval_dataset=tokenized_test,
 )
 # Train and evaluate