Sujatha committed on
Commit 6575b06 · verified · 1 Parent(s): 8ec97a6

Update app.py

Files changed (1)
  1. app.py +39 -13
app.py CHANGED
@@ -5,30 +5,48 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trai
 from sklearn.model_selection import train_test_split
 
 # Load the dataset
-df = pd.read_csv('diabetes_prediction_dataset.csv')  # Ensure this file is uploaded to the root directory
+file_path = 'diabetes_prediction_dataset.csv'  # Ensure the dataset file is present in the same directory
+df = pd.read_csv(file_path)
+
+# Define the target column and create binary labels
+target_column = 'hypertension'  # Replace with your target column name
+if target_column not in df.columns:
+    raise ValueError(f"Target column '{target_column}' not found in the dataset.")
 
-# Define the target column (e.g., 'hypertension') and create binary labels
-# Replace 'hypertension' with your actual target column if needed
 threshold_value = 0
-df['label'] = (df['hypertension'] > threshold_value).astype(int)
+df['label'] = (df[target_column] > threshold_value).astype(int)
+
+# Ensure necessary feature columns exist
+feature_columns = ['age', 'bmi', 'HbA1c_level']  # Replace with your dataset's feature names
+for col in feature_columns:
+    if col not in df.columns:
+        raise ValueError(f"Feature column '{col}' not found in the dataset.")
+
+# Handle missing values (optional: drop or fill)
+df = df.dropna(subset=feature_columns + [target_column])
 
 # Split the dataset into train and test sets
 train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
-train_dataset = Dataset.from_pandas(train_df)
-test_dataset = Dataset.from_pandas(test_df)
 
-# Load the tokenizer and model from Hugging Face
-model_name = "bert-base-uncased"  # You can replace this with another compatible model
+# Convert to Hugging Face Dataset
+train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
+test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))
+
+# Load the tokenizer and model
+model_name = "bert-base-uncased"  # Replace with a suitable model for your task
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
 
 # Define a tokenization function
 def preprocess_function(examples):
-    # Convert each feature to a string and concatenate them
-    inputs = [f"{age} {bmi} {hba1c}" for age, bmi, hba1c in zip(examples["age"], examples["bmi"], examples["HbA1c_level"])]
+    # Combine features into a single string representation
+    inputs = [
+        f"age: {age}, bmi: {bmi}, HbA1c: {hba1c}"
+        for age, bmi, hba1c in zip(examples["age"], examples["bmi"], examples["HbA1c_level"])
+    ]
     return tokenizer(inputs, padding="max_length", truncation=True, max_length=32)
 
-# Apply the tokenization function to the datasets
+# Apply the tokenization function
 tokenized_train = train_dataset.map(preprocess_function, batched=True)
 tokenized_test = test_dataset.map(preprocess_function, batched=True)
 
@@ -36,10 +54,15 @@ tokenized_test = test_dataset.map(preprocess_function, batched=True)
 training_args = TrainingArguments(
     output_dir="./results",
     evaluation_strategy="epoch",
+    save_strategy="epoch",
     per_device_train_batch_size=16,
     per_device_eval_batch_size=16,
     num_train_epochs=3,
     weight_decay=0.01,
+    logging_dir='./logs',
+    logging_steps=10,
+    load_best_model_at_end=True,
+    metric_for_best_model="accuracy",
 )
 
 # Initialize the Trainer
@@ -48,9 +71,12 @@ trainer = Trainer(
     args=training_args,
     train_dataset=tokenized_train,
     eval_dataset=tokenized_test,
+    tokenizer=tokenizer,  # Ensure the tokenizer is passed
 )
 
-# Train and evaluate the model
+# Train the model
 trainer.train()
-trainer.evaluate()
 
+# Evaluate the model
+eval_results = trainer.evaluate()
+print("Evaluation Results:", eval_results)