Sujatha committed on
Commit b62adba · verified · 1 Parent(s): fde29d9

Update app.py

Files changed (1): app.py +14 -14
app.py CHANGED
@@ -1,38 +1,38 @@
  # Import necessary libraries
  import pandas as pd
- from datasets import load_dataset, Dataset
+ from datasets import Dataset
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
  from sklearn.model_selection import train_test_split

  # Load the dataset
- # Make sure you have the correct path to the CSV file
- df = pd.read_csv('diabetes_data.csv')
+ df = pd.read_csv('diabetes_prediction_dataset.csv') # Ensure this file is uploaded to the root directory

- # Define target column and preprocess
- threshold_value = 0 # Set threshold if needed
- df['label'] = (df['hypertension'] > threshold_value).astype(int) # Binary classification based on hypertension
+ # Define the target column (e.g., 'hypertension') and create binary labels
+ # Replace 'hypertension' with your actual target column if needed
+ threshold_value = 0
+ df['label'] = (df['hypertension'] > threshold_value).astype(int)

  # Split the dataset into train and test sets
  train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
  train_dataset = Dataset.from_pandas(train_df)
  test_dataset = Dataset.from_pandas(test_df)

- # Load tokenizer and model
- model_name = "bert-base-uncased" # Replace with any compatible model from Hugging Face Model Hub
+ # Load the tokenizer and model from Hugging Face
+ model_name = "bert-base-uncased" # You can replace this with another compatible model
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

- # Tokenization function
+ # Define a tokenization function
  def preprocess_function(examples):
-     # Concatenate relevant columns to form the input text if needed
+     # Concatenate relevant columns into a single input string for tokenization
      inputs = examples["age"].astype(str) + " " + examples["bmi"].astype(str) + " " + examples["HbA1c_level"].astype(str)
      return tokenizer(inputs, padding="max_length", truncation=True, max_length=32)

- # Apply tokenization to the datasets
+ # Apply the tokenization function to the datasets
  tokenized_train = train_dataset.map(preprocess_function, batched=True)
  tokenized_test = test_dataset.map(preprocess_function, batched=True)

- # Set up Trainer with training arguments
+ # Set up training arguments
  training_args = TrainingArguments(
      output_dir="./results",
      evaluation_strategy="epoch",
@@ -42,7 +42,7 @@ training_args = TrainingArguments(
      weight_decay=0.01,
  )

- # Initialize Trainer
+ # Initialize the Trainer
  trainer = Trainer(
      model=model,
      args=training_args,
@@ -50,7 +50,7 @@ trainer = Trainer(
      eval_dataset=tokenized_test,
  )

- # Train and evaluate
+ # Train and evaluate the model
  trainer.train()
  trainer.evaluate()
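
One caveat about the tokenization step as committed: under the datasets library's default formatting, Dataset.map(batched=True) passes each column of examples as a plain Python list rather than a pandas Series, so examples["age"].astype(str) raises AttributeError. A minimal list-friendly sketch, assuming the same column names and the tokenizer defined above:

def preprocess_function(examples):
    # Under batched=True each column arrives as a Python list, so build one
    # whitespace-joined input string per row with plain str() conversion.
    inputs = [
        f"{age} {bmi} {hba1c}"
        for age, bmi, hba1c in zip(examples["age"], examples["bmi"], examples["HbA1c_level"])
    ]
    return tokenizer(inputs, padding="max_length", truncation=True, max_length=32)

The Trainer then trains against the integer 'label' column created earlier. Note also that recent transformers releases rename the evaluation_strategy argument to eval_strategy, so the pinned transformers version matters for the TrainingArguments call above.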