Sujatha committed on
Commit
fde29d9
·
verified ·
1 Parent(s): 8554933

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -19
app.py CHANGED
@@ -1,32 +1,38 @@
1
  # Import necessary libraries
2
- from datasets import load_dataset
3
- from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
4
  import pandas as pd
 
 
5
  from sklearn.model_selection import train_test_split
6
 
7
- # Convert PDF to DataFrame (assuming it's already loaded as df in CSV or DataFrame format)
8
- df = pd.read_csv('diabetes_prediction_dataset.csv') # Replace with the path to your CSV
9
- df['label'] = (df['target_column'] > threshold_value).astype(int) # Adjust target column for binary classification
10
 
11
- # Split the dataset
12
- train_df, test_df = train_test_split(df, test_size=0.2)
13
- train_df.to_csv("train.csv", index=False)
14
- test_df.to_csv("test.csv", index=False)
15
 
16
- # Load dataset with Hugging Face Datasets
17
- dataset = load_dataset('csv', data_files={'train': 'train.csv', 'test': 'test.csv'})
 
 
18
 
19
  # Load tokenizer and model
20
- tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
21
- model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
 
22
 
23
- # Tokenize the dataset
24
  def preprocess_function(examples):
25
- return tokenizer(examples['text_column'], padding="max_length", truncation=True)
 
 
26
 
27
- tokenized_dataset = dataset.map(preprocess_function, batched=True)
 
 
28
 
29
- # Set training arguments
30
  training_args = TrainingArguments(
31
  output_dir="./results",
32
  evaluation_strategy="epoch",
@@ -40,8 +46,8 @@ training_args = TrainingArguments(
40
  trainer = Trainer(
41
  model=model,
42
  args=training_args,
43
- train_dataset=tokenized_dataset['train'],
44
- eval_dataset=tokenized_dataset['test'],
45
  )
46
 
47
  # Train and evaluate
 
1
  # Import necessary libraries
 
 
2
  import pandas as pd
3
+ from datasets import load_dataset, Dataset
4
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
5
  from sklearn.model_selection import train_test_split
6
 
7
+ # Load the dataset
8
+ # Make sure you have the correct path to the CSV file
9
+ df = pd.read_csv('diabetes_data.csv')
10
 
11
+ # Define target column and preprocess
12
+ threshold_value = 0 # Set threshold if needed
13
+ df['label'] = (df['hypertension'] > threshold_value).astype(int) # Binary classification based on hypertension
 
14
 
15
+ # Split the dataset into train and test sets
16
+ train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
17
+ train_dataset = Dataset.from_pandas(train_df)
18
+ test_dataset = Dataset.from_pandas(test_df)
19
 
20
  # Load tokenizer and model
21
+ model_name = "bert-base-uncased" # Replace with any compatible model from Hugging Face Model Hub
22
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
23
+ model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
24
 
25
+ # Tokenization function
26
  def preprocess_function(examples):
27
+ # Concatenate relevant columns to form the input text if needed
28
+ inputs = examples["age"].astype(str) + " " + examples["bmi"].astype(str) + " " + examples["HbA1c_level"].astype(str)
29
+ return tokenizer(inputs, padding="max_length", truncation=True, max_length=32)
30
 
31
+ # Apply tokenization to the datasets
32
+ tokenized_train = train_dataset.map(preprocess_function, batched=True)
33
+ tokenized_test = test_dataset.map(preprocess_function, batched=True)
34
 
35
+ # Set up Trainer with training arguments
36
  training_args = TrainingArguments(
37
  output_dir="./results",
38
  evaluation_strategy="epoch",
 
46
  trainer = Trainer(
47
  model=model,
48
  args=training_args,
49
+ train_dataset=tokenized_train,
50
+ eval_dataset=tokenized_test,
51
  )
52
 
53
  # Train and evaluate