jaynopponep committed
Commit fcb22a6 · 1 Parent(s): 8768724

Fixing model...

Files changed (1)
  1. model.py +35 -11
model.py CHANGED
@@ -3,21 +3,27 @@ import torch
 from sklearn.model_selection import train_test_split
 from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
 
-df = pd.read_csv('Training_Essay_Data 1.csv.csv')
+# Read the dataset
+df = pd.read_csv('Training_Essay_Data.csv') # Make sure the file name is correct
 
-train_df, eval_df = train_test_split(df, test_size=0.1) # Here 10% for validation
+# Splitting the dataset
+train_df, eval_df = train_test_split(df, test_size=0.1)
 
-tokenizer = BertTokenizer.from_pretrained('bert-baseuncased')
+# Tokenizer
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
 
 
+# Tokenize function
 def tokenize_function(examples):
-    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512)
+    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
 
 
+# Tokenize the dataset
 train_encodings = tokenize_function(train_df)
 eval_encodings = tokenize_function(eval_df)
 
 
+# Essay dataset class
 class EssayDataset(torch.utils.data.Dataset):
     def __init__(self, encodings, labels):
         self.encodings = encodings
@@ -25,18 +31,21 @@ class EssayDataset(torch.utils.data.Dataset):
 
     def __getitem__(self, idx):
         item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
-        item['labels'] = torch.tensor(int(self.labels[idx])) # Convert labels to tensor
+        item['labels'] = torch.tensor(int(self.labels[idx]))
         return item
 
     def __len__(self):
         return len(self.labels)
 
 
+# Dataset preparation
 train_dataset = EssayDataset(train_encodings, train_df['label'].tolist())
 eval_dataset = EssayDataset(eval_encodings, eval_df['label'].tolist())
 
-model = BertForSequenceClassification.from_pretrained('bertbase-uncased', num_labels=2)
+# Model
+model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
 
+# Training arguments
 training_args = TrainingArguments(
     output_dir='./results',
     num_train_epochs=3,
@@ -45,8 +54,10 @@ training_args = TrainingArguments(
     warmup_steps=500,
     weight_decay=0.01,
     logging_dir='./logs',
+    evaluation_strategy="epoch"
 )
 
+# Trainer
 trainer = Trainer(
     model=model,
     args=training_args,
@@ -54,11 +65,24 @@ trainer = Trainer(
     eval_dataset=eval_dataset
 )
 
+# Train the model
 trainer.train()
 
+# Save the model
+model.save_pretrained("./saved_model")
+
+# Load the model for prediction
+model = BertForSequenceClassification.from_pretrained("./saved_model")
+
+
+# Predicting
+def predict(text):
+    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
+    outputs = model(**inputs)
+    predictions = torch.argmax(outputs.logits, dim=-1)
+    return "AI-generated" if predictions.item() == 1 else "Human-written"
+
+
+# Get user input and predict
 user_input = input("Enter the text you want to classify: ")
-inputs = tokenizer(user_input, padding=True, truncation=True,
-                   return_tensors="pt")
-outputs = model(**inputs)
-predictions = torch.argmax(outputs.logits, dim=-1)
-print("Classified as:", "AI-generated" if predictions.item() == 1 else "Human-written")
+print("Classified as:", predict(user_input))