"""Fine-tune BERT to label essays as human-written or AI-generated.

The script reads ``Training_Essay_Data.csv`` (uses its ``text`` and ``label``
columns), fine-tunes ``bert-base-uncased`` as a two-class classifier, saves the
model to ``./saved_model``, reloads it, and classifies one text typed by the
user.
"""

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer,
)

# Maximum number of tokens per essay fed to the model (used for training and
# for prediction so both see identically truncated input).
MAX_LENGTH = 512

# Read the dataset
df = pd.read_csv('Training_Essay_Data.csv')  # Make sure the file name is correct

# Rows without text or label cannot be tokenized / converted to an int label,
# so drop them before splitting instead of failing mid-training.
df = df.dropna(subset=["text", "label"])

# Splitting the dataset (fixed seed so the train/eval split is reproducible)
train_df, eval_df = train_test_split(df, test_size=0.1, random_state=42)

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


# Tokenize function
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of *examples* to fixed-length encodings.

    Args:
        examples: A mapping with a ``"text"`` entry, e.g. a pandas DataFrame
            or a dict holding a string / list of strings.

    Returns:
        The tokenizer output, padded and truncated to ``MAX_LENGTH`` tokens.
    """
    texts = examples["text"]
    # The tokenizer only accepts str or list/tuple of str; a pandas Series
    # (or NumPy array) must be converted to a plain list first.
    if hasattr(texts, "tolist"):
        texts = texts.tolist()
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )


# Tokenize the dataset
train_encodings = tokenize_function(train_df)
eval_encodings = tokenize_function(eval_df)


# Essay dataset class
class EssayDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with integer labels."""

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the parallel sequence of labels."""
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the tensors for sample *idx*, with its label under ``'labels'``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)


# Dataset preparation
train_dataset = EssayDataset(train_encodings, train_df['label'].tolist())
eval_dataset = EssayDataset(eval_encodings, eval_df['label'].tolist())

# Model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Training arguments
# NOTE(review): recent transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` -- confirm against the installed
# version before upgrading.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch",
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Train the model
trainer.train()

# Save the model
model.save_pretrained("./saved_model")

# Load the model for prediction
model = BertForSequenceClassification.from_pretrained("./saved_model")
# Inference only from here on: make sure dropout is disabled.
model.eval()


# Predicting
def predict(text):
    """Classify *text* with the reloaded model.

    Args:
        text: The essay text to classify.

    Returns:
        ``"AI-generated"`` when the highest-scoring class is 1, otherwise
        ``"Human-written"``.
    """
    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="pt",
    )
    # No gradients are needed for prediction; skipping them saves memory.
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)
    return "AI-generated" if predictions.item() == 1 else "Human-written"


# Get user input and predict
user_input = input("Enter the text you want to classify: ")
print("Classified as:", predict(user_input))