"""Fine-tune ``bert-base-uncased`` to classify the sentiment of review text.

The script reads a preprocessed CSV that must contain a ``clean_text`` column
(the review text) and a ``sentiment`` column (the class label), fine-tunes a
BERT sequence classifier on an 80/20 train/validation split, reports the
validation loss and accuracy, and saves the fine-tuned model.
"""

import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification

DATA_PATH = "preprocessed_reviews.csv"
PRETRAINED_MODEL = "bert-base-uncased"
OUTPUT_DIR = "fine_tuned_bert_model"
MAX_LENGTH = 128


def load_data(file_path="preprocessed_reviews.csv"):
    """Load the preprocessed reviews from a CSV file.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A ``pandas.DataFrame`` with the file's contents.
    """
    # Honour the argument; the path used to be hard-coded here, so callers
    # passing a different file silently got the default one.
    return pd.read_csv(file_path)


def tokenize_text(tokenizer, texts, max_length):
    """Tokenize a collection of texts with a BERT tokenizer.

    Args:
        tokenizer: A callable Hugging Face tokenizer (e.g. ``BertTokenizer``).
        texts: The texts to encode; a ``pandas.Series`` or any iterable of
            strings.
        max_length: Maximum number of tokens per text; longer texts are
            truncated.

    Returns:
        A plain ``dict`` mapping each encoding name produced by the tokenizer
        to a NumPy array.
    """
    text_list = texts.tolist() if hasattr(texts, "tolist") else list(texts)
    encodings = tokenizer(
        text_list,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="tf",
    )
    # Keras ``fit``/``evaluate`` take a dict of arrays, so unwrap the
    # BatchEncoding into one.
    return {key: value.numpy() for key, value in encodings.items()}


def _encode_labels(labels, categories):
    """Map *labels* to integer codes using one shared category ordering.

    Passing the same ``categories`` for every split guarantees that a given
    label always receives the same code, even when a split does not contain
    every class.
    """
    return pd.Categorical(labels, categories=categories).codes.astype("int32")


def main():
    """Run the full pipeline: load, split, tokenize, fine-tune, evaluate, save.

    Raises:
        ValueError: If the ``sentiment`` or ``clean_text`` column is missing
            from the loaded data.
    """
    data = load_data(DATA_PATH)

    if "sentiment" not in data.columns:
        raise ValueError("The 'sentiment' column is not found in the DataFrame.")
    if "clean_text" not in data.columns:
        raise ValueError("The 'clean_text' column is not found in the DataFrame.")

    # An empty cleaned review is read back from CSV as NaN, which the
    # tokenizer rejects; a missing label cannot be trained on. Drop both.
    data = data.dropna(subset=["clean_text", "sentiment"])

    # Fix the label -> code mapping on the full data set *before* splitting,
    # so train and validation codes always refer to the same classes.
    categories = data["sentiment"].astype("category").cat.categories
    num_labels = len(categories)

    train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

    tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)
    train_inputs = tokenize_text(
        tokenizer, train_data["clean_text"].astype(str), MAX_LENGTH
    )
    val_inputs = tokenize_text(
        tokenizer, val_data["clean_text"].astype(str), MAX_LENGTH
    )

    train_labels = _encode_labels(train_data["sentiment"], categories)
    val_labels = _encode_labels(val_data["sentiment"], categories)

    model = TFBertForSequenceClassification.from_pretrained(
        PRETRAINED_MODEL, num_labels=num_labels
    )
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
        # The classification head outputs raw logits, not probabilities.
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=["accuracy"],
    )

    model.fit(
        train_inputs,
        train_labels,
        validation_data=(val_inputs, val_labels),
        epochs=3,
        batch_size=32,
        verbose=1,
    )

    val_loss, val_accuracy = model.evaluate(val_inputs, val_labels)
    print(f"Validation loss: {val_loss}, Validation accuracy: {val_accuracy}")

    model.save_pretrained(OUTPUT_DIR)


if __name__ == "__main__":
    main()