import re

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import TextVectorization

nltk.download('stopwords')
nltk.download('wordnet')


def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize (split text into tokens)
    tokens = text.split()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    # Lemmatize each token
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Re-join tokens into a single string
    return ' '.join(tokens)


# Read the data
df = pd.read_csv("hatespeech/hatespeech.csv")

# Apply preprocessing to each tweet
df["tweet"] = df["tweet"].apply(preprocess_text)

x = df["tweet"]
y = df["class"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))

# Define the encoder and fit its vocabulary on the training text only
VOCAB_SIZE = 5000
encoder = TextVectorization(max_tokens=VOCAB_SIZE)
encoder.adapt(train_dataset.map(lambda text, label: text))

# Batch the datasets
batch_size = 32
train_dataset = train_dataset.batch(batch_size)
test_dataset = test_dataset.batch(batch_size)

# Build the model with increased complexity
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(input_dim=len(encoder.get_vocabulary()),
                              output_dim=128,
                              mask_zero=True),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(128, return_sequences=True, dropout=0.5, recurrent_dropout=0.5)),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, dropout=0.5, recurrent_dropout=0.5)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(3, activation='softmax')
])

# Learning rate schedule
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-4, decay_steps=1000, decay_rate=0.9)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

# Compile the model (the final layer is softmax, so the loss takes probabilities, not logits)
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              optimizer=optimizer,
              metrics=['accuracy'])

# Compute class weights to counter class imbalance
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.array([0, 1, 2]),
                                     y=y_train)
class_weight_dict = {0: class_weights[0], 1: class_weights[1], 2: class_weights[2]}

# Define a model checkpoint callback that keeps the best model by validation loss
checkpoint = ModelCheckpoint('hatespeech/best_model', monitor='val_loss',
                             verbose=1, save_best_only=True, mode='min')

# Train the model with class weights and checkpointing
history = model.fit(train_dataset,
                    epochs=10,
                    validation_data=test_dataset,
                    validation_steps=30,
                    class_weight=class_weight_dict,
                    callbacks=[checkpoint])

# Load the best model saved by the checkpoint callback
model = tf.keras.models.load_model('hatespeech/best_model')

# Evaluate the model
test_loss, test_acc = model.evaluate(test_dataset)

# Output the results
print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)
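
# --- Optional: quick inference sanity check ---
# A minimal sketch of how the trained model could be applied to raw text.
# The sample strings below are placeholders (not from the dataset), and the
# mapping of class ids 0/1/2 to label names is not defined in this script,
# so only the numeric class is printed.
sample_texts = ["example tweet to classify", "another example tweet"]
sample_texts = [preprocess_text(t) for t in sample_texts]
probs = model.predict(tf.constant(sample_texts))
predicted_classes = probs.argmax(axis=-1)
for text, label in zip(sample_texts, predicted_classes):
    print(f"{text!r} -> class {label}")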