import pandas as pd
import tensorflow as tf
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.callbacks import ModelCheckpoint
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
def preprocess_text(text):
    # Lowercasing the text
    text = text.lower()
    # Removing special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenization (convert text into tokens)
    tokens = text.split()
    # Removing stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Re-join tokens
    text = ' '.join(tokens)
    return text
# Read the data
df = pd.read_csv("hatespeech/hatespeech.csv")
# Apply preprocessing to each tweet
df["tweet"] = df["tweet"].apply(preprocess_text)
x = df["tweet"]
y = df['class']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
# Define encoder
VOCAB_SIZE = 5000
encoder = TextVectorization(max_tokens=VOCAB_SIZE)
encoder.adapt(train_dataset.map(lambda text, label: text))
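# adapt() builds the vocabulary from the training split only, so the vectorizer
# never sees tokens that occur solely in the test data.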
# Batch the datasets
batch_size = 32
train_dataset = train_dataset.batch(batch_size)
test_dataset = test_dataset.batch(batch_size)
# Build the model with increased complexity
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(input_dim=len(encoder.get_vocabulary()), output_dim=128, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True, dropout=0.5, recurrent_dropout=0.5)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, dropout=0.5, recurrent_dropout=0.5)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(3, activation='softmax')
])
# Learning rate schedule
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-4,
    decay_steps=1000,
    decay_rate=0.9)
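# The learning rate starts at 1e-4 and decays smoothly, falling by a factor of 0.9
# for every 1000 optimizer steps.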
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
# Compile the model
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              optimizer=optimizer,
              metrics=['accuracy'])
# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=[0, 1, 2], y=y_train)
class_weight_dict = {0: class_weights[0], 1: class_weights[1], 2: class_weights[2]}
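# The 'balanced' weights are inversely proportional to each class's frequency in
# y_train, so under-represented classes contribute more to the loss.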
# Define a model checkpoint callback
checkpoint = ModelCheckpoint('hatespeech/best_model', monitor='val_loss', verbose=1,
                             save_best_only=True, mode='min')
# Train the model with class weights and checkpoint
history = model.fit(train_dataset, epochs=10, validation_data=test_dataset, validation_steps=30,
                    class_weight=class_weight_dict, callbacks=[checkpoint])
# Load the best model
model = tf.keras.models.load_model('hatespeech/best_model')
# Evaluate the model
test_loss, test_acc = model.evaluate(test_dataset)
# Output the results
print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)
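# A minimal inference sketch (not part of the original training run): it reuses
# preprocess_text so that new tweets receive the same cleaning as the training data,
# and relies on the TextVectorization layer being saved inside the model so raw
# strings can be passed to predict(). The sample tweet below is a placeholder.
sample_tweets = ["some example tweet to classify"]
cleaned = [preprocess_text(t) for t in sample_tweets]
probs = model.predict(tf.constant(cleaned))
predicted_classes = probs.argmax(axis=-1)
print('Predicted class indices:', predicted_classes)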