import numpy as np
import pandas as pd
import tensorflow as tf
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.callbacks import ModelCheckpoint
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
# Build the stopword set and lemmatizer once, rather than on every call
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize (split text into tokens)
    tokens = text.split()
    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]
    # Lemmatize each token
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Re-join tokens into a single string
    return ' '.join(tokens)
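
# Quick sanity check of the preprocessing (the sample tweet below is made up):
# preprocess_text("RT @user: This is SO annoying!!! 123")
# -> 'rt user annoying'  (lowercased, symbols/digits stripped, stopwords dropped)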
# Read the data
df = pd.read_csv("hatespeech/hatespeech.csv")

# Apply preprocessing to each tweet
df["tweet"] = df["tweet"].apply(preprocess_text)

x = df["tweet"]
y = df["class"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
# Define the text encoder and build its vocabulary from the training text
VOCAB_SIZE = 5000
encoder = TextVectorization(max_tokens=VOCAB_SIZE)
encoder.adapt(train_dataset.map(lambda text, label: text))
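
# Optional: peek at the learned vocabulary; index 0 is the padding token and
# index 1 is the out-of-vocabulary token '[UNK]', followed by tokens in
# descending frequency.
# print(encoder.get_vocabulary()[:10])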
# Batch the datasets
batch_size = 32
train_dataset = train_dataset.batch(batch_size)
test_dataset = test_dataset.batch(batch_size)
# Build the model with increased complexity
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(input_dim=len(encoder.get_vocabulary()), output_dim=128, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True, dropout=0.5, recurrent_dropout=0.5)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, dropout=0.5, recurrent_dropout=0.5)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(3, activation='softmax')
])
# Learning rate schedule
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-4,
    decay_steps=1000,
    decay_rate=0.9)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
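
# With staircase=False (the default), the decayed rate at step s is
# initial_learning_rate * decay_rate ** (s / decay_steps),
# e.g. 1e-4 * 0.9 ** 1 = 9e-5 after 1000 training steps.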
# Compile the model
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              optimizer=optimizer,
              metrics=['accuracy'])
# Compute class weights to counter class imbalance
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(enumerate(class_weights))
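
# 'balanced' weighting assigns n_samples / (n_classes * count_c) to class c,
# so under-represented classes get proportionally larger weights. With
# hypothetical counts {0: 100, 1: 800, 2: 100} this yields roughly
# {0: 3.33, 1: 0.42, 2: 3.33}.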
# Define a model checkpoint callback that keeps the model with the lowest
# validation loss (note: under Keras 3 the filepath must end in '.keras';
# the TF 2.x SavedModel directory format is used here)
checkpoint = ModelCheckpoint('hatespeech/best_model', monitor='val_loss', verbose=1,
                             save_best_only=True, mode='min')
# Train the model with class weights and the checkpoint callback
history = model.fit(train_dataset, epochs=10, validation_data=test_dataset, validation_steps=30,
                    class_weight=class_weight_dict, callbacks=[checkpoint])
# Load the best model saved during training
model = tf.keras.models.load_model('hatespeech/best_model')

# Evaluate the model on the full test set
test_loss, test_acc = model.evaluate(test_dataset)

# Output the results
print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)
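
# Example inference on a new, made-up tweet. The TextVectorization layer is part
# of the model, so it accepts raw strings; we only apply the same text cleaning
# used for training (a usage sketch, not part of the original script):
sample = preprocess_text("I can't believe how rude some people are online")
probs = model.predict(tf.constant([sample]))
print('Predicted class:', probs.argmax(axis=-1)[0])  # 0, 1 or 2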