import pandas as pd
import tensorflow as tf
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.callbacks import ModelCheckpoint
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
def preprocess_text(text):
    # Lowercasing the text
    text = text.lower()
    # Removing special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenization (convert text into tokens)
    tokens = text.split()
    # Removing stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Re-join tokens
    text = ' '.join(tokens)
    return text
# Read the data
df = pd.read_csv("hatespeech/hatespeech.csv")
# Apply preprocessing to each tweet
df["tweet"] = df["tweet"].apply(preprocess_text)
x = df["tweet"]
y = df['class']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
# Define encoder
VOCAB_SIZE = 5000
encoder = TextVectorization(max_tokens=VOCAB_SIZE)
encoder.adapt(train_dataset.map(lambda text, label: text))
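# adapt() builds the vocabulary from the training split only, so the vectorizer
# never sees tokens that occur solely in the test data.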
# Batch the datasets
batch_size = 32
train_dataset = train_dataset.batch(batch_size)
test_dataset = test_dataset.batch(batch_size)
# Build the model with increased complexity
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(input_dim=len(encoder.get_vocabulary()), output_dim=128, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True, dropout=0.5, recurrent_dropout=0.5)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, dropout=0.5, recurrent_dropout=0.5)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(3, activation='softmax')
])
# Learning rate schedule
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-4,
    decay_steps=1000,
    decay_rate=0.9)
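# The learning rate starts at 1e-4 and decays smoothly, falling by a factor of 0.9
# for every 1000 optimizer steps.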
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
# Compile the model
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              optimizer=optimizer,
              metrics=['accuracy'])
# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=[0, 1, 2], y=y_train)
class_weight_dict = {0: class_weights[0], 1: class_weights[1], 2: class_weights[2]}
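# The 'balanced' weights are inversely proportional to each class's frequency in
# y_train, so under-represented classes contribute more to the loss.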
# Define a model checkpoint callback
checkpoint = ModelCheckpoint('hatespeech/best_model', monitor='val_loss', verbose=1,
                             save_best_only=True, mode='min')
# Train the model with class weights and checkpoint
history = model.fit(train_dataset, epochs=10, validation_data=test_dataset, validation_steps=30,
                    class_weight=class_weight_dict, callbacks=[checkpoint])
# Load the best model
model = tf.keras.models.load_model('hatespeech/best_model')
# Evaluate the model
test_loss, test_acc = model.evaluate(test_dataset)
# Output the results
print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)
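# A minimal inference sketch (not part of the original training run): it reuses
# preprocess_text so that new tweets receive the same cleaning as the training data,
# and relies on the TextVectorization layer being saved inside the model so raw
# strings can be passed to predict(). The sample tweet below is a placeholder.
sample_tweets = ["some example tweet to classify"]
cleaned = [preprocess_text(t) for t in sample_tweets]
probs = model.predict(tf.constant(cleaned))
predicted_classes = probs.argmax(axis=-1)
print('Predicted class indices:', predicted_classes)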