In [1]:
import warnings
warnings.filterwarnings('ignore')

import transformers
transformers_version = transformers.__version__

if transformers_version > '4.31.1':
 !pip uninstall transformers
 !pip install transformers==4.31
else:
 print("transformers version:", transformers.__version__)

Found existing installation: transformers 4.31.0
Uninstalling transformers-4.31.0:
 Would remove:
 /usr/local/bin/transformers-cli
 /usr/local/lib/python3.10/dist-packages/transformers-4.31.0.dist-info/*
 /usr/local/lib/python3.10/dist-packages/transformers/*
Proceed (Y/n)? n
[0m

In [2]:
import tensorflow as tf
import keras

# Report the versions of the deep-learning stack used in this notebook.
for banner, library in (("TensorFlow version:", tf), ("Keras version:", keras)):
    print(banner, library.__version__)

TensorFlow version: 2.15.0
Keras version: 2.15.0


In [3]:
from google.colab import drive

# Mount Google Drive; the dataset paths used below live under this directory.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


### Loading the Data ###

In [None]:
import pandas as pd

# Locations of the train / test CSV files on the mounted drive
train_path = '/content/drive/MyDrive/dataset/Twitter_Financial_News_Sentiment/train.csv'
test_path = '/content/drive/MyDrive/dataset/Twitter_Financial_News_Sentiment/test.csv'

# Read only the two columns the notebook uses from each file
wanted_columns = ['text', 'label']
train_df, test_df = (
    pd.read_csv(csv_path, usecols=wanted_columns)
    for csv_path in (train_path, test_path)
)

Show a few example rows

In [None]:
# Preview the first five rows of the training set
train_df.head(n=5)

In [None]:
# matplotlib's pyplot interface
from matplotlib import pyplot as plt

# Histogram of the "label" column in the training dataset,
# with the top and right frame lines hidden
label_axes = train_df['label'].plot(kind='hist', title='Label')
label_axes.spines[['top', 'right']].set_visible(False)

In [None]:
# Preview the first five rows of the test set
test_df.head(n=5)

In [None]:
# Print the shape (rows, columns) of both datasets
print(
    f'train_df shape: {train_df.shape}',
    f'test_df shape: {test_df.shape}',
    sep='\n',
)

### Removing the Special Characters ###

In [None]:

!pip install text_hammer

import text_hammer as th

def text_proccessing(df, col_name):
    """
    Clean the text stored in one column of a DataFrame.

    Every value of the column is passed, in this order, through:

    1. th.remove_emails
    2. th.remove_accented_chars
    3. th.remove_urls

    The text is NOT lower-cased here.
    NOTE(review): the 'bert-base-uncased' tokenizer used later in this
    notebook presumably lower-cases its input itself - confirm.

    Parameters:
    df (DataFrame): Input DataFrame containing text data.
    col_name (str): Name of the column in the DataFrame containing text data.

    Returns:
    DataFrame: A copy of ``df`` whose ``col_name`` column holds the cleaned
    text. The input DataFrame itself is left unmodified, so re-running the
    calling cell never cleans already-displayed data behind the reader's back.
    """

    def _clean(text):
        # Same three text_hammer steps, applied in a single pass per value.
        return th.remove_urls(th.remove_accented_chars(th.remove_emails(text)))

    cleaned_df = df.copy()
    cleaned_df[col_name] = cleaned_df[col_name].apply(_clean)

    return cleaned_df

# Clean BOTH splits: the test tweets are tokenized and scored later, so they
# must go through the same preprocessing as the tweets the model is trained on.
train_df = text_proccessing(train_df, 'text')
test_df = text_proccessing(test_df, 'text')


In [None]:
# Show the first ten tweets after cleaning
train_df['text'].head(10)

### Loading the Pretrained BERT Model ###

In [None]:
from transformers import AutoTokenizer, TFBertModel

# Tokenizer and encoder are loaded from the same pretrained checkpoint
checkpoint_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
bert = TFBertModel.from_pretrained(checkpoint_name)


In [None]:
# Tokenize a single tweet to inspect what the tokenizer returns
first_tweet = train_df['text'].iloc[0]
tokenizer(first_tweet)

In [None]:
# Length of the longest tweet, measured in whitespace-separated words
# (this is a word count, not a count of tokenizer tokens)
words_per_tweet = map(lambda tweet: len(tweet.split()), train_df.text)
max_len = max(words_per_tweet)
print(f'Max len of tweets: {max_len}')

In [None]:
# Tokenize the training tweets into fixed-length TensorFlow tensors.
# padding='max_length' pads every row to exactly max_length tokens, whereas
# padding=True pads only up to the longest sequence in the batch. The model
# below declares inputs of shape (36,), so the width must always be 36.
x_train = tokenizer(
    text=train_df.text.tolist(),
    padding='max_length',
    max_length=36,
    truncation=True,  # tweets longer than max_length tokens are cut off
    return_tensors='tf')

print(x_train)

In [None]:
# Shapes of the two tensors that are fed to the model
for field in ('input_ids', 'attention_mask'):
    print(x_train[field].shape)

In [None]:
# Number of training samples per label
label_counts = train_df['label'].value_counts()
print(label_counts)

In [None]:
# Training targets: the label column as an array
y_train = train_df['label'].values
y_train


### Building the Model Architecture ###

In [None]:
from keras import layers, Model

# Sequence length of the model inputs; the tokenizer calls in this notebook
# use max_length=36 as well.
max_length = 36

input_ids = layers.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
input_mask = layers.Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")

# Output index 1 of the BERT call is the pooler output
# (index 0 would be the last hidden states).
embeddings = bert(input_ids, attention_mask=input_mask)[1]

# Classification head on top of the pooled embedding
out = layers.Dropout(0.1)(embeddings)
out = layers.Dense(128, activation='relu')(out)
out = layers.Dropout(0.1)(out)
out = layers.Dense(32, activation='relu')(out)

# Three-way softmax: one probability per label
y = layers.Dense(3, activation='softmax')(out)

model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)

# Freeze the BERT encoder by reference rather than through a positional
# index into model.layers, which would silently freeze a different layer if
# the layer order ever changed. Only the head above is trained.
bert.trainable = False

In [None]:
# Display the summary of the model built above
model.summary()

In [None]:
from keras.optimizers import Adam

# Optimizer hyper-parameters. Per the original author's note, the learning
# rate is the one for the BERT model, taken from the Hugging Face website.
adam_settings = dict(
    learning_rate=6e-06,
    epsilon=1e-08,
    weight_decay=0.01,
)
optimizer = Adam(**adam_settings)

# Compile the model: integer labels go with the sparse loss and metric
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=optimizer,
    metrics=["sparse_categorical_accuracy"],
)

In [None]:
# Inputs are keyed by the names of the model's Input layers
train_inputs = {field: x_train[field] for field in ('input_ids', 'attention_mask')}

# Train for 3 epochs, passing validation_split=0.1 to Keras
train_history = model.fit(
    x=train_inputs,
    y=y_train,
    validation_split=0.1,
    epochs=3,
    batch_size=32,
)

Epoch 1/3

#### Testing Phase
In this phase we use the trained model to make predictions on the test set.

In [None]:
# Tokenize the test tweets into fixed-length TensorFlow tensors.
# padding='max_length' guarantees a width of exactly 36 tokens, matching the
# model's (36,) inputs. padding=True would pad only to the longest test
# tweet, which can be shorter than 36 and then break model.predict.
x_test = tokenizer(
    text=test_df.text.tolist(),
    padding='max_length',
    max_length=36,
    truncation=True,  # tweets longer than max_length tokens are cut off
    return_tensors='tf')

In [None]:
# Test targets: the label column as an array
y_test = test_df['label'].values
y_test

In [None]:
# Run the model on the tokenized test tweets
test_inputs = {field: x_test[field] for field in ('input_ids', 'attention_mask')}
predicted = model.predict(test_inputs)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# `predicted` comes from a 3-unit softmax layer, so each row holds one
# probability per label. The predicted label is the index of the largest
# probability. Rounding only the first column (the previous approach) can
# produce 0 or 1 but never 2, and does not pick the most likely class.
y_pred_labels = predicted.argmax(axis=1)

# Legacy name kept so that any later cell referring to it still works.
y_pred_binary = y_pred_labels

# Generate the confusion matrix (rows: true labels, columns: predicted labels)
cm = confusion_matrix(test_df['label'], y_pred_labels)

# Create a heatmap of the confusion matrix
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()