{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  },
  "language_info": {
   "name": "python"
  },
  "accelerator": "TPU"
 },
 "cells": [
  {
   "cell_type": "code",
   "source": [
    "import re\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "import transformers\n",
    "\n",
    "\n",
    "def _version_tuple(version_string):\n",
    "    \"\"\"Return the first three numeric components of a version string as a tuple of ints.\"\"\"\n",
    "    return tuple(int(part) for part in re.findall(r'\\d+', version_string)[:3])\n",
    "\n",
    "\n",
    "transformers_version = transformers.__version__\n",
    "\n",
    "# Compare numerically: as plain strings, '4.4.0' > '4.31.1' is True even though\n",
    "# 4.4.0 is the older release.\n",
    "if _version_tuple(transformers_version) > (4, 31, 1):\n",
    "    # Installing the pinned release replaces the newer one, so the interactive\n",
    "    # `pip uninstall` prompt (which blocks the cell waiting for input) is not needed.\n",
    "    %pip install -q transformers==4.31\n",
    "    print(\"Restart the runtime so that the pinned transformers release is imported.\")\n",
    "else:\n",
    "    print(\"transformers version:\", transformers.__version__)"
   ],
   "metadata": {
    "id": "2RcFPIqQJ6CY",
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "outputId": "8030dedf-b9f5-4687-ef87-1c5a4d8ee9b9"
   },
   "execution_count": 1,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Found existing installation: transformers 4.31.0\n",
      "Uninstalling transformers-4.31.0:\n",
      " Would remove:\n",
      " /usr/local/bin/transformers-cli\n",
      " /usr/local/lib/python3.10/dist-packages/transformers-4.31.0.dist-info/*\n",
      " /usr/local/lib/python3.10/dist-packages/transformers/*\n",
      "Proceed (Y/n)? n\n",
      "\u001b[33mWARNING: Ignoring invalid distribution -ransformers (/usr/local/lib/python3.10/dist-packages)\u001b[0m\u001b[33m\n",
      "\u001b[0mRequirement already satisfied: transformers==4.31 in /usr/local/lib/python3.10/dist-packages (4.31.0)\n",
      "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (3.13.4)\n",
      "Requirement already satisfied: huggingface-hub<1.0,>=0.14.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (0.20.3)\n",
      "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (1.25.2)\n",
      "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (24.0)\n",
      "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (6.0.1)\n",
      "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (2023.12.25)\n",
      "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (2.31.0)\n",
      "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (0.13.3)\n",
      "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (0.4.3)\n",
      "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (4.66.2)\n",
      "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers==4.31) (2023.6.0)\n",
      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers==4.31) (4.11.0)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.31) (3.3.2)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.31) (3.7)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.31) (2.0.7)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.31) (2024.2.2)\n",
      "\u001b[33mWARNING: Ignoring invalid distribution -ransformers (/usr/local/lib/python3.10/dist-packages)\u001b[0m\u001b[33m\n",
      "\u001b[0m"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "import tensorflow as tf\n",
    "print(\"TensorFlow version:\", tf.__version__)\n",
    "\n",
    "import keras\n",
    "print(\"Keras version:\", keras.__version__)"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "b_0OPx3WukSi",
    "outputId": "0d205aa3-33b4-4a34-9055-d670cc5ac049"
   },
   "execution_count": 2,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "TensorFlow version: 2.15.0\n",
      "Keras version: 2.15.0\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "id": "WkzyTQGqzbPS",
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "outputId": "9bc0c671-8557-4b3c-a120-0237d7f96253"
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Mounted at /content/drive\n"
     ]
    }
   ],
   "source": [
    "from google.colab import drive\n",
    "drive.mount('/content/drive')"
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Loading the Data ###"
   ],
   "metadata": {
    "id": "BKn5EaROLKeX"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Load the CSV file in memory\n",
    "train_path = '/content/drive/MyDrive/dataset/Twitter_Financial_News_Sentiment/train.csv'\n",
    "test_path = '/content/drive/MyDrive/dataset/Twitter_Financial_News_Sentiment/test.csv'\n",
    "\n",
    "train_df = pd.read_csv(train_path, usecols=['text', 'label'])\n",
    "test_df = pd.read_csv(test_path, usecols=['text', 'label'])"
   ],
   "metadata": {
    "id": "QztIz9VOKLuV"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "Show example"
   ],
   "metadata": {
    "id": "hn5ONAwkNeFS"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "train_df.head()"
   ],
   "metadata": {
    "id": "zwYzU-dANpJ-"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "source": [
    "# Import the matplotlib plotting interface\n",
    "from matplotlib import pyplot as plt\n",
    "\n",
    "# Histogram of the \"label\" column in the training dataset\n",
    "train_df['label'].plot(kind='hist', title='Label')\n",
    "plt.gca().spines[['top', 'right']].set_visible(False)"
   ],
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "metadata": {
    "id": "2M1XLsAeN2GN"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "test_df.head()"
   ],
   "metadata": {
    "id": "g5_oGvo1NvON"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# Print the shape of the datasets\n",
    "print(f'train_df shape: {train_df.shape}')\n",
    "print(f'test_df shape: {test_df.shape}')"
   ],
   "metadata": {
    "id": "kCFupI1FQlMF"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Removing the Special Characters ###"
   ],
   "metadata": {
    "id": "zRcmc15aSNx6"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "\n",
    "!pip install text_hammer\n",
    "\n",
    "import text_hammer as th\n",
    "\n",
    "def text_proccessing(df, col_name):\n",
    "    \"\"\"\n",
    "    Clean the text stored in one column of a DataFrame.\n",
    "\n",
    "    Every value of the column goes through these steps, in this order:\n",
    "\n",
    "    1. Remove emails from the text.\n",
    "    2. Remove accented characters from the text.\n",
    "    3. Remove URLs from the text.\n",
    "\n",
    "    Lowercasing is not applied here.\n",
    "\n",
    "    Parameters:\n",
    "    df (DataFrame): Input DataFrame containing text data. The column is overwritten\n",
    "        in place, so pass a copy if the original text must be kept.\n",
    "    col_name (str): Name of the column in the DataFrame containing text data.\n",
    "\n",
    "    Returns:\n",
    "    DataFrame: The same DataFrame object, with the cleaned text stored in `col_name`.\n",
    "    \"\"\"\n",
    "\n",
    "    def _clean(text):\n",
    "        # One pass per value instead of three passes over the whole column.\n",
    "        text = th.remove_emails(text)\n",
    "        text = th.remove_accented_chars(text)\n",
    "        return th.remove_urls(text)\n",
    "\n",
    "    df[col_name] = df[col_name].apply(_clean)\n",
    "\n",
    "    return df\n",
    "\n",
    "train_df = text_proccessing(train_df, 'text')\n"
   ],
   "metadata": {
    "id": "YEMq7SUiS28e"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# Print the first ten samples after cleaning the data\n",
    "train_df['text'].iloc[0:10]"
   ],
   "metadata": {
    "id": "VD92IEhPZQHm"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Loading PreTrained BERT Model ###"
   ],
   "metadata": {
    "id": "YfH0H1W6c0Bb"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "from transformers import AutoTokenizer, TFBertModel\n",
    "tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n",
    "bert = TFBertModel.from_pretrained('bert-base-uncased')\n"
   ],
   "metadata": {
    "id": "ejMMzCOecze9"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "tokenizer(train_df['text'].iloc[0])"
   ],
   "metadata": {
    "id": "PVWkIfE5gLOV"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# Length is measured in whitespace-separated words, not in tokenizer tokens.\n",
    "max_len = max([len(x.split()) for x in train_df.text])\n",
    "print(f'Max len of tweets: {max_len}')"
   ],
   "metadata": {
    "id": "dGANUQVdhHH7"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# Pad every sequence to exactly max_length so the tensors always match the\n",
    "# fixed (36,) input shape declared for the model, whatever the longest tweet is.\n",
    "x_train = tokenizer(\n",
    "    text = train_df.text.tolist(),\n",
    "    padding = 'max_length',\n",
    "    max_length= 36,\n",
    "    truncation= True,\n",
    "    return_tensors = 'tf')\n",
    "\n",
    "print(x_train)"
   ],
   "metadata": {
    "id": "q9b4iDZ0jW5-"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "print(x_train['input_ids'].shape)\n",
    "print(x_train['attention_mask'].shape)"
   ],
   "metadata": {
    "id": "PUMeXfO8lgNd"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "print(train_df.label.value_counts())"
   ],
   "metadata": {
    "id": "RMM1QI3DlpmD"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "y_train = train_df.label.values\n",
    "y_train\n"
   ],
   "metadata": {
    "id": "4zFkagLml80z"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Building the Model Architecture ###"
   ],
   "metadata": {
    "id": "fFQNe5Cimwxn"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "from keras import layers, Model\n",
    "\n",
    "max_length = 36\n",
    "\n",
    "input_ids = layers.Input(shape=(max_length,), dtype=tf.int32, name=\"input_ids\")\n",
    "input_mask = layers.Input(shape=(max_length,), dtype=tf.int32, name=\"attention_mask\")\n",
    "\n",
    "embeddings = bert(input_ids,attention_mask = input_mask)[1] #(0 is the last hidden states,1 means pooler_output)\n",
    "\n",
    "out = layers.Dropout(0.1)(embeddings)\n",
    "out = layers.Dense(128, activation='relu')(out)\n",
    "out = layers.Dropout(0.1)(out)\n",
    "out = layers.Dense(32,activation = 'relu')(out)\n",
    "\n",
    "y = layers.Dense(3,activation = 'softmax')(out)\n",
    "\n",
    "model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)\n",
    "\n",
    "# Freeze the pre-trained BERT layer so that only the layers added above are trained.\n",
    "# The layer object is referenced directly instead of through its position in\n",
    "# model.layers, which would silently point elsewhere if the architecture changed.\n",
    "bert.trainable = False"
   ],
   "metadata": {
    "id": "DE1XbnVomwMc"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "model.summary()"
   ],
   "metadata": {
    "id": "GuxGCjYjrTyY"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "from keras.optimizers import Adam\n",
    "\n",
    "optimizer = Adam(\n",
    "    learning_rate = 6e-06, # this learning rate is for bert model , taken from huggingface website\n",
    "    epsilon=1e-08,\n",
    "    weight_decay=0.01)\n",
    "\n",
    "# Compile the model\n",
    "model.compile(\n",
    "    optimizer = optimizer,\n",
    "    loss = 'sparse_categorical_crossentropy',\n",
    "    metrics = [\"sparse_categorical_accuracy\"])"
   ],
   "metadata": {
    "id": "FyyNrAAf7QMP"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "train_history = model.fit(\n",
    "    x = {'input_ids':x_train['input_ids'], 'attention_mask':x_train['attention_mask']} ,\n",
    "    y = y_train,\n",
    "    validation_split = 0.1,\n",
    "    epochs= 3,\n",
    "    batch_size= 32)"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "bEnttT2rA8Yw",
    "outputId": "644c03fd-0cc0-40ff-8108-e059e3a4a0dd"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Epoch 1/3\n",
      "118/269 [============>.................] - ETA: 10:10 - loss: 0.9140 - sparse_categorical_accuracy: 0.6261"
     ]
    }
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "#### TESTING PHASE\n",
    "In this phase we use the trained model to make predictions on the test set."
   ],
   "metadata": {
    "id": "hgiDVRwSBtCN"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# Apply the same cleaning that was applied to the training text. A copy is cleaned\n",
    "# so that test_df itself stays untouched and this cell can be re-run safely.\n",
    "test_clean_df = text_proccessing(test_df.copy(), 'text')\n",
    "\n",
    "# Pad to exactly max_length so the tensors match the model's fixed (36,) input shape.\n",
    "x_test = tokenizer(\n",
    "    text = test_clean_df.text.tolist(),\n",
    "    padding= 'max_length',\n",
    "    max_length= 36,\n",
    "    truncation = True,\n",
    "    return_tensors= 'tf')"
   ],
   "metadata": {
    "id": "xaKYd2PRBySe"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "y_test = test_df.label.values\n",
    "y_test"
   ],
   "metadata": {
    "id": "OpvHTg3atflb"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "predicted = model.predict({'input_ids':x_test['input_ids'],'attention_mask':x_test['attention_mask']})"
   ],
   "metadata": {
    "id": "nWgCdpKvCSWm"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "from sklearn.metrics import confusion_matrix\n",
    "import seaborn as sns\n",
    "\n",
    "# The model ends in a 3-unit softmax, so each row of `predicted` holds one\n",
    "# probability per class; the predicted label is the index of the largest one.\n",
    "# (Rounding only the first column would collapse the three classes into 0/1.)\n",
    "y_pred = predicted.argmax(axis=1)\n",
    "\n",
    "# Generate the confusion matrix\n",
    "cm = confusion_matrix(test_df['label'], y_pred)\n",
    "\n",
    "# Create a heatmap of the confusion matrix\n",
    "sns.heatmap(cm, annot=True, fmt=\"d\", cmap=\"Blues\")\n",
    "plt.xlabel(\"Predicted Label\")\n",
    "plt.ylabel(\"True Label\")\n",
    "plt.title(\"Confusion Matrix\")\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "-BICUoNs_8qI"
   },
   "execution_count": null,
   "outputs": []
  }
 ]
}