Hossein-NK committed on
Commit
b08551c
1 Parent(s): 950d5e7

Upload Tweet_Financial_News_Classification.ipynb

Tweet_Financial_News_Classification.ipynb ADDED
@@ -0,0 +1,549 @@
+ {
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ },
+ "accelerator": "TPU"
+ },
+ "cells": [
+ {
+ "cell_type": "code",
+ "source": [
+ "import warnings\n",
+ "warnings.filterwarnings('ignore')\n",
+ "\n",
+ "import transformers\n",
+ "from packaging import version\n",
+ "\n",
+ "transformers_version = transformers.__version__\n",
+ "\n",
+ "# Compare versions numerically; plain string comparison misorders e.g. '4.100' vs '4.31'\n",
+ "if version.parse(transformers_version) > version.parse('4.31.1'):\n",
+ "    !pip uninstall -y transformers\n",
+ "    !pip install transformers==4.31\n",
+ "else:\n",
+ "    print(\"transformers version:\", transformers.__version__)"
+ ],
+ "metadata": {
+ "id": "2RcFPIqQJ6CY",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "8030dedf-b9f5-4687-ef87-1c5a4d8ee9b9"
+ },
+ "execution_count": 1,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Found existing installation: transformers 4.31.0\n",
+ "Uninstalling transformers-4.31.0:\n",
+ " Would remove:\n",
+ " /usr/local/bin/transformers-cli\n",
+ " /usr/local/lib/python3.10/dist-packages/transformers-4.31.0.dist-info/*\n",
+ " /usr/local/lib/python3.10/dist-packages/transformers/*\n",
+ "Proceed (Y/n)? n\n",
+ "\u001b[33mWARNING: Ignoring invalid distribution -ransformers (/usr/local/lib/python3.10/dist-packages)\u001b[0m\u001b[33m\n",
+ "\u001b[0mRequirement already satisfied: transformers==4.31 in /usr/local/lib/python3.10/dist-packages (4.31.0)\n",
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (3.13.4)\n",
+ "Requirement already satisfied: huggingface-hub<1.0,>=0.14.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (0.20.3)\n",
+ "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (1.25.2)\n",
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (24.0)\n",
+ "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (6.0.1)\n",
+ "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (2023.12.25)\n",
+ "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (2.31.0)\n",
+ "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (0.13.3)\n",
+ "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (0.4.3)\n",
+ "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (4.66.2)\n",
+ "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers==4.31) (2023.6.0)\n",
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers==4.31) (4.11.0)\n",
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.31) (3.3.2)\n",
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.31) (3.7)\n",
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.31) (2.0.7)\n",
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.31) (2024.2.2)\n",
+ "\u001b[33mWARNING: Ignoring invalid distribution -ransformers (/usr/local/lib/python3.10/dist-packages)\u001b[0m\u001b[33m\n",
+ "\u001b[0m"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import tensorflow as tf\n",
+ "print(\"TensorFlow version:\", tf.__version__)\n",
+ "\n",
+ "import keras\n",
+ "print(\"Keras version:\", keras.__version__)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "b_0OPx3WukSi",
+ "outputId": "0d205aa3-33b4-4a34-9055-d670cc5ac049"
+ },
+ "execution_count": 2,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "TensorFlow version: 2.15.0\n",
+ "Keras version: 2.15.0\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "id": "WkzyTQGqzbPS",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "9bc0c671-8557-4b3c-a120-0237d7f96253"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Mounted at /content/drive\n"
+ ]
+ }
+ ],
+ "source": [
+ "from google.colab import drive\n",
+ "drive.mount('/content/drive')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Loading the Data ###"
+ ],
+ "metadata": {
+ "id": "BKn5EaROLKeX"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "# Load the CSV files into memory\n",
+ "train_path = '/content/drive/MyDrive/dataset/Twitter_Financial_News_Sentiment/train.csv'\n",
+ "test_path = '/content/drive/MyDrive/dataset/Twitter_Financial_News_Sentiment/test.csv'\n",
+ "\n",
+ "train_df = pd.read_csv(train_path, usecols=['text', 'label'])\n",
+ "test_df = pd.read_csv(test_path, usecols=['text', 'label'])"
+ ],
+ "metadata": {
+ "id": "QztIz9VOKLuV"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Show a few training examples"
+ ],
+ "metadata": {
+ "id": "hn5ONAwkNeFS"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "train_df.head()"
+ ],
+ "metadata": {
+ "id": "zwYzU-dANpJ-"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Import the matplotlib library\n",
+ "from matplotlib import pyplot as plt\n",
+ "\n",
+ "# Histogram of the \"label\" column in the training dataset\n",
+ "train_df['label'].plot(kind='hist', title='Label')\n",
+ "plt.gca().spines[['top', 'right']].set_visible(False)"
+ ],
+ "metadata": {
+ "id": "2M1XLsAeN2GN"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "test_df.head()"
+ ],
+ "metadata": {
+ "id": "g5_oGvo1NvON"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Print the shape of the datasets\n",
+ "print(f'train_df shape: {train_df.shape}')\n",
+ "print(f'test_df shape: {test_df.shape}')"
+ ],
+ "metadata": {
+ "id": "kCFupI1FQlMF"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Removing the Special Characters ###"
+ ],
+ "metadata": {
+ "id": "zRcmc15aSNx6"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!pip install text_hammer\n",
+ "\n",
+ "import text_hammer as th\n",
+ "\n",
+ "def text_processing(df, col_name):\n",
+ "    \"\"\"\n",
+ "    Process text data in a DataFrame column by performing the following operations:\n",
+ "\n",
+ "    1. Remove emails from the text.\n",
+ "    2. Remove accented characters from the text.\n",
+ "    3. Remove URLs from the text.\n",
+ "\n",
+ "    Parameters:\n",
+ "    df (DataFrame): Input DataFrame containing text data.\n",
+ "    col_name (str): Name of the column in the DataFrame containing text data.\n",
+ "\n",
+ "    Returns:\n",
+ "    DataFrame: Processed DataFrame with text data after applying the specified operations.\n",
+ "    \"\"\"\n",
+ "\n",
+ "    # Lowercasing is left out: the uncased BERT tokenizer lowercases its input anyway\n",
+ "    # df[col_name] = df[col_name].apply(lambda x: str(x).lower())\n",
+ "    df[col_name] = df[col_name].apply(lambda x: th.remove_emails(x))\n",
+ "    df[col_name] = df[col_name].apply(lambda x: th.remove_accented_chars(x))\n",
+ "    df[col_name] = df[col_name].apply(lambda x: th.remove_urls(x))\n",
+ "\n",
+ "    return df\n",
+ "\n",
+ "train_df = text_processing(train_df, 'text')\n",
+ "# Apply the same cleaning to the test set\n",
+ "test_df = text_processing(test_df, 'text')\n"
+ ],
+ "metadata": {
+ "id": "YEMq7SUiS28e"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Print the first ten samples after cleaning the data\n",
+ "train_df['text'].iloc[0:10]"
+ ],
+ "metadata": {
+ "id": "VD92IEhPZQHm"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Loading the Pre-Trained BERT Model ###"
+ ],
+ "metadata": {
+ "id": "YfH0H1W6c0Bb"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from transformers import AutoTokenizer, TFBertModel\n",
+ "tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n",
+ "bert = TFBertModel.from_pretrained('bert-base-uncased')\n"
+ ],
+ "metadata": {
+ "id": "ejMMzCOecze9"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Tokenize a single tweet to inspect the tokenizer output\n",
+ "tokenizer(train_df['text'].iloc[0])"
+ ],
+ "metadata": {
+ "id": "PVWkIfE5gLOV"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Longest tweet in the training set, counted in whitespace-separated words\n",
+ "max_len = max([len(x.split()) for x in train_df.text])\n",
+ "print(f'Max length of tweets: {max_len} words')"
+ ],
+ "metadata": {
+ "id": "dGANUQVdhHH7"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
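+ {
+ "cell_type": "markdown",
+ "source": [
+ "A quick sanity check for the `max_length=36` used when tokenizing below (a minimal sketch, assuming `tokenizer` and `train_df` from the cells above): if nearly all tweets fit within 36 BERT tokens, truncation loses very little text."
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import numpy as np\n",
+ "\n",
+ "# Number of BERT tokens per tweet, special tokens ([CLS], [SEP]) included\n",
+ "token_lens = [len(tokenizer.encode(t)) for t in train_df.text]\n",
+ "\n",
+ "# If these percentiles are at or below 36, the chosen max_length is safe\n",
+ "print('95th percentile:', np.percentile(token_lens, 95))\n",
+ "print('99th percentile:', np.percentile(token_lens, 99))"
+ ],
+ "metadata": {},
+ "execution_count": null,
+ "outputs": []
+ },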
+ {
+ "cell_type": "code",
+ "source": [
+ "x_train = tokenizer(\n",
+ "    text=train_df.text.tolist(),\n",
+ "    padding=True,\n",
+ "    max_length=36,\n",
+ "    truncation=True,\n",
+ "    return_tensors='tf')\n",
+ "\n",
+ "print(x_train)"
+ ],
+ "metadata": {
+ "id": "q9b4iDZ0jW5-"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(x_train['input_ids'].shape)\n",
+ "print(x_train['attention_mask'].shape)"
+ ],
+ "metadata": {
+ "id": "PUMeXfO8lgNd"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(train_df.label.value_counts())"
+ ],
+ "metadata": {
+ "id": "RMM1QI3DlpmD"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "y_train = train_df.label.values\n",
+ "y_train\n"
+ ],
+ "metadata": {
+ "id": "4zFkagLml80z"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Building the Model Architecture ###"
+ ],
+ "metadata": {
+ "id": "fFQNe5Cimwxn"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from keras import layers\n",
+ "\n",
+ "max_length = 36\n",
+ "\n",
+ "input_ids = layers.Input(shape=(max_length,), dtype=tf.int32, name=\"input_ids\")\n",
+ "input_mask = layers.Input(shape=(max_length,), dtype=tf.int32, name=\"attention_mask\")\n",
+ "\n",
+ "# Index 1 of the BERT output is the pooler_output; index 0 is the last hidden state\n",
+ "embeddings = bert(input_ids, attention_mask=input_mask)[1]\n",
+ "\n",
+ "out = layers.Dropout(0.1)(embeddings)\n",
+ "out = layers.Dense(128, activation='relu')(out)\n",
+ "out = layers.Dropout(0.1)(out)\n",
+ "out = layers.Dense(32, activation='relu')(out)\n",
+ "\n",
+ "# Three-way softmax over the sentiment classes\n",
+ "y = layers.Dense(3, activation='softmax')(out)\n",
+ "\n",
+ "model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)\n",
+ "\n",
+ "# Freeze the pre-trained BERT layer so only the classification head is trained\n",
+ "model.layers[2].trainable = False"
+ ],
+ "metadata": {
+ "id": "DE1XbnVomwMc"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "model.summary()"
+ ],
+ "metadata": {
+ "id": "GuxGCjYjrTyY"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from keras.optimizers import Adam\n",
+ "\n",
+ "optimizer = Adam(\n",
+ "    learning_rate=6e-06,  # learning rate recommended for BERT, taken from the Hugging Face website\n",
+ "    epsilon=1e-08,\n",
+ "    weight_decay=0.01)\n",
+ "\n",
+ "# Compile the model\n",
+ "model.compile(\n",
+ "    optimizer=optimizer,\n",
+ "    loss='sparse_categorical_crossentropy',\n",
+ "    metrics=[\"sparse_categorical_accuracy\"])"
+ ],
+ "metadata": {
+ "id": "FyyNrAAf7QMP"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "train_history = model.fit(\n",
+ "    x={'input_ids': x_train['input_ids'], 'attention_mask': x_train['attention_mask']},\n",
+ "    y=y_train,\n",
+ "    validation_split=0.1,\n",
+ "    epochs=3,\n",
+ "    batch_size=32)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "bEnttT2rA8Yw",
+ "outputId": "644c03fd-0cc0-40ff-8108-e059e3a4a0dd"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Epoch 1/3\n",
+ "118/269 [============>.................] - ETA: 10:10 - loss: 0.9140 - sparse_categorical_accuracy: 0.6261"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "#### TESTING PHASE\n",
+ "In this phase we make predictions on the test set with the trained model."
+ ],
+ "metadata": {
+ "id": "hgiDVRwSBtCN"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Tokenize the test set the same way as the training set\n",
+ "x_test = tokenizer(\n",
+ "    text=test_df.text.tolist(),\n",
+ "    padding=True,\n",
+ "    max_length=36,\n",
+ "    truncation=True,\n",
+ "    return_tensors='tf')"
+ ],
+ "metadata": {
+ "id": "xaKYd2PRBySe"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "y_test = test_df.label.values\n",
+ "y_test"
+ ],
+ "metadata": {
+ "id": "OpvHTg3atflb"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Predict class probabilities for the test set\n",
+ "predicted = model.predict({'input_ids': x_test['input_ids'], 'attention_mask': x_test['attention_mask']})"
+ ],
+ "metadata": {
+ "id": "nWgCdpKvCSWm"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.metrics import confusion_matrix\n",
+ "import seaborn as sns\n",
+ "import numpy as np\n",
+ "\n",
+ "# Convert the softmax probabilities to predicted class labels\n",
+ "# (the model has three classes, so rounding a single probability would be wrong)\n",
+ "y_pred = np.argmax(predicted, axis=1)\n",
+ "\n",
+ "# Generate the confusion matrix\n",
+ "cm = confusion_matrix(test_df['label'], y_pred)\n",
+ "\n",
+ "# Create a heatmap of the confusion matrix\n",
+ "sns.heatmap(cm, annot=True, fmt=\"d\", cmap=\"Blues\")\n",
+ "plt.xlabel(\"Predicted Label\")\n",
+ "plt.ylabel(\"True Label\")\n",
+ "plt.title(\"Confusion Matrix\")\n",
+ "plt.show()"
+ ],
+ "metadata": {
+ "id": "-BICUoNs_8qI"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
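+ {
+ "cell_type": "markdown",
+ "source": [
+ "As a final check, a minimal evaluation sketch: assuming `predicted` and `test_df` from the cells above, it reports overall accuracy and per-class precision/recall/F1 via scikit-learn's `classification_report`."
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.metrics import accuracy_score, classification_report\n",
+ "import numpy as np\n",
+ "\n",
+ "# Collapse softmax probabilities to class labels, as in the confusion-matrix cell\n",
+ "y_pred = np.argmax(predicted, axis=1)\n",
+ "\n",
+ "# Overall accuracy plus per-class precision, recall and F1\n",
+ "print('Accuracy:', accuracy_score(test_df['label'], y_pred))\n",
+ "print(classification_report(test_df['label'], y_pred))"
+ ],
+ "metadata": {},
+ "execution_count": null,
+ "outputs": []
+ }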
+ ]
+ }