initial commit

Browse files

Files changed (9) hide show

Hugging_Face_Bert_Malay_Sentiment.ipynb +892 -0
README.md +19 -0
archive/model-20211015/config.json +34 -0
archive/model-20211015/tf_model.h5 +3 -0
config.json +34 -0
special_tokens_map.json +1 -0
tf_model.h5 +3 -0
tokenizer_config.json +1 -0
vocab.txt +0 -0

Hugging_Face_Bert_Malay_Sentiment.ipynb ADDED Viewed

	@@ -0,0 +1,892 @@

+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": 1,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "633fetsKg5cv",
+        "outputId": "379a3769-9478-4749-cc71-bbf46e6478f9"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Collecting transformers\n",
+            "  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)\n",
+            "\u001b[K     |████████████████████████████████| 2.9 MB 5.2 MB/s \n",
+            "\u001b[?25hCollecting pyyaml>=5.1\n",
+            "  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)\n",
+            "\u001b[K     |████████████████████████████████| 596 kB 37.4 MB/s \n",
+            "\u001b[?25hCollecting huggingface-hub>=0.0.17\n",
+            "  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)\n",
+            "\u001b[K     |████████████████████████████████| 56 kB 4.7 MB/s \n",
+            "\u001b[?25hRequirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers) (2.23.0)\n",
+            "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (1.19.5)\n",
+            "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from transformers) (21.0)\n",
+            "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2019.12.20)\n",
+            "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers) (4.62.3)\n",
+            "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers) (4.8.1)\n",
+            "Collecting sacremoses\n",
+            "  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)\n",
+            "\u001b[K     |████████████████████████████████| 895 kB 41.5 MB/s \n",
+            "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers) (3.3.0)\n",
+            "Collecting tokenizers<0.11,>=0.10.1\n",
+            "  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)\n",
+            "\u001b[K     |████████████████████████████████| 3.3 MB 26.2 MB/s \n",
+            "\u001b[?25hRequirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from huggingface-hub>=0.0.17->transformers) (3.7.4.3)\n",
+            "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->transformers) (2.4.7)\n",
+            "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers) (3.6.0)\n",
+            "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (1.24.3)\n",
+            "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2.10)\n",
+            "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (3.0.4)\n",
+            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2021.5.30)\n",
+            "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.0.1)\n",
+            "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.15.0)\n",
+            "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (7.1.2)\n",
+            "Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers\n",
+            "  Attempting uninstall: pyyaml\n",
+            "    Found existing installation: PyYAML 3.13\n",
+            "    Uninstalling PyYAML-3.13:\n",
+            "      Successfully uninstalled PyYAML-3.13\n",
+            "Successfully installed huggingface-hub-0.0.19 pyyaml-6.0 sacremoses-0.0.46 tokenizers-0.10.3 transformers-4.11.3\n"
+          ]
+        }
+      ],
+      "source": [
+        "%pip install transformers==4.11.3"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 37,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "9pi31_2cndZU",
+        "outputId": "f04cc4a8-7baf-404c-d059-66675a6dda63"
+      },
+      "outputs": [
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']\n",
+            "- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).\n",
+            "- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).\n",
+            "Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'classifier.weight', 'classifier.bias']\n",
+            "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+          ]
+        }
+      ],
+      "source": [
+        "import tensorflow as tf\n",
+        "import json\n",
+        "from transformers import AutoConfig, AutoTokenizer, TFAutoModelForSequenceClassification\n",
+        "\n",
+        "# Single source of truth for the checkpoint, so config, tokenizer and model cannot drift apart.\n",
+        "# Use 'malay-huggingface/bert-base-bahasa-cased' here to fine-tune the larger variant instead.\n",
+        "MODEL_NAME = 'malay-huggingface/bert-tiny-bahasa-cased'\n",
+        "\n",
+        "config = AutoConfig.from_pretrained(MODEL_NAME, id2label={\"0\": \"negative\",\"1\": \"positive\"},\n",
+        "                                    label2id={\"negative\": 0,\"positive\": 1})\n",
+        "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
+        "# from_pt=True: the checkpoint provides PyTorch weights, which are converted to TensorFlow on load.\n",
+        "model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_NAME, from_pt=True, config=config)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 4,
+      "metadata": {
+        "id": "6mkizKwiJFeZ"
+      },
+      "outputs": [],
+      "source": [
+        "import pandas as pd"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 5,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 422
+        },
+        "id": "kgMs04IDJx2z",
+        "outputId": "6ba3687d-4ac9-48f6-a275-1a652a073dcc"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/html": [
+              "<div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>label</th>\n",
+              "      <th>text</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>0</th>\n",
+              "      <td>Negative</td>\n",
+              "      <td>Lebih-lebih lagi dengan  kemudahan internet da...</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>1</th>\n",
+              "      <td>Positive</td>\n",
+              "      <td>boleh memberi teguran kepada parti tetapi perl...</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>2</th>\n",
+              "      <td>Negative</td>\n",
+              "      <td>Adalah membingungkan mengapa masyarakat Cina b...</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>3</th>\n",
+              "      <td>Positive</td>\n",
+              "      <td>Kami menurunkan defisit daripada 6.7 peratus p...</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>4</th>\n",
+              "      <td>Negative</td>\n",
+              "      <td>Ini masalahnya. Bukan rakyat, tetapi sistem</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>...</th>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>3680</th>\n",
+              "      <td>Positive</td>\n",
+              "      <td>Jelas pembangkang buat tuduhan untuk mengeliru...</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>3681</th>\n",
+              "      <td>Positive</td>\n",
+              "      <td>demokrasi adalah kuasa rakyat di mana pegawai ...</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>3682</th>\n",
+              "      <td>Positive</td>\n",
+              "      <td>Selain dapat menyelesaikan isu beg berat, peng...</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>3683</th>\n",
+              "      <td>Positive</td>\n",
+              "      <td>Hospital Langkawi buat masa ini hanya dapat me...</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>3684</th>\n",
+              "      <td>Positive</td>\n",
+              "      <td>Jika sebelum ini kita selesa bergerak dalam ‘g...</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "<p>3685 rows × 2 columns</p>\n",
+              "</div>"
+            ],
+            "text/plain": [
+              "         label                                               text\n",
+              "0     Negative  Lebih-lebih lagi dengan  kemudahan internet da...\n",
+              "1     Positive  boleh memberi teguran kepada parti tetapi perl...\n",
+              "2     Negative  Adalah membingungkan mengapa masyarakat Cina b...\n",
+              "3     Positive  Kami menurunkan defisit daripada 6.7 peratus p...\n",
+              "4     Negative        Ini masalahnya. Bukan rakyat, tetapi sistem\n",
+              "...        ...                                                ...\n",
+              "3680  Positive  Jelas pembangkang buat tuduhan untuk mengeliru...\n",
+              "3681  Positive  demokrasi adalah kuasa rakyat di mana pegawai ...\n",
+              "3682  Positive  Selain dapat menyelesaikan isu beg berat, peng...\n",
+              "3683  Positive  Hospital Langkawi buat masa ini hanya dapat me...\n",
+              "3684  Positive  Jika sebelum ini kita selesa bergerak dalam ‘g...\n",
+              "\n",
+              "[3685 rows x 2 columns]"
+            ]
+          },
+          "execution_count": 5,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "sentiment_df = pd.read_csv(\"https://raw.githubusercontent.com/huseinzol05/malaya/master/finetune/sentiment-data-v2.csv\")\n",
+        "sentiment_df"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 6,
+      "metadata": {
+        "id": "hEfJHRjEo1uk"
+      },
+      "outputs": [],
+      "source": [
+        "# Values that are already 0/1 pass through unchanged, so re-running this cell no longer turns every label into NaN.\n",
+        "label_to_id = {'Positive': 1, 'Negative': 0}\n",
+        "sentiment_df[\"label\"] = sentiment_df[\"label\"].map(lambda value: label_to_id.get(value, value))\n",
+        "assert sentiment_df[\"label\"].isin([0, 1]).all(), \"unexpected label value in sentiment_df\"\n",
+        "\n",
+        "positive_df = pd.read_csv(\"https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/sentiment/translate/polarity/polarity-positive-translated.txt\", names=[\"text\"])\n",
+        "positive_df[\"label\"] = 1\n",
+        "\n",
+        "negative_df = pd.read_csv(\"https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/sentiment/translate/polarity/polarity-negative-translated.txt\", names=[\"text\"])\n",
+        "negative_df[\"label\"] = 0"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 7,
+      "metadata": {
+        "id": "iciAB9tss4tW"
+      },
+      "outputs": [],
+      "source": [
+        "amazon_df = pd.read_json(\"https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/sentiment/translate/multidomain-sentiment/bm-amazon.json\", orient='index').T\n",
+        "yelp_df = pd.read_json(\"https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/sentiment/translate/multidomain-sentiment/bm-yelp.json\", orient='index').T\n",
+        "imdb_df = pd.read_json(\"https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/sentiment/translate/multidomain-sentiment/bm-imdb.json\", orient='index').T\n",
+        "\n",
+        "def process_json_df(df):\n",
+        "  \"\"\"Stack the 'positive' and 'negative' columns of df into one labelled frame.\n",
+        "\n",
+        "  Each column is taken on its own, its NaN entries are dropped, and it is renamed\n",
+        "  to 'text'. Rows from 'positive' get label 1, rows from 'negative' get label 0.\n",
+        "  The two parts are concatenated (positive first) and keep their original index values.\n",
+        "  \"\"\"\n",
+        "  parts = []\n",
+        "  for column, label in ((\"positive\", 1), (\"negative\", 0)):\n",
+        "    part = df[[column]].dropna()\n",
+        "    part.columns = [\"text\"]\n",
+        "    part[\"label\"] = label\n",
+        "    parts.append(part)\n",
+        "\n",
+        "  return pd.concat(parts)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 8,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 422
+        },
+        "id": "GRX3doXvvqjw",
+        "outputId": "6c202e02-04d9-4560-8c16-d44163d92ce6"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/html": [
+              "<div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>label</th>\n",
+              "      <th>text</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>0</th>\n",
+              "      <td>0</td>\n",
+              "      <td>Lebih-lebih lagi dengan  kemudahan internet da...</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>1</th>\n",
+              "      <td>1</td>\n",
+              "      <td>boleh memberi teguran kepada parti tetapi perl...</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>2</th>\n",
+              "      <td>0</td>\n",
+              "      <td>Adalah membingungkan mengapa masyarakat Cina b...</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>3</th>\n",
+              "      <td>1</td>\n",
+              "      <td>Kami menurunkan defisit daripada 6.7 peratus p...</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>4</th>\n",
+              "      <td>0</td>\n",
+              "      <td>Ini masalahnya. Bukan rakyat, tetapi sistem</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>...</th>\n",
+              "      <td>...</td>\n",
+              "      <td>...</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>16720</th>\n",
+              "      <td>0</td>\n",
+              "      <td>dalam satu perkataan, ia memalukan.</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>16721</th>\n",
+              "      <td>0</td>\n",
+              "      <td>Saya tidak pernah keluar dari filem dengan pan...</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>16722</th>\n",
+              "      <td>0</td>\n",
+              "      <td>saya hanya bosan menonton jessica lange mengam...</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>16723</th>\n",
+              "      <td>0</td>\n",
+              "      <td>semua dalam satu penghinaan terhadap kecerdasa...</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>16724</th>\n",
+              "      <td>0</td>\n",
+              "      <td>yang ingin melayari gelombang kecil filem angk...</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "<p>16725 rows × 2 columns</p>\n",
+              "</div>"
+            ],
+            "text/plain": [
+              "       label                                               text\n",
+              "0          0  Lebih-lebih lagi dengan  kemudahan internet da...\n",
+              "1          1  boleh memberi teguran kepada parti tetapi perl...\n",
+              "2          0  Adalah membingungkan mengapa masyarakat Cina b...\n",
+              "3          1  Kami menurunkan defisit daripada 6.7 peratus p...\n",
+              "4          0        Ini masalahnya. Bukan rakyat, tetapi sistem\n",
+              "...      ...                                                ...\n",
+              "16720      0                dalam satu perkataan, ia memalukan.\n",
+              "16721      0  Saya tidak pernah keluar dari filem dengan pan...\n",
+              "16722      0  saya hanya bosan menonton jessica lange mengam...\n",
+              "16723      0  semua dalam satu penghinaan terhadap kecerdasa...\n",
+              "16724      0  yang ingin melayari gelombang kecil filem angk...\n",
+              "\n",
+              "[16725 rows x 2 columns]"
+            ]
+          },
+          "execution_count": 8,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "# df = pd.concat([sentiment_df, positive_df, negative_df, process_json_df(amazon_df), process_json_df(yelp_df), process_json_df(imdb_df)], ignore_index=True)\n",
+        "# df = pd.concat([sentiment_df, process_json_df(amazon_df), process_json_df(yelp_df), process_json_df(imdb_df)], ignore_index=True)\n",
+        "df = pd.concat([sentiment_df, positive_df, negative_df, process_json_df(amazon_df), process_json_df(yelp_df), process_json_df(imdb_df)], ignore_index=True)\n",
+        "\n",
+        "df"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 9,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "FeWmvyotp9RP",
+        "outputId": "c3b34cb1-28d6-4c60-a4f0-778bd398ba02"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "13380\n",
+            "3345\n"
+          ]
+        }
+      ],
+      "source": [
+        "from sklearn.model_selection import train_test_split\n",
+        "\n",
+        "# sentences = sarcasm_df[\"headline\"].tolist()\n",
+        "# labels = sarcasm_df[\"is_sarcastic\"].tolist()\n",
+        "\n",
+        "\n",
+        "sentences = df[\"text\"].tolist()\n",
+        "labels = df[\"label\"].tolist()\n",
+        "\n",
+        "training_sentences, validation_sentences, training_labels, validation_labels = train_test_split(sentences, labels, train_size=0.8, random_state=1)\n",
+        "\n",
+        "print(len(training_sentences))\n",
+        "print(len(validation_sentences))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 10,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "KCxtcxObndZk",
+        "outputId": "0c3de610-02d1-4a8f-f7bf-993e1f644d63",
+        "pycharm": {
+          "name": "#%%\n"
+        }
+      },
+      "outputs": [
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n"
+          ]
+        }
+      ],
+      "source": [
+        "# truncation=True is a no-op without a limit (the tokenizer defines none), so cap sequences at the\n",
+        "# number of positions the model config supports.\n",
+        "MAX_LENGTH = config.max_position_embeddings\n",
+        "train_encodings = tokenizer(training_sentences, truncation=True, padding=True, max_length=MAX_LENGTH)\n",
+        "val_encodings = tokenizer(validation_sentences, truncation=True, padding=True, max_length=MAX_LENGTH)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 11,
+      "metadata": {
+        "id": "Tg7zcOpVndZm",
+        "pycharm": {
+          "name": "#%%\n"
+        }
+      },
+      "outputs": [],
+      "source": [
+        "train_dataset = tf.data.Dataset.from_tensor_slices((\n",
+        "    dict(train_encodings),\n",
+        "    training_labels\n",
+        "))\n",
+        "\n",
+        "val_dataset = tf.data.Dataset.from_tensor_slices((\n",
+        "    dict(val_encodings),\n",
+        "    validation_labels\n",
+        "))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 12,
+      "metadata": {
+        "id": "vfwrq3eMXDi1"
+      },
+      "outputs": [],
+      "source": [
+        "from keras.callbacks import EarlyStopping, ModelCheckpoint\n",
+        "\n",
+        "# restore_best_weights=True rolls the model back to the epoch with the lowest val_loss when training stops;\n",
+        "# without it the weights kept (and saved afterwards) are those of the last epoch, `patience` epochs past the best.\n",
+        "es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3, restore_best_weights=True)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 13,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "8_gjepLSndZq",
+        "outputId": "3091b5d2-40c6-4cfd-82fd-fcbc094cbc3b",
+        "pycharm": {
+          "name": "#%%\n"
+        }
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Epoch 1/10\n",
+            "837/837 [==============================] - 91s 95ms/step - loss: 0.5531 - accuracy: 0.7115 - val_loss: 0.5028 - val_accuracy: 0.7474\n",
+            "Epoch 2/10\n",
+            "837/837 [==============================] - 78s 93ms/step - loss: 0.4301 - accuracy: 0.8006 - val_loss: 0.4745 - val_accuracy: 0.7731\n",
+            "Epoch 3/10\n",
+            "837/837 [==============================] - 78s 93ms/step - loss: 0.3201 - accuracy: 0.8635 - val_loss: 0.5232 - val_accuracy: 0.7773\n",
+            "Epoch 4/10\n",
+            "837/837 [==============================] - 78s 93ms/step - loss: 0.2226 - accuracy: 0.9113 - val_loss: 0.5835 - val_accuracy: 0.7611\n",
+            "Epoch 5/10\n",
+            "837/837 [==============================] - 78s 93ms/step - loss: 0.1604 - accuracy: 0.9389 - val_loss: 0.6551 - val_accuracy: 0.7638\n",
+            "Epoch 00005: early stopping\n"
+          ]
+        },
+        {
+          "data": {
+            "text/plain": [
+              "<keras.callbacks.History at 0x7efdb1594e10>"
+            ]
+          },
+          "execution_count": 13,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "BATCH_SIZE = 16\n",
+        "\n",
+        "optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)\n",
+        "model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])\n",
+        "# The datasets are batched right here, so fit() must not also be given batch_size.\n",
+        "# Training data is reshuffled over its full length every epoch; validation data is left\n",
+        "# unshuffled because its order does not affect the reported metrics.\n",
+        "model.fit(train_dataset.shuffle(len(training_labels)).batch(BATCH_SIZE),\n",
+        "          epochs=10,\n",
+        "          callbacks=[es],\n",
+        "          validation_data=val_dataset.batch(BATCH_SIZE))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 14,
+      "metadata": {
+        "id": "dmfeNn8hndZs",
+        "pycharm": {
+          "name": "#%%\n"
+        }
+      },
+      "outputs": [],
+      "source": [
+        "model.save_pretrained(\"model\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 38,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "D_nYwVTY8W1M",
+        "outputId": "913383cd-983d-41f4-efa7-d727275fab09"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "('tokenize/tokenizer_config.json',\n",
+              " 'tokenize/special_tokens_map.json',\n",
+              " 'tokenize/vocab.txt',\n",
+              " 'tokenize/added_tokens.json',\n",
+              " 'tokenize/tokenizer.json')"
+            ]
+          },
+          "execution_count": 38,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "tokenizer.save_pretrained(\"tokenize\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 16,
+      "metadata": {
+        "id": "_jwvD6AUndZu",
+        "pycharm": {
+          "name": "#%%\n"
+        }
+      },
+      "outputs": [],
+      "source": [
+        "#### Load saved model and run predict function"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 17,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "s71ZiN0bndZw",
+        "outputId": "42b7412d-7fe3-439c-8c89-1f5b4e688ee0",
+        "pycharm": {
+          "name": "#%%\n"
+        }
+      },
+      "outputs": [
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "Some layers from the model checkpoint at model were not used when initializing TFBertForSequenceClassification: ['dropout_13']\n",
+            "- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+            "- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+            "All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at model.\n",
+            "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.\n"
+          ]
+        }
+      ],
+      "source": [
+        "loaded_model = TFAutoModelForSequenceClassification.from_pretrained(\"model\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 18,
+      "metadata": {
+        "id": "3QCgtNI8nlmX"
+      },
+      "outputs": [],
+      "source": [
+        "from transformers import pipeline\n",
+        "\n",
+        "pipe = pipeline('text-classification', model=loaded_model, tokenizer=tokenizer)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 30,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "4QWLGTRpPDeZ",
+        "outputId": "29837e60-6d35-43cd-d6e5-14ecfc3c2c33"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "[{'label': 'positive', 'score': 0.9960972666740417},\n",
+              " {'label': 'positive', 'score': 0.9960286617279053},\n",
+              " {'label': 'positive', 'score': 0.9795612692832947}]"
+            ]
+          },
+          "execution_count": 30,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "pipe([\"Saya gembira kerana saya boleh meluangkan masa bersama keluarga.\", \"Cikgu Azam adalah yang terbaik!\", \"Terima kasih, pertolongan anda adalah amat dihargai\"])"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 29,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "Y9RvdOZcnU3p",
+        "outputId": "088ed08d-4402-4889-f047-b3a20ae1f473"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "[{'label': 'positive', 'score': 0.9666869640350342},\n",
+              " {'label': 'positive', 'score': 0.9939473867416382},\n",
+              " {'label': 'negative', 'score': 0.949023425579071},\n",
+              " {'label': 'positive', 'score': 0.7437461018562317}]"
+            ]
+          },
+          "execution_count": 29,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "pipe([\"I'm happy to spend time with my family\", \"Mr Azam is the best!\", \"Thank you, your help is much appreciated\", \"Thank you, I appreciate your help\"])"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 32,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "cRp2vmxeRSam",
+        "outputId": "c983365b-57b8-4b16-ec3b-30722b120235"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "[{'label': 'negative', 'score': 0.9914922118186951},\n",
+              " {'label': 'negative', 'score': 0.9830396771430969},\n",
+              " {'label': 'negative', 'score': 0.9941385984420776}]"
+            ]
+          },
+          "execution_count": 32,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "pipe([\"Sikap tidak peduli dia menyebabkan ibu bapa dia geram\", \"Saya sangat benci warna merah\", \"Cis! Dompet aku hilang!\"])"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 34,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "czWBDOvlo20m",
+        "outputId": "25705b2d-32e8-42d9-866c-84cf499fd22e"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "[{'label': 'negative', 'score': 0.9114706516265869},\n",
+              " {'label': 'positive', 'score': 0.9896261692047119},\n",
+              " {'label': 'negative', 'score': 0.9341222047805786}]"
+            ]
+          },
+          "execution_count": 34,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "pipe([\"His don't care attitude causes much strife to his parents\", \"I hate red color\", \"Gah! My Wallet is missing!\"])"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 21,
+      "metadata": {
+        "id": "akGTf-l_ndZy",
+        "pycharm": {
+          "name": "#%%\n"
+        }
+      },
+      "outputs": [],
+      "source": [
+        "def predict_sentiment(sentence):\n",
+        "  \"\"\"Classify a single sentence and return its class index (0 or 1).\n",
+        "\n",
+        "  The softmax scores are printed before returning. Index 0 is returned only\n",
+        "  when its score is strictly greater than index 1's; otherwise 1 is returned.\n",
+        "  The published config.json maps 0 to negative and 1 to positive.\n",
+        "  \"\"\"\n",
+        "  encoded = tokenizer.encode(\n",
+        "      sentence, truncation=True, padding=True, return_tensors=\"tf\")\n",
+        "\n",
+        "  # predict(...)[0] is treated as the logits; softmax is taken over axis 1.\n",
+        "  logits = loaded_model.predict(encoded)[0]\n",
+        "  scores = tf.nn.softmax(logits, axis=1).numpy()[0]\n",
+        "  print(scores)\n",
+        "\n",
+        "  if scores[0] > scores[1]:\n",
+        "    return 0\n",
+        "  return 1"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 22,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "SG7PCrB3nlH0",
+        "outputId": "dc07eecc-13b0-4c02-94e6-c6c8e8036fa1"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "[0.0143008  0.98569924]\n"
+          ]
+        },
+        {
+          "data": {
+            "text/plain": [
+              "1"
+            ]
+          },
+          "execution_count": 22,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "predict_sentiment(\"gembira\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 23,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "lWiz1MO1nlbO",
+        "outputId": "1ebca034-79cc-4774-e79b-88925c58b34d"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "[0.57475716 0.4252428 ]\n"
+          ]
+        },
+        {
+          "data": {
+            "text/plain": [
+              "0"
+            ]
+          },
+          "execution_count": 23,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "predict_sentiment(\"marah\")"
+      ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "collapsed_sections": [],
+      "name": "Hugging Face Bert Malay Sentiment.ipynb",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 2
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython2",
+      "version": "2.7.6"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}

README.md ADDED Viewed

	@@ -0,0 +1,19 @@

+---
+language:
+  - ms
+  - en
+license: apache-2.0
+tags:
+  - sentiment-analysis
+widget:
+  - text: "Gembiranya saya hari ini!"
+---
+# bert-tiny-bahasa-cased-sentiment
+Proof of concept for creating a sentiment analysis model, using
+https://huggingface.co/malay-huggingface/bert-tiny-bahasa-cased as the base model.
+Tokenizer is copied directly from https://huggingface.co/malay-huggingface/bert-tiny-bahasa-cased.
+Sentiment analysis fine-tuning was done with data compiled by [huseinzol05](https://github.com/huseinzol05/) at https://github.com/huseinzol05/malay-dataset/tree/master/sentiment.

archive/model-20211015/config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "_name_or_path": "malay-huggingface/bert-tiny-bahasa-cased",
+  "architectures": [
+    "BertForSequenceClassification"
+  ],
+  "id2label": {
+    "0": "negative",
+    "1": "positive"
+  },
+  "label2id": {
+    "negative": 0,
+    "positive": 1
+  },
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 336,
+  "initializer_range": 0.02,
+  "intermediate_size": 1344,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 4,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.11.3",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 32000
+}

archive/model-20211015/tf_model.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7a8ba89838943372c63b4d6741b97589b76c7044a29110377e6d379b246cf01e
+size 66016632

config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "_name_or_path": "malay-huggingface/bert-tiny-bahasa-cased",
+  "architectures": [
+    "BertForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 336,
+  "id2label": {
+    "0": "negative",
+    "1": "positive"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 1344,
+  "label2id": {
+    "negative": 0,
+    "positive": 1
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 4,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.11.3",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 32000
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}

tf_model.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:807c95d88ec7570eb5d9091a9f1eeea30795ef6f5fbea58c35cc1470a5192154
+size 66016632

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"do_lower_case": false, "do_basic_tokenize": true, "never_split": null, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "special_tokens_map_file": "/home/patrick/.cache/huggingface/transformers/8976f17381927c83231ffc41ac983516a57f6d0d6e7addbd5f38fa654e4269e0.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d", "tokenizer_file": null, "name_or_path": "malay-huggingface/bert-tiny-bahasa-cased", "tokenizer_class": "BertTokenizer"}

vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff