{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "633fetsKg5cv", "outputId": "379a3769-9478-4749-cc71-bbf46e6478f9" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting transformers\n", " Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)\n", "\u001b[K |████████████████████████████████| 2.9 MB 5.2 MB/s \n", "\u001b[?25hCollecting pyyaml>=5.1\n", " Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)\n", "\u001b[K |████████████████████████████████| 596 kB 37.4 MB/s \n", "\u001b[?25hCollecting huggingface-hub>=0.0.17\n", " Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)\n", "\u001b[K |████████████████████████████████| 56 kB 4.7 MB/s \n", "\u001b[?25hRequirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers) (2.23.0)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (1.19.5)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from transformers) (21.0)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2019.12.20)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers) (4.62.3)\n", "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers) (4.8.1)\n", "Collecting sacremoses\n", " Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)\n", "\u001b[K |████████████████████████████████| 895 kB 41.5 MB/s \n", "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers) (3.3.0)\n", "Collecting tokenizers<0.11,>=0.10.1\n", " Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)\n", "\u001b[K |████████████████████████████████| 3.3 MB 26.2 MB/s \n", "\u001b[?25hRequirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from huggingface-hub>=0.0.17->transformers) (3.7.4.3)\n", "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->transformers) (2.4.7)\n", "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->transformers) (3.6.0)\n", "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (1.24.3)\n", "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2.10)\n", "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (3.0.4)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2021.5.30)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.0.1)\n", "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.15.0)\n", "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (7.1.2)\n", "Installing collected packages: pyyaml, tokenizers, sacremoses, 
huggingface-hub, transformers\n", " Attempting uninstall: pyyaml\n", " Found existing installation: PyYAML 3.13\n", " Uninstalling PyYAML-3.13:\n", " Successfully uninstalled PyYAML-3.13\n", "Successfully installed huggingface-hub-0.0.19 pyyaml-6.0 sacremoses-0.0.46 tokenizers-0.10.3 transformers-4.11.3\n" ] } ], "source": [ "!pip install transformers" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "9pi31_2cndZU", "outputId": "f04cc4a8-7baf-404c-d059-66675a6dda63" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']\n", "- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).\n", "Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'classifier.weight', 'classifier.bias']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ], "source": [ "import tensorflow as tf\n", "import json\n", "from transformers import AutoConfig, AutoTokenizer, TFAutoModelForSequenceClassification\n", "\n", "config = AutoConfig.from_pretrained('malay-huggingface/bert-tiny-bahasa-cased', id2label={\"0\": \"negative\",\"1\": \"positive\"}, \n", " label2id={\"negative\": 0,\"positive\": 1})\n", "tokenizer = AutoTokenizer.from_pretrained('malay-huggingface/bert-tiny-bahasa-cased')\n", "model = TFAutoModelForSequenceClassification.from_pretrained(\"malay-huggingface/bert-tiny-bahasa-cased\", from_pt=True, config=config)\n", "\n", "# config = AutoConfig.from_pretrained('malay-huggingface/bert-base-bahasa-cased', id2label={\"0\": \"negative\",\"1\": \"positive\"}, \n", "# label2id={\"negative\": 0,\"positive\": 1})\n", "\n", "# tokenizer = AutoTokenizer.from_pretrained(\"malay-huggingface/bert-base-bahasa-cased\")\n", "# model = TFAutoModelForSequenceClassification.from_pretrained(\"malay-huggingface/bert-base-bahasa-cased\", from_pt=True, config=config)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "id": "6mkizKwiJFeZ" }, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 422 }, "id": "kgMs04IDJx2z", "outputId": "6ba3687d-4ac9-48f6-a275-1a652a073dcc" }, "outputs": [ { "data": { "text/html": [ "
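The converted checkpoint only supplies the encoder weights; as the warning above notes, the pooler and classification head are freshly initialized. A quick sanity check (a sketch added here, not part of the original notebook; the example sentence is made up) to confirm the tokenizer and the converted TF model run end to end:

```python
# Sketch only: run one Malay sentence through the freshly converted model.
# The classifier head is randomly initialized, so the predicted label is
# meaningless until the model has been fine-tuned.
inputs = tokenizer("Filem ini sangat bagus", return_tensors="tf")
logits = model(inputs).logits
pred_id = int(tf.argmax(logits, axis=-1)[0])
print(pred_id, model.config.id2label[pred_id])
```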
\n", " | label | \n", "text | \n", "
---|---|---|
0 | \n", "Negative | \n", "Lebih-lebih lagi dengan kemudahan internet da... | \n", "
1 | \n", "Positive | \n", "boleh memberi teguran kepada parti tetapi perl... | \n", "
2 | \n", "Negative | \n", "Adalah membingungkan mengapa masyarakat Cina b... | \n", "
3 | \n", "Positive | \n", "Kami menurunkan defisit daripada 6.7 peratus p... | \n", "
4 | \n", "Negative | \n", "Ini masalahnya. Bukan rakyat, tetapi sistem | \n", "
... | \n", "... | \n", "... | \n", "
3680 | \n", "Positive | \n", "Jelas pembangkang buat tuduhan untuk mengeliru... | \n", "
3681 | \n", "Positive | \n", "demokrasi adalah kuasa rakyat di mana pegawai ... | \n", "
3682 | \n", "Positive | \n", "Selain dapat menyelesaikan isu beg berat, peng... | \n", "
3683 | \n", "Positive | \n", "Hospital Langkawi buat masa ini hanya dapat me... | \n", "
3684 | \n", "Positive | \n", "Jika sebelum ini kita selesa bergerak dalam ‘g... | \n", "
3685 rows × 2 columns
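Note that the two previews use different label encodings: the frame above stores string labels (Negative/Positive), while the larger frame below stores integer ids. If the string labels need to be aligned with the id2label/label2id mapping defined in the config earlier, a step along these lines would do it (a sketch only; the DataFrame name `df` and this exact step are assumptions, not shown in the notebook):

```python
import pandas as pd

# Hypothetical stand-in rows copied from the preview above; the mapping step
# itself is assumed, not taken from the original notebook.
df = pd.DataFrame({
    "label": ["Negative", "Positive"],
    "text": ["Ini masalahnya. Bukan rakyat, tetapi sistem",
             "boleh memberi teguran kepada parti tetapi perl..."],
})
label2id = {"negative": 0, "positive": 1}  # matches the config's label2id (lowercase keys)
df["label"] = df["label"].str.lower().map(label2id)
print(df)
```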
\n", "\n", " | label | \n", "text | \n", "
---|---|---|
0 | \n", "0 | \n", "Lebih-lebih lagi dengan kemudahan internet da... | \n", "
1 | \n", "1 | \n", "boleh memberi teguran kepada parti tetapi perl... | \n", "
2 | \n", "0 | \n", "Adalah membingungkan mengapa masyarakat Cina b... | \n", "
3 | \n", "1 | \n", "Kami menurunkan defisit daripada 6.7 peratus p... | \n", "
4 | \n", "0 | \n", "Ini masalahnya. Bukan rakyat, tetapi sistem | \n", "
... | \n", "... | \n", "... | \n", "
16720 | \n", "0 | \n", "dalam satu perkataan, ia memalukan. | \n", "
16721 | \n", "0 | \n", "Saya tidak pernah keluar dari filem dengan pan... | \n", "
16722 | \n", "0 | \n", "saya hanya bosan menonton jessica lange mengam... | \n", "
16723 | \n", "0 | \n", "semua dalam satu penghinaan terhadap kecerdasa... | \n", "
16724 | \n", "0 | \n", "yang ingin melayari gelombang kecil filem angk... | \n", "
16725 rows × 2 columns
\n", "