Spaces:
Sleeping
Sleeping
File size: 97,454 Bytes
1e2924d |
1 |
{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"colab":{"provenance":[]}},"nbformat_minor":0,"nbformat":4,"cells":[{"cell_type":"code","source":["# This Python 3 environment comes with many helpful analytics libraries installed\n","# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n","# For example, here's several helpful packages to load\n","\n","import numpy as np # linear algebra\n","import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n","\n","# Input data files are available in the read-only \"../input/\" directory\n","# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n","\n","import os\n","for dirname, _, filenames in os.walk('/kaggle/input'):\n"," for filename in filenames:\n"," print(os.path.join(dirname, filename))\n","\n","# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\"\n","# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session"],"metadata":{"_uuid":"d8e725fd-259f-4b0c-8256-134a9f427df7","_cell_guid":"cf754f24-e571-4113-a5ed-656f1f24bce6","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:39:36.406539Z","iopub.execute_input":"2023-10-27T19:39:36.407090Z","iopub.status.idle":"2023-10-27T19:39:36.775683Z","shell.execute_reply.started":"2023-10-27T19:39:36.407063Z","shell.execute_reply":"2023-10-27T19:39:36.774605Z"},"trusted":true,"id":"aLMiXulAq8IQ","executionInfo":{"status":"ok","timestamp":1698517883348,"user_tz":-180,"elapsed":487,"user":{"displayName":"Jasem 
Almansour","userId":"17867395552910147411"}}},"execution_count":1,"outputs":[]},{"cell_type":"markdown","source":["### install and import important libraries"],"metadata":{"_uuid":"6793367f-b4a4-401a-acdd-f6b5352e182a","_cell_guid":"305a9ca9-d45a-4ef2-8fdb-5cd4c2995a6f","trusted":true,"id":"97oxHEgXq8IY"}},{"cell_type":"code","source":["pip install streamlit"],"metadata":{"_kg_hide-input":true,"execution":{"iopub.status.busy":"2023-10-27T19:40:46.121550Z","iopub.execute_input":"2023-10-27T19:40:46.122052Z","iopub.status.idle":"2023-10-27T19:41:00.894314Z","shell.execute_reply.started":"2023-10-27T19:40:46.122020Z","shell.execute_reply":"2023-10-27T19:41:00.893204Z"},"trusted":true,"id":"tXq3HLXHq8Ib","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1698517896926,"user_tz":-180,"elapsed":10830,"user":{"displayName":"Jasem Almansour","userId":"17867395552910147411"}},"outputId":"48738f3a-72bd-4975-aa11-9df1128d5324"},"execution_count":2,"outputs":[{"output_type":"stream","name":"stdout","text":["Collecting streamlit\n"," Downloading streamlit-1.28.0-py2.py3-none-any.whl (8.4 MB)\n","\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m8.4/8.4 MB\u001b[0m \u001b[31m23.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: altair<6,>=4.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (4.2.2)\n","Requirement already satisfied: blinker<2,>=1.0.0 in /usr/lib/python3/dist-packages (from streamlit) (1.4)\n","Requirement already satisfied: cachetools<6,>=4.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (5.3.2)\n","Requirement already satisfied: click<9,>=7.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (8.1.7)\n","Requirement already satisfied: importlib-metadata<7,>=1.4 in /usr/local/lib/python3.10/dist-packages (from streamlit) (6.8.0)\n","Requirement already satisfied: numpy<2,>=1.19.3 in 
/usr/local/lib/python3.10/dist-packages (from streamlit) (1.23.5)\n","Requirement already satisfied: packaging<24,>=16.8 in /usr/local/lib/python3.10/dist-packages (from streamlit) (23.2)\n","Requirement already satisfied: pandas<3,>=1.3.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (1.5.3)\n","Requirement already satisfied: pillow<11,>=7.1.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (9.4.0)\n","Requirement already satisfied: protobuf<5,>=3.20 in /usr/local/lib/python3.10/dist-packages (from streamlit) (3.20.3)\n","Requirement already satisfied: pyarrow>=6.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (9.0.0)\n","Requirement already satisfied: python-dateutil<3,>=2.7.3 in /usr/local/lib/python3.10/dist-packages (from streamlit) (2.8.2)\n","Requirement already satisfied: requests<3,>=2.27 in /usr/local/lib/python3.10/dist-packages (from streamlit) (2.31.0)\n","Requirement already satisfied: rich<14,>=10.14.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (13.6.0)\n","Requirement already satisfied: tenacity<9,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (8.2.3)\n","Requirement already satisfied: toml<2,>=0.10.1 in /usr/local/lib/python3.10/dist-packages (from streamlit) (0.10.2)\n","Requirement already satisfied: typing-extensions<5,>=4.3.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (4.5.0)\n","Requirement already satisfied: tzlocal<6,>=1.1 in /usr/local/lib/python3.10/dist-packages (from streamlit) (5.2)\n","Collecting validators<1,>=0.2 (from streamlit)\n"," Downloading validators-0.22.0-py3-none-any.whl (26 kB)\n","Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)\n"," Downloading GitPython-3.1.40-py3-none-any.whl (190 kB)\n","\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m190.6/190.6 kB\u001b[0m \u001b[31m17.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hCollecting pydeck<1,>=0.8.0b4 (from streamlit)\n"," 
Downloading pydeck-0.8.1b0-py2.py3-none-any.whl (4.8 MB)\n","\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m4.8/4.8 MB\u001b[0m \u001b[31m51.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: tornado<7,>=6.0.3 in /usr/local/lib/python3.10/dist-packages (from streamlit) (6.3.2)\n","Collecting watchdog>=2.1.5 (from streamlit)\n"," Downloading watchdog-3.0.0-py3-none-manylinux2014_x86_64.whl (82 kB)\n","\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m82.1/82.1 kB\u001b[0m \u001b[31m8.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: entrypoints in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit) (0.4)\n","Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit) (3.1.2)\n","Requirement already satisfied: jsonschema>=3.0 in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit) (4.19.1)\n","Requirement already satisfied: toolz in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit) (0.12.0)\n","Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.19,<4,>=3.0.7->streamlit)\n"," Downloading gitdb-4.0.11-py3-none-any.whl (62 kB)\n","\u001b[2K \u001b[90mββββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m62.7/62.7 kB\u001b[0m \u001b[31m6.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata<7,>=1.4->streamlit) (3.17.0)\n","Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas<3,>=1.3.0->streamlit) (2023.3.post1)\n","Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil<3,>=2.7.3->streamlit) (1.16.0)\n","Requirement already satisfied: charset-normalizer<4,>=2 in 
/usr/local/lib/python3.10/dist-packages (from requests<3,>=2.27->streamlit) (3.3.1)\n","Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.27->streamlit) (3.4)\n","Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.27->streamlit) (2.0.7)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.27->streamlit) (2023.7.22)\n","Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich<14,>=10.14.0->streamlit) (3.0.0)\n","Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich<14,>=10.14.0->streamlit) (2.16.1)\n","Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython!=3.1.19,<4,>=3.0.7->streamlit)\n"," Downloading smmap-5.0.1-py3-none-any.whl (24 kB)\n","Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->altair<6,>=4.0->streamlit) (2.1.3)\n","Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit) (23.1.0)\n","Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit) (2023.7.1)\n","Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit) (0.30.2)\n","Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit) (0.10.6)\n","Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich<14,>=10.14.0->streamlit) (0.1.2)\n","Installing collected packages: watchdog, validators, smmap, pydeck, gitdb, gitpython, streamlit\n","Successfully 
installed gitdb-4.0.11 gitpython-3.1.40 pydeck-0.8.1b0 smmap-5.0.1 streamlit-1.28.0 validators-0.22.0 watchdog-3.0.0\n"]}]},{"cell_type":"code","source":["import matplotlib.pyplot as plt\n","import random\n","import keras\n","import tensorflow as tf\n","\n","from transformers import AutoTokenizer\n","from transformers import TFDistilBertModel, AutoConfig\n","\n","import streamlit as st"],"metadata":{"_uuid":"764b0395-0f2e-4a26-b71b-b3b238e65803","_cell_guid":"93240b3c-c91d-45fc-afdc-6e25ccde9a76","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:14.020793Z","iopub.execute_input":"2023-10-27T19:41:14.021294Z","iopub.status.idle":"2023-10-27T19:41:14.988123Z","shell.execute_reply.started":"2023-10-27T19:41:14.021266Z","shell.execute_reply":"2023-10-27T19:41:14.987315Z"},"trusted":true,"id":"2gWitzGQq8Ic","colab":{"base_uri":"https://localhost:8080/","height":383},"executionInfo":{"status":"error","timestamp":1698517925997,"user_tz":-180,"elapsed":652,"user":{"displayName":"Jasem Almansour","userId":"17867395552910147411"}},"outputId":"0ae4e935-7486-475e-eaba-3f3dcbb01b4e"},"execution_count":4,"outputs":[{"output_type":"error","ename":"ModuleNotFoundError","evalue":"ignored","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)","\u001b[0;32m<ipython-input-4-3d89e12db8e7>\u001b[0m in \u001b[0;36m<cell line: 6>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtensorflow\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mtransformers\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mAutoTokenizer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m 
\u001b[0;32mfrom\u001b[0m \u001b[0mtransformers\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mTFDistilBertModel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mAutoConfig\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'transformers'","","\u001b[0;31m---------------------------------------------------------------------------\u001b[0;32m\nNOTE: If your import is failing due to a missing package, you can\nmanually install dependencies using either !pip or !apt.\n\nTo view examples of installing some common dependencies, click the\n\"Open Examples\" button below.\n\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n"],"errorDetails":{"actions":[{"action":"open_url","actionText":"Open Examples","url":"/notebooks/snippets/importing_libraries.ipynb"}]}}]},{"cell_type":"markdown","source":["### Preprocessing"],"metadata":{"_uuid":"f61c70f1-311b-4dcd-938c-ff4ab102865c","_cell_guid":"b7308c78-1bfa-4dff-988f-7fcb45c5ee31","trusted":true,"id":"6_-QMa4nq8Id"}},{"cell_type":"code","source":["# Load the dataset files\n","df_train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')\n","df_test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')"],"metadata":{"_uuid":"64c41258-61fe-4193-a5c8-00a1fcdcd7cb","_cell_guid":"c0e5785a-3393-4e7a-a573-0da8c001bfa0","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:14.989959Z","iopub.execute_input":"2023-10-27T19:41:14.990260Z","iopub.status.idle":"2023-10-27T19:41:15.103684Z","shell.execute_reply.started":"2023-10-27T19:41:14.990233Z","shell.execute_reply":"2023-10-27T19:41:15.102798Z"},"trusted":true,"id":"DnPsxqPpq8Id","colab":{"base_uri":"https://localhost:8080/","height":355},"executionInfo":{"status":"error","timestamp":1698517943331,"user_tz":-180,"elapsed":425,"user":{"displayName":"Jasem 
Almansour","userId":"17867395552910147411"}},"outputId":"ce351468-1e00-439f-db42-94160b65ebcd"},"execution_count":5,"outputs":[{"output_type":"error","ename":"FileNotFoundError","evalue":"ignored","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)","\u001b[0;32m<ipython-input-5-3df738d3e3a7>\u001b[0m in \u001b[0;36m<cell line: 2>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Load the dataset files\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdf_train\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'/kaggle/input/nlp-getting-started/train.csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mdf_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'/kaggle/input/nlp-getting-started/test.csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/util/_decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnew_arg_name\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnew_arg_value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 211\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 212\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mF\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/util/_decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 329\u001b[0m \u001b[0mstacklevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfind_stack_level\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 330\u001b[0m )\n\u001b[0;32m--> 331\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 332\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 333\u001b[0m \u001b[0;31m# error: \"Callable[[VarArg(Any), KwArg(Any)], Any]\" has no\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, 
dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)\u001b[0m\n\u001b[1;32m 948\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkwds_defaults\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 949\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 950\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 951\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 952\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 603\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 604\u001b[0m \u001b[0;31m# Create the parser.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 605\u001b[0;31m \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTextFileReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 606\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 607\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mchunksize\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0miterator\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 1440\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1441\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhandles\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mIOHandles\u001b[0m \u001b[0;34m|\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1442\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mengine\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1443\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1444\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36m_make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m 1733\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m\"b\"\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1734\u001b[0m \u001b[0mmode\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;34m\"b\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1735\u001b[0;31m self.handles = get_handle(\n\u001b[0m\u001b[1;32m 1736\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1737\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/io/common.py\u001b[0m in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, 
mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 854\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mioargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencoding\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;34m\"b\"\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mioargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 855\u001b[0m \u001b[0;31m# Encoding\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 856\u001b[0;31m handle = open(\n\u001b[0m\u001b[1;32m 857\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 858\u001b[0m \u001b[0mioargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/kaggle/input/nlp-getting-started/train.csv'"]}]},{"cell_type":"markdown","source":["let's look at the training data"],"metadata":{"_uuid":"5d1d2327-e567-4704-9961-dba574735645","_cell_guid":"00d3bfa5-4e1b-4db8-be5a-55d9516ddb7f","trusted":true,"id":"zTIeDUw8q8If"}},{"cell_type":"code","source":["print(df_train.columns)\n","print(df_train.shape)\n","df_train.head()"],"metadata":{"_uuid":"e6b3baf4-d084-4f76-9ec0-7f54234541cd","_cell_guid":"65b54a29-f70c-4fee-adca-9e2a9451c2e1","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:19.084998Z","iopub.execute_input":"2023-10-27T19:41:19.085364Z","iopub.status.idle":"2023-10-27T19:41:19.108415Z","shell.execute_reply.started":"2023-10-27T19:41:19.085334Z","shell.execute_reply":"2023-10-27T19:41:19.107466Z"},"trusted":true,"id":"FXivPW8Nq8If","outputId":"55982b8b-b60e-4fa6-d608-ff06718367b5"},"execution_count":null,"outputs":[{"name":"stdout","text":"Index(['id', 'keyword', 'location', 'text', 'target'], 
dtype='object')\n(7613, 5)\n","output_type":"stream"},{"execution_count":6,"output_type":"execute_result","data":{"text/plain":" id keyword location text \\\n0 1 NaN NaN Our Deeds are the Reason of this #earthquake M... \n1 4 NaN NaN Forest fire near La Ronge Sask. Canada \n2 5 NaN NaN All residents asked to 'shelter in place' are ... \n3 6 NaN NaN 13,000 people receive #wildfires evacuation or... \n4 7 NaN NaN Just got sent this photo from Ruby #Alaska as ... \n\n target \n0 1 \n1 1 \n2 1 \n3 1 \n4 1 ","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>keyword</th>\n <th>location</th>\n <th>text</th>\n <th>target</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Our Deeds are the Reason of this #earthquake M...</td>\n <td>1</td>\n </tr>\n <tr>\n <th>1</th>\n <td>4</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Forest fire near La Ronge Sask. 
Canada</td>\n <td>1</td>\n </tr>\n <tr>\n <th>2</th>\n <td>5</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>All residents asked to 'shelter in place' are ...</td>\n <td>1</td>\n </tr>\n <tr>\n <th>3</th>\n <td>6</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>13,000 people receive #wildfires evacuation or...</td>\n <td>1</td>\n </tr>\n <tr>\n <th>4</th>\n <td>7</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Just got sent this photo from Ruby #Alaska as ...</td>\n <td>1</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"markdown","source":["Now let's look at the testing data"],"metadata":{"_uuid":"515ab512-06b7-4cce-9a95-f2916f1fd4ad","_cell_guid":"c96e4868-5978-4ca3-8272-af81f4f1d56f","trusted":true,"id":"jLhs-rRiq8Ig"}},{"cell_type":"code","source":["print(df_test.columns)\n","print(df_test.shape)\n","df_test.head()"],"metadata":{"_uuid":"1ea066a1-3a43-467a-a70f-91c131d5868c","_cell_guid":"c08cc742-e3c4-4a66-be8f-422965d1c8a8","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:19.914598Z","iopub.execute_input":"2023-10-27T19:41:19.914973Z","iopub.status.idle":"2023-10-27T19:41:19.926242Z","shell.execute_reply.started":"2023-10-27T19:41:19.914942Z","shell.execute_reply":"2023-10-27T19:41:19.925362Z"},"trusted":true,"id":"LSuuIC2jq8Il","outputId":"16e482e3-aa53-421a-ef8d-22ba2bfbc96a"},"execution_count":null,"outputs":[{"name":"stdout","text":"Index(['id', 'keyword', 'location', 'text'], dtype='object')\n(3263, 4)\n","output_type":"stream"},{"execution_count":7,"output_type":"execute_result","data":{"text/plain":" id keyword location text\n0 0 NaN NaN Just happened a terrible car crash\n1 2 NaN NaN Heard about #earthquake is different cities, s...\n2 3 NaN NaN there is a forest fire at spot pond, geese are...\n3 9 NaN NaN Apocalypse lighting. 
#Spokane #wildfires\n4 11 NaN NaN Typhoon Soudelor kills 28 in China and Taiwan","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>keyword</th>\n <th>location</th>\n <th>text</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Just happened a terrible car crash</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Heard about #earthquake is different cities, s...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>there is a forest fire at spot pond, geese are...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>9</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Apocalypse lighting. #Spokane #wildfires</td>\n </tr>\n <tr>\n <th>4</th>\n <td>11</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Typhoon Soudelor kills 28 in China and Taiwan</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"markdown","source":["let's check the null 
values"],"metadata":{"_uuid":"0a7e59be-5a02-438d-8ecb-334ada13bce9","_cell_guid":"1de18dac-da71-4512-9556-7f7d4ebea7f1","trusted":true,"id":"uOCaMj6Uq8Im"}},{"cell_type":"code","source":["df_train.isna().sum()"],"metadata":{"_uuid":"39359b8a-160a-4718-ace0-e289ac71f01e","_cell_guid":"a31bdec4-7cd0-4466-989b-cdcdbc10bee4","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:20.969504Z","iopub.execute_input":"2023-10-27T19:41:20.970222Z","iopub.status.idle":"2023-10-27T19:41:20.982161Z","shell.execute_reply.started":"2023-10-27T19:41:20.970192Z","shell.execute_reply":"2023-10-27T19:41:20.981195Z"},"trusted":true,"id":"jENpYCHVq8In","outputId":"505c10ff-2c1d-452c-d4cb-19fe183fb434"},"execution_count":null,"outputs":[{"execution_count":8,"output_type":"execute_result","data":{"text/plain":"id 0\nkeyword 61\nlocation 2533\ntext 0\ntarget 0\ndtype: int64"},"metadata":{}}]},{"cell_type":"markdown","source":["we can see that the columns \"keyword\" and \"location\" contain missing values (61 and 2533 of the 7613 rows respectively), and the model below only uses the tweet text...\n","so let's drop them"],"metadata":{"_uuid":"2676e5e8-c49a-4578-9bbc-75cd5c301b6a","_cell_guid":"999b0c97-291d-43e1-9b35-e5e0fd29be82","trusted":true,"id":"dikPAsUwq8In"}},{"cell_type":"code","source":["df_train= df_train.drop(columns=['keyword', 'location'])\n","df_test= df_test.drop(columns=['keyword', 'location'])\n","print(\"training dataset columns:\",df_train.columns)\n","print(\"testing dataset 
columns:\",df_test.columns)"],"metadata":{"_uuid":"c626b9de-06b9-4d4f-af1c-59bf7499196e","_cell_guid":"27988b26-0c42-4d8b-b427-1e5c3e157e64","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:21.865299Z","iopub.execute_input":"2023-10-27T19:41:21.865625Z","iopub.status.idle":"2023-10-27T19:41:21.883019Z","shell.execute_reply.started":"2023-10-27T19:41:21.865600Z","shell.execute_reply":"2023-10-27T19:41:21.882046Z"},"trusted":true,"id":"f0zEWKd9q8Io","outputId":"58efec41-8701-4295-f73f-d7c7fe6cfe64"},"execution_count":null,"outputs":[{"name":"stdout","text":"training dataset columns: Index(['id', 'text', 'target'], dtype='object')\ntesting dataset columns: Index(['id', 'text'], dtype='object')\n","output_type":"stream"}]},{"cell_type":"markdown","source":["Now let's check if the two target classes are represented equaly or if one class is under-represented."],"metadata":{"_uuid":"866bd007-6ccc-4e9c-aff7-52edd786dde9","_cell_guid":"3859d518-440f-4034-8556-02a7c0b274d3","trusted":true,"id":"TYc--H-_q8Ip"}},{"cell_type":"code","source":["# Count the number of tweets in each class\n","class_counts = df_train['target'].value_counts()\n","print('0 = Not a Disaster Tweet counts, 1 = Disaster Tweet counts')\n","print(class_counts)\n","\n","# Create a bar plot for the Class Distribution\n","plt.figure(figsize=(6, 4))\n","class_counts.plot(kind='bar', color=['green', 'red'])\n","plt.title('Class Distribution in Training Data - \"target\" column')\n","plt.xlabel('Class (0: Not Disaster, 1: 
Disaster)')\n","plt.ylabel('Count')\n","plt.xticks(rotation=0)\n","plt.show()"],"metadata":{"_uuid":"f01247f3-f5cc-4606-98c9-e5708bb301ea","_cell_guid":"8f319ce4-f9e0-4376-ad1c-d12a42a07dc0","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:23.009894Z","iopub.execute_input":"2023-10-27T19:41:23.010262Z","iopub.status.idle":"2023-10-27T19:41:23.288651Z","shell.execute_reply.started":"2023-10-27T19:41:23.010234Z","shell.execute_reply":"2023-10-27T19:41:23.287774Z"},"trusted":true,"id":"qt02hGGkq8Ip","outputId":"0930901a-aeec-4157-984f-434a7ff78d0d"},"execution_count":null,"outputs":[{"name":"stdout","text":"0 = Not a Disaster Tweet counts, 1 = Disaster Tweet counts\ntarget\n0 4342\n1 3271\nName: count, dtype: int64\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"<Figure size 600x400 with 1 Axes>","image/png":""},"metadata":{}}]},{"cell_type":"markdown","source":["from the previous bar plot we can see that we have 4342 tweets that don't indicate a disaster and 3271 tweets that indicate a disaster, which is a fair representation of the two classes."],"metadata":{"_uuid":"0f63f459-e975-41b2-9aa9-d303aa018970","_cell_guid":"abea9f31-58e7-4919-852a-fa6800caad21","trusted":true,"id":"f-3PoibEq8Iq"}},{"cell_type":"markdown","source":["let's take a random sample from the dataset \"text\" column to check the data:"],"metadata":{"_uuid":"97211c76-c937-4d40-a649-235edc282728","_cell_guid":"f1c9b925-ea8c-4ce8-897b-69883080bcaf","trusted":true,"id":"8pN2pN0tq8Ir"}},{"cell_type":"code","source":["random_sample_text = random.sample(list(df_train['text']), 10)\n","for i, text in enumerate(random_sample_text):\n"," print(f'text n_{i}: 
{text}\\n')"],"metadata":{"_uuid":"6d1b27f6-e58d-4d4d-94ae-6fff760b6ace","_cell_guid":"beef0338-988f-45b4-ad77-909f7380473b","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:24.727975Z","iopub.execute_input":"2023-10-27T19:41:24.728348Z","iopub.status.idle":"2023-10-27T19:41:24.735998Z","shell.execute_reply.started":"2023-10-27T19:41:24.728318Z","shell.execute_reply":"2023-10-27T19:41:24.735013Z"},"trusted":true,"id":"sLVaE8VQq8Ir","outputId":"88a3e2f1-44f9-448a-9f2b-14e5c5a085f9"},"execution_count":null,"outputs":[{"name":"stdout","text":"text n_0: i be on that hotboy shit\n\ntext n_1: Trial Date Set for Man Charged with Arson Burglary http://t.co/WftCrLz32P\n\ntext n_2: waiting for my chocolate lava cakes to get here ??????\n\ntext n_3: Patient-reported outcomes in long-term survivors of metastatic colorectal cancer - British Journal of Surgery http://t.co/5Yl4DC1Tqt\n\ntext n_4: @Rebelmage2 I'm glad you got away XD But My 'be safe' was in reference to a tornado near calgary and drum heller at around 4 :O\n\ntext n_5: Texas Seeks Comment on Rules for Changes to Windstorm Insurer http://t.co/BZ07c9WthX via @ijournal\n\ntext n_6: @godsfirstson1 and she wrapped his coat around herself. 
It practically engulfed her.\n\ntext n_7: Kosciusko police investigating pedestrian fatality hit by a train Thursday http://t.co/m5djLLxoZP\n\ntext n_8: One Direction Is my pick for http://t.co/q2eBlOKeVE Fan Army #Directioners http://t.co/eNCmhz6y34 x1441\n\ntext n_9: I'm drowning in hw now and that's w/o going to swim ohlordy\n\n","output_type":"stream"}]},{"cell_type":"markdown","source":["after taking a random sample we can see that the text contains URLs that don't provide any useful information, so let's attempt to remove them...\n","also, users almost always add these URLs at the end of the tweet.\n","so we can remove the URLs by using regular expressions"],"metadata":{"_uuid":"3be35232-2f02-4877-8e29-3b1d1f06eb90","_cell_guid":"5fdc858c-7e7f-4a7f-bd4c-d829913177c1","trusted":true,"id":"ChsFU4c7q8Is"}},{"cell_type":"code","source":["url_rgex = r'(http\\S*)'\n","\n","df_train['text'] = df_train['text'].str.replace(url_rgex, '', regex=True)\n","df_test['text'] = df_test['text'].str.replace(url_rgex, '', regex=True)"],"metadata":{"_uuid":"8e2300a3-11d4-4cb0-89eb-b9c8336d8ab9","_cell_guid":"6d14a396-a643-4059-a278-627781352814","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:25.925474Z","iopub.execute_input":"2023-10-27T19:41:25.925857Z","iopub.status.idle":"2023-10-27T19:41:25.946391Z","shell.execute_reply.started":"2023-10-27T19:41:25.925827Z","shell.execute_reply":"2023-10-27T19:41:25.945488Z"},"trusted":true,"id":"nSaVmheFq8It"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["random_sample_text = random.sample(list(df_train['text']), 2)\n","for i, text in enumerate(random_sample_text):\n"," print(f'text n_{i}: 
{text}\\n')"],"metadata":{"_uuid":"0257b086-caaf-46de-82ce-e66f9148548b","_cell_guid":"75011780-41e1-4270-bdc0-3a92f61af657","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:26.780011Z","iopub.execute_input":"2023-10-27T19:41:26.780832Z","iopub.status.idle":"2023-10-27T19:41:26.788250Z","shell.execute_reply.started":"2023-10-27T19:41:26.780787Z","shell.execute_reply":"2023-10-27T19:41:26.787059Z"},"trusted":true,"id":"1ADQwTspq8Iu","outputId":"78b80bb6-e19a-4a2a-b49a-213fcd0bd1c3"},"execution_count":null,"outputs":[{"name":"stdout","text":"text n_0: Obama Declares Disaster for Typhoon-Devastated Saipan: Obama signs disaster declaration for Northern Marians a... \n\ntext n_1: Now Trending in Nigeria: Police charge traditional ruler others with informantΒΓΒͺs murder \n\n","output_type":"stream"}]},{"cell_type":"markdown","source":["another thing to notice that there are alot of users names and hastags which don't provide a lot of information since users also sometimes use the most famous tags to increase tweets reach and the names doesn't provide alot of valuable information so let's remove them:"],"metadata":{"_uuid":"73d9d07b-6d88-4492-9441-d33a6a444461","_cell_guid":"4d3ba251-6716-47eb-8f38-9c141753cb32","trusted":true,"id":"ctuZ8JRTq8Iu"}},{"cell_type":"code","source":["user_regex = r'(@\\S*)'\n","df_train['text'] = df_train['text'].str.replace(user_regex, '', regex=True)\n","df_test['text'] = df_test['text'].str.replace(user_regex, '', 
regex=True)"],"metadata":{"_uuid":"c42034b7-e79b-47eb-a3fa-e5b3fce4ae63","_cell_guid":"5921e2fc-f654-41f7-9267-88a173de2e4d","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:28.696734Z","iopub.execute_input":"2023-10-27T19:41:28.697136Z","iopub.status.idle":"2023-10-27T19:41:28.715136Z","shell.execute_reply.started":"2023-10-27T19:41:28.697106Z","shell.execute_reply":"2023-10-27T19:41:28.714127Z"},"trusted":true,"id":"SQcB2dD-q8Iv"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["tags_regex = r'(#)'\n","df_train['text'] = df_train['text'].str.replace(tags_regex, '', regex=True)\n","df_test['text'] = df_test['text'].str.replace(tags_regex, '', regex=True)"],"metadata":{"_uuid":"f63bab01-04fd-454c-b128-589549169a9a","_cell_guid":"70f86e6b-7357-4c27-b9bd-edf6d0fcabf5","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:29.579409Z","iopub.execute_input":"2023-10-27T19:41:29.579802Z","iopub.status.idle":"2023-10-27T19:41:29.597782Z","shell.execute_reply.started":"2023-10-27T19:41:29.579767Z","shell.execute_reply":"2023-10-27T19:41:29.596900Z"},"trusted":true,"id":"-aYahFSNq8Iv"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["let's remove non-english alphabets"],"metadata":{"_uuid":"9888805b-4374-481b-8f32-79e828744829","_cell_guid":"e3538a3f-ac65-4f27-b8ec-b86c73592c9c","trusted":true,"id":"yGqB6QNWq8Iw"}},{"cell_type":"code","source":["error_pattern = r'(ΒΓ\\S*) | (ΒΓΒͺ)'\n","\n","df_train['text'] = df_train['text'].str.replace(error_pattern, '', regex=True)\n","df_test['text'] = df_test['text'].str.replace(error_pattern, '', 
regex=True)"],"metadata":{"_uuid":"e3107e9a-ff52-4a4c-9230-759c386cc2f3","_cell_guid":"1941cf2b-4f0c-4fbd-b67c-a1e6660b73d9","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:30.954944Z","iopub.execute_input":"2023-10-27T19:41:30.955819Z","iopub.status.idle":"2023-10-27T19:41:31.036663Z","shell.execute_reply.started":"2023-10-27T19:41:30.955785Z","shell.execute_reply":"2023-10-27T19:41:31.035742Z"},"trusted":true,"id":"VZKt2Qf8q8Iw"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["let's check another random sample after all the preprocessing:"],"metadata":{"_uuid":"2c9cb783-c9f0-4d93-a321-49107ff30e28","_cell_guid":"f88d16a1-b988-4c92-93c4-7e4fcfe2a1d7","trusted":true,"id":"y7OwlQ3eq8Ix"}},{"cell_type":"code","source":["random_sample_text = random.sample(list(df_train['text']), 10)\n","for i, text in enumerate(random_sample_text):\n"," print(f'text n_{i}: {text}\\n')"],"metadata":{"_uuid":"1fb72e54-69e1-4fa2-9dad-8a5f997ca85b","_cell_guid":"3a0c12e4-7645-46ac-97ac-f35829c1479d","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:32.387128Z","iopub.execute_input":"2023-10-27T19:41:32.388040Z","iopub.status.idle":"2023-10-27T19:41:32.398047Z","shell.execute_reply.started":"2023-10-27T19:41:32.387980Z","shell.execute_reply":"2023-10-27T19:41:32.396742Z"},"trusted":true,"id":"NETkj9Ajq8Iy","outputId":"b0f86893-d576-4b67-a875-a6b6aa55104e"},"execution_count":null,"outputs":[{"name":"stdout","text":"text n_0: Criminals Who Hijack Lorries And Buses Arrested In Enugu: According to the Nigerian Police Force... Via \n\ntext n_1: in 5 min // Speaker Deck\n\ntext n_2: NBCNightlyNews: Malaysian Officials Say Debris Found on Reunion Island Is From MH370. BillNeelyNBC reports: \n\ntext n_3: 'Nobody remembers who came in second.' 
Charles Schulz\n\ntext n_4: Heavy smoke pouring out of buildings on fire in Port Coquitlam \n\ntext n_5: Green line service on south side disrupted after CTA train derails passengers evacuated. \n\ntext n_6: MORE-->OSHA officers on siteinvestigating Noranda explosion -KFVS12 News Cape Girardeau Carbondale Poplar Bluff \n\ntext n_7: trapped in its disappearance\n\ntext n_8: lets hope it's a upper class white mass murderer....''' Mmmm\n\ntext n_9: Summer is lovely\n\n","output_type":"stream"}]},{"cell_type":"markdown","source":["**Now this dataset makes more sense**"],"metadata":{"_uuid":"d3134ebc-eaf9-4817-b09a-e1f11b6865db","_cell_guid":"c1f2dc4b-817a-4e51-b3d6-a9dac7820dcc","trusted":true,"id":"UzMYyknqq8Iz"}},{"cell_type":"markdown","source":["### tokenizing and creating the mask"],"metadata":{"_uuid":"f25843b2-d303-44c2-a3be-25131e5c7e89","_cell_guid":"cf16e56b-de11-41c7-ab6e-be973314dec1","trusted":true,"id":"Ak531eHNq8Iz"}},{"cell_type":"markdown","source":["I will be using the tokenizer from huggingface [sacculifer/dimbat_disaster_distilbert](http://huggingface.co/sacculifer/dimbat_disaster_distilbert) because it will serve us well in this task"],"metadata":{"_uuid":"d0a28b79-b94e-423e-a950-f24675147c0f","_cell_guid":"51d24430-4242-4931-a574-6bce4ad090a6","trusted":true,"id":"mewZUuAJq8I0"}},{"cell_type":"code","source":["tokenizer = AutoTokenizer.from_pretrained(\"sacculifer/dimbat_disaster_distilbert\", 
do_lower_case=True)"],"metadata":{"_uuid":"c78dd8f4-6f13-4974-ba71-10dbd558a532","_cell_guid":"d29eac28-1942-4e58-80ab-6778026cb2f9","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:34.502573Z","iopub.execute_input":"2023-10-27T19:41:34.503050Z","iopub.status.idle":"2023-10-27T19:41:35.339038Z","shell.execute_reply.started":"2023-10-27T19:41:34.503006Z","shell.execute_reply":"2023-10-27T19:41:35.338022Z"},"trusted":true,"id":"24yT7-qFq8I0","outputId":"f415002d-e201-435d-af99-ebd5a461b90d","colab":{"referenced_widgets":["3ff5527998da43628ebd563410b4f3fb","c77f453507be4607a43862b8a7338730","6ef2876ce615455396fd0215b2a4171f","f328f49ca29d4e258538071541666cd5"]}},"execution_count":null,"outputs":[{"output_type":"display_data","data":{"text/plain":"Downloading (β¦)okenizer_config.json: 0%| | 0.00/333 [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"3ff5527998da43628ebd563410b4f3fb"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Downloading (β¦)solve/main/vocab.txt: 0%| | 0.00/232k [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"c77f453507be4607a43862b8a7338730"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Downloading (β¦)/main/tokenizer.json: 0%| | 0.00/711k [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"6ef2876ce615455396fd0215b2a4171f"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Downloading (β¦)cial_tokens_map.json: 0%| | 0.00/112 [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"f328f49ca29d4e258538071541666cd5"}},"metadata":{}}]},{"cell_type":"markdown","source":["Now since we are planning to work with transformers, we should create the masks for the inputs, The mask will tokanize the inputs and add paddings making all of 
the inputs the same length so they can be processed by the model."],"metadata":{"_uuid":"5816763d-7113-453a-9123-708193221bcc","_cell_guid":"875c4ff6-b4e4-41ed-a484-f83584feb8fa","trusted":true,"id":"UNkipMVvq8I2"}},{"cell_type":"code","source":["def create_masks(texts, max_length=512):\n","    \"\"\"Tokenize each text and return (input_ids, attention_masks) as two parallel lists.\n","\n","    Every sequence is padded or truncated to exactly `max_length` tokens; the\n","    default of 512 matches the Input(shape=(512,)) layers of the model defined later.\n","    \"\"\"\n","    input_ids = []\n","    attention_masks = []\n","\n","    for text in texts:\n","        token = tokenizer(\n","            text,\n","            padding=\"max_length\",\n","            truncation=True,  # without this an over-long text yields a longer row than the others\n","            max_length=max_length,\n","            add_special_tokens=True,\n","            return_attention_mask=True,\n","            return_token_type_ids=False\n","        )\n","        input_ids.append(token['input_ids'])\n","        attention_masks.append(token['attention_mask'])\n","    return input_ids, attention_masks"],"metadata":{"_uuid":"81ba0640-b7d0-46be-be1c-08f6fbe6c06c","_cell_guid":"6772d5e3-10b1-4054-9e72-2849b2a673bb","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:35.774651Z","iopub.execute_input":"2023-10-27T19:41:35.775608Z","iopub.status.idle":"2023-10-27T19:41:35.781268Z","shell.execute_reply.started":"2023-10-27T19:41:35.775572Z","shell.execute_reply":"2023-10-27T19:41:35.780280Z"},"trusted":true,"id":"LBCR8b_1q8I2"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["input_ids_train, attention_mask_train = create_masks(df_train['text'])\n","input_ids_test, attention_mask_test = create_masks(df_test['text'])"],"metadata":{"_uuid":"9e2dd218-097b-4fbc-9f61-cbaa9f6d3d55","_cell_guid":"cf037f6e-00bf-45ab-bcc9-74c0ad2a94e4","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:36.312626Z","iopub.execute_input":"2023-10-27T19:41:36.313522Z","iopub.status.idle":"2023-10-27T19:41:38.447797Z","shell.execute_reply.started":"2023-10-27T19:41:36.313488Z","shell.execute_reply":"2023-10-27T19:41:38.446990Z"},"trusted":true,"id":"lgOdZybkq8I3"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["specify that we will be using a GPU 
device"],"metadata":{"_uuid":"e2a11011-9f12-4c97-8104-f259a325cc4c","_cell_guid":"89a20422-bd8e-43e4-aafb-2342058f96ea","trusted":true,"id":"QboHDymFq8I3"}},{"cell_type":"code","source":["gpu_devices = tf.config.experimental.list_physical_devices(\"GPU\")\n","for device in gpu_devices:\n"," tf.config.experimental.set_memory_growth(device, True)"],"metadata":{"_uuid":"622dc603-7800-4618-b03e-fc7a35ba81af","_cell_guid":"829a6989-2794-4393-a1b7-70b57797a163","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:38.449314Z","iopub.execute_input":"2023-10-27T19:41:38.449648Z","iopub.status.idle":"2023-10-27T19:41:38.546262Z","shell.execute_reply.started":"2023-10-27T19:41:38.449621Z","shell.execute_reply":"2023-10-27T19:41:38.545201Z"},"trusted":true,"id":"DzQkSI-Mq8I4"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["create new datasets of tensors using the tokens that was created previusly to train our models on them"],"metadata":{"_uuid":"29f3d60d-1b52-4597-bd3d-cdf09509d5d4","_cell_guid":"f36c9076-e703-4c83-9f31-1a2ba02d6fee","trusted":true,"id":"CFzBJbSrq8I5"}},{"cell_type":"code","source":["labels_train= df_train['target']\n","\n","ds_train_ = tf.data.Dataset.from_tensor_slices((input_ids_train, attention_mask_train, labels_train))\n","ds_test = tf.data.Dataset.from_tensor_slices((input_ids_test, attention_mask_test))\n","\n","\n","def map_train(ids, mask, labels):\n"," return {\n"," 'input_ids': ids,\n"," 'attention_mask': mask\n"," }, labels\n","\n","def map_test(ids, mask):\n"," return {\n"," \"input_ids\": ids,\n"," \"attention_mask\": mask\n"," }\n","ds_train_ = (ds_train_.map(map_train)).batch(24)\n","ds_test = 
(ds_test.map(map_test)).batch(24)"],"metadata":{"_uuid":"f0f7cbf9-e594-47b3-95e8-60bdff0a3f46","_cell_guid":"aa0fe5b1-a253-4c53-b3e2-06076012d78f","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:40.476874Z","iopub.execute_input":"2023-10-27T19:41:40.477224Z","iopub.status.idle":"2023-10-27T19:42:20.036287Z","shell.execute_reply.started":"2023-10-27T19:41:40.477199Z","shell.execute_reply":"2023-10-27T19:42:20.035460Z"},"trusted":true,"id":"pqInicSvq8I6"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["let's free up some space :)"],"metadata":{"_uuid":"5f5d8f89-6cc7-4479-bd70-45c2d67dcf51","_cell_guid":"d5932f7f-34c5-449d-a2a8-6fcecb3819ff","trusted":true,"id":"lWCW56_oq8I6"}},{"cell_type":"code","source":["del input_ids_train, attention_mask_train, input_ids_test, attention_mask_test"],"metadata":{"_uuid":"82a9a5a6-abd3-4ab3-8b50-028efd584d23","_cell_guid":"c086b43a-2c78-42c7-b435-8b6e2dbc1759","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:42:20.037880Z","iopub.execute_input":"2023-10-27T19:42:20.038176Z","iopub.status.idle":"2023-10-27T19:42:20.082056Z","shell.execute_reply.started":"2023-10-27T19:42:20.038136Z","shell.execute_reply":"2023-10-27T19:42:20.080785Z"},"trusted":true,"id":"taxJDrfsq8I7"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["split train dataset into train and validation:"],"metadata":{"_uuid":"372c39a1-20a7-43ad-b332-d5e4d5a33b04","_cell_guid":"96ba53b2-2ee4-4363-87f8-a9a0710fe439","trusted":true,"id":"rWRzxZwzq8I8"}},{"cell_type":"code","source":["size_train = int(len(ds_train_) * 0.8)\n","\n","ds_train = ds_train_.take(size_train)\n","# skip from the full dataset: ds_train only holds size_train batches, so skipping that many from it leaves nothing\n","ds_validation = ds_train_.skip(size_train)\n","\n","del 
ds_train_"],"metadata":{"_uuid":"de7efbe1-8efa-481f-ac05-ed3eeff4f608","_cell_guid":"afa620e5-a5c2-49b2-801e-27db37fd2a43","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:42:20.083365Z","iopub.execute_input":"2023-10-27T19:42:20.085285Z","iopub.status.idle":"2023-10-27T19:42:20.104967Z","shell.execute_reply.started":"2023-10-27T19:42:20.085247Z","shell.execute_reply":"2023-10-27T19:42:20.104041Z"},"trusted":true,"id":"Nk6BC8DFq8I8"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## the Model"],"metadata":{"_uuid":"ba658825-9989-49cf-87dc-228eae299aae","_cell_guid":"673dcc89-5458-4191-8638-973265fffcd6","trusted":true,"id":"AJyzz5Vnq8I9"}},{"cell_type":"markdown","source":["I will be using the pretrained huggingface model [sacculifer/dimbat_disaster_distilbert](https://huggingface.co/sacculifer/dimbat_disaster_distilbert) to initialize the model's weights"],"metadata":{"_uuid":"4d95cf08-daef-4821-9c40-94b408626330","_cell_guid":"ccd27eea-15ea-4a13-8037-46eebb910727","trusted":true,"id":"QIOFdy3wq8I-"}},{"cell_type":"code","source":["config = AutoConfig.from_pretrained('sacculifer/dimbat_disaster_distilbert')\n","transformer = TFDistilBertModel.from_pretrained(\"sacculifer/dimbat_disaster_distilbert\", 
config=config)"],"metadata":{"_uuid":"2f48e550-7076-40ba-ad36-abf53c4a6919","_cell_guid":"0823482a-9592-40f9-92bb-1ab419160f93","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:42:20.106932Z","iopub.execute_input":"2023-10-27T19:42:20.107207Z","iopub.status.idle":"2023-10-27T19:42:31.290686Z","shell.execute_reply.started":"2023-10-27T19:42:20.107182Z","shell.execute_reply":"2023-10-27T19:42:31.289623Z"},"trusted":true,"id":"gux6T_8qq8I-","outputId":"ebfb8a76-7126-4ea3-8302-93aeb2821588","colab":{"referenced_widgets":["88e20c8b33844d0d856cce90d9ac0590","f51e35b0e297428187fafd732124d8af"]}},"execution_count":null,"outputs":[{"output_type":"display_data","data":{"text/plain":"Downloading (β¦)lve/main/config.json: 0%| | 0.00/557 [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"88e20c8b33844d0d856cce90d9ac0590"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"Downloading tf_model.h5: 0%| | 0.00/268M [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"f51e35b0e297428187fafd732124d8af"}},"metadata":{}},{"name":"stderr","text":"Some layers from the model checkpoint at sacculifer/dimbat_disaster_distilbert were not used when initializing TFDistilBertModel: ['pre_classifier', 'classifier', 'dropout_19']\n- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\nAll the layers of TFDistilBertModel were initialized from the model checkpoint at sacculifer/dimbat_disaster_distilbert.\nIf your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.\n","output_type":"stream"}]},{"cell_type":"markdown","source":["define the tensorflow model"],"metadata":{"_uuid":"de6d8d1e-3b88-4046-978a-62f77278eca0","_cell_guid":"bac2575a-e0ab-4bbd-8781-ac0f2b6f3b58","trusted":true,"id":"X6SiYEkSq8I_"}},{"cell_type":"code","source":["input_ids = tf.keras.layers.Input(shape=(512,), name='input_ids', dtype='int32')\n","attention_mask = tf.keras.layers.Input(shape=(512,), name='attention_mask', dtype='int32')\n","\n","embeddings = transformer(input_ids, attention_mask=attention_mask)[0]\n","pooling = tf.keras.layers.GlobalAveragePooling1D()(embeddings)\n","\n","net = tf.keras.layers.BatchNormalization()(pooling)\n","net = tf.keras.layers.Dense(1024, activation='relu')(net)\n","net = tf.keras.layers.Dropout(0.2)(net)\n","net = tf.keras.layers.Dense(1024, activation='relu')(net)\n","net = tf.keras.layers.Dropout(0.2)(net)\n","net = tf.keras.layers.Dense(1, activation='sigmoid')(net)\n","\n","model = tf.keras.Model(inputs=(input_ids, attention_mask), outputs=net)\n","model.layers[2].trainable = True # True = fine-tune (do not freeze) the transformer 
layers"],"metadata":{"_uuid":"52559397-3681-453e-a178-7a517f842ddd","_cell_guid":"bc766da5-5ba8-4e99-8a85-58b7d1dbf3bc","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:42:31.292394Z","iopub.execute_input":"2023-10-27T19:42:31.292705Z","iopub.status.idle":"2023-10-27T19:42:35.561708Z","shell.execute_reply.started":"2023-10-27T19:42:31.292677Z","shell.execute_reply":"2023-10-27T19:42:35.560785Z"},"trusted":true,"id":"zPzy-k1Hq8JA"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["model.compile(\n"," optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),\n"," loss=tf.keras.losses.BinaryCrossentropy(),\n"," metrics=['accuracy']\n",")"],"metadata":{"_uuid":"91f43001-936f-4ba1-9530-b5c91a5746e9","_cell_guid":"ecde80c4-0bcf-4116-b0e5-6206eace3e76","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:42:35.562760Z","iopub.execute_input":"2023-10-27T19:42:35.563064Z","iopub.status.idle":"2023-10-27T19:42:35.581654Z","shell.execute_reply.started":"2023-10-27T19:42:35.563038Z","shell.execute_reply":"2023-10-27T19:42:35.580942Z"},"trusted":true,"id":"6P-Hm-IRq8JB"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["model.summary()"],"metadata":{"_uuid":"afedab0b-0344-4d2a-bfdf-f75955604857","_cell_guid":"9ffd434d-a42d-4d29-9718-2c9e201311e2","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:42:35.582647Z","iopub.execute_input":"2023-10-27T19:42:35.582931Z","iopub.status.idle":"2023-10-27T19:42:35.623983Z","shell.execute_reply.started":"2023-10-27T19:42:35.582907Z","shell.execute_reply":"2023-10-27T19:42:35.623093Z"},"trusted":true,"id":"JA4nYPynq8JB","outputId":"dc7816cb-20f1-4989-ae0b-f688b34c5e65"},"execution_count":null,"outputs":[{"name":"stdout","text":"Model: \"model\"\n__________________________________________________________________________________________________\n Layer (type) Output Shape Param # Connected to 
\n==================================================================================================\n input_ids (InputLayer) [(None, 512)] 0 [] \n \n attention_mask (InputLayer) [(None, 512)] 0 [] \n \n tf_distil_bert_model (TFDistil TFBaseModelOutput(l 66362880 ['input_ids[0][0]', \n BertModel) ast_hidden_state=(N 'attention_mask[0][0]'] \n one, 512, 768), \n hidden_states=None \n , attentions=None) \n \n global_average_pooling1d (Glob (None, 768) 0 ['tf_distil_bert_model[0][0]'] \n alAveragePooling1D) \n \n batch_normalization (BatchNorm (None, 768) 3072 ['global_average_pooling1d[0][0]'\n alization) ] \n \n dense (Dense) (None, 1024) 787456 ['batch_normalization[0][0]'] \n \n dropout_19 (Dropout) (None, 1024) 0 ['dense[0][0]'] \n \n dense_1 (Dense) (None, 1024) 1049600 ['dropout_19[0][0]'] \n \n dropout_20 (Dropout) (None, 1024) 0 ['dense_1[0][0]'] \n \n dense_2 (Dense) (None, 1) 1025 ['dropout_20[0][0]'] \n \n==================================================================================================\nTotal params: 68,204,033\nTrainable params: 68,202,497\nNon-trainable params: 1,536\n__________________________________________________________________________________________________\n","output_type":"stream"}]},{"cell_type":"code","source":["\"\"\"early_stopping = tf.keras.callbacks.EarlyStopping(\n"," monitor='val_loss',\n"," patience=2\n",")\n","history = model.fit(\n"," ds_train,\n"," callbacks=[early_stopping],\n"," validation_data=(ds_validation),\n"," 
epochs=4\n",")\"\"\""],"metadata":{"_uuid":"7a363172-e8bf-4543-a5eb-53fee42e230c","_cell_guid":"bf3f9353-5646-4470-993c-34f23375e889","execution":{"iopub.status.busy":"2023-10-26T17:55:22.185808Z","iopub.execute_input":"2023-10-26T17:55:22.186507Z","iopub.status.idle":"2023-10-26T17:55:22.190865Z","shell.execute_reply.started":"2023-10-26T17:55:22.186467Z","shell.execute_reply":"2023-10-26T17:55:22.189821Z"},"jupyter":{"outputs_hidden":false},"trusted":true,"id":"2MC8YTTdq8JC"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["checkpoint_path = \"/kaggle/working/training_1/cp.ckpt\"\n","checkpoint_dir = os.path.dirname(checkpoint_path)\n","\n","# Create a callback that saves the model's weights\n","cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,\n"," save_weights_only=True,\n"," verbose=1)"],"metadata":{"_uuid":"ea54b468-5ec8-4e14-9940-2218f9786a16","_cell_guid":"d3b7a67c-b46c-436e-ae78-e4c2f7fcda77","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:42:40.224307Z","iopub.execute_input":"2023-10-27T19:42:40.225056Z","iopub.status.idle":"2023-10-27T19:42:40.230301Z","shell.execute_reply.started":"2023-10-27T19:42:40.225020Z","shell.execute_reply":"2023-10-27T19:42:40.229313Z"},"trusted":true,"id":"AISduOHOq8JD"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["history_ = model.fit(\n"," ds_train,\n"," callbacks=[cp_callback],\n"," validation_data=(ds_validation),\n"," epochs=8\n",")"],"metadata":{"_uuid":"0f0a8d23-b4c2-4f12-afbc-a1a0df84b812","_cell_guid":"dfac40bc-5fbc-4955-958d-bfd588cb960b","jupyter":{"outputs_hidden":false},"_kg_hide-output":true,"trusted":true,"id":"E8Ki0udoq8JE"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["we can see that the training accuracy has improved from 0.73 at the beginning of the training to 0.977, which is a high accuracy (note that it is measured on the training data, not on held-out data):\n","here is the summary of the model's history:\n","> {**'loss'**: 
[0.5624821186065674,\n"," 0.39667385816574097,\n"," 0.29472893476486206,\n"," 0.21065986156463623,\n"," 0.15157125890254974,\n"," 0.11613788455724716,\n"," 0.0892966166138649,\n"," 0.07200820744037628],\n"," **'accuracy'**: [0.7321194410324097,\n"," 0.8330052495002747,\n"," 0.8807414770126343,\n"," 0.9191272854804993,\n"," 0.9465222954750061,\n"," 0.9635826945304871,\n"," 0.9714567065238953,\n"," 0.977854311466217]}"],"metadata":{"_uuid":"284145c1-1eec-4899-9bb2-19c69e11b03f","_cell_guid":"42fef8ca-d698-4059-a98f-90abea1b4c49","trusted":true,"id":"i30MTUP9q8JF"}},{"cell_type":"code","source":["predictions_proba = model.predict(ds_test)"],"metadata":{"_uuid":"a1f7fba1-842f-4a12-adbb-48f557d0a2b9","_cell_guid":"851f878f-9b05-4ace-a7d8-7002a6c6c574","execution":{"iopub.status.busy":"2023-10-27T13:03:59.009556Z","iopub.execute_input":"2023-10-27T13:03:59.010333Z","iopub.status.idle":"2023-10-27T13:04:06.081126Z","shell.execute_reply.started":"2023-10-27T13:03:59.010300Z","shell.execute_reply":"2023-10-27T13:04:06.079878Z"},"jupyter":{"outputs_hidden":false},"trusted":true,"id":"AsSit086q8JG"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["avg_proba = []\n","for x in predictions_proba:\n"," avg_proba.append(np.mean(x))\n","\n","predictions = np.round(avg_proba).astype(np.int32)"],"metadata":{"_uuid":"e7979225-6341-48d1-ba36-cd4617effecc","_cell_guid":"8f5aaa4a-e682-4083-9e20-8ca0e9c47763","execution":{"iopub.status.busy":"2023-10-26T18:44:42.151613Z","iopub.execute_input":"2023-10-26T18:44:42.151905Z","iopub.status.idle":"2023-10-26T18:44:42.205692Z","shell.execute_reply.started":"2023-10-26T18:44:42.151880Z","shell.execute_reply":"2023-10-26T18:44:42.204926Z"},"jupyter":{"outputs_hidden":false},"trusted":true,"id":"y98XHzBFq8JG"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Check model 
accuracy"],"metadata":{"_uuid":"4ad20fe3-0b02-41b4-90cb-dd7de09a65c2","_cell_guid":"be7b15bb-823d-44b2-a0dc-715ee3291ae0","trusted":true,"id":"_Qd0xuRIq8JH"}},{"cell_type":"code","source":["history_.history"],"metadata":{"_uuid":"d31ad7e7-53bd-43e0-baa3-8495dd3c9d0e","_cell_guid":"514a5af4-6394-4859-94bc-f0f947b54f59","execution":{"iopub.status.busy":"2023-10-26T18:44:42.206688Z","iopub.execute_input":"2023-10-26T18:44:42.206950Z","iopub.status.idle":"2023-10-26T18:44:42.213372Z","shell.execute_reply.started":"2023-10-26T18:44:42.206928Z","shell.execute_reply":"2023-10-26T18:44:42.212428Z"},"jupyter":{"outputs_hidden":false},"trusted":true,"id":"3WZEaT1gq8JJ"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["so the accuracy is 0.9779 which is really good"],"metadata":{"_uuid":"d0da4eb2-385b-49ee-b3e0-dc832b062faf","_cell_guid":"966ac4b4-e3fc-4e30-8549-bf1f19ab4143","trusted":true,"id":"zq0j-zJAq8JK"}},{"cell_type":"markdown","source":["### save model"],"metadata":{"_uuid":"ba04aebe-2ca9-4892-a39c-4429fc684181","_cell_guid":"e649253e-49b4-450e-a3f0-97a5de9b5d0d","trusted":true,"id":"fCp3u4Hfq8JK"}},{"cell_type":"code","source":["model.save(\"/kaggle/working/train/custom_model.keras\")"],"metadata":{"_uuid":"4ccc0bf8-ad6a-483a-8482-f7d43324f810","_cell_guid":"f019ad6b-ce06-46ab-893c-e9aa5f6acf53","execution":{"iopub.status.busy":"2023-10-26T18:49:55.262407Z","iopub.execute_input":"2023-10-26T18:49:55.263367Z","iopub.status.idle":"2023-10-26T18:49:56.546981Z","shell.execute_reply.started":"2023-10-26T18:49:55.263325Z","shell.execute_reply":"2023-10-26T18:49:56.546171Z"},"jupyter":{"outputs_hidden":false},"trusted":true,"id":"ancz52Vzq8JL"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## USE the MODEL"],"metadata":{"_uuid":"0bc6c695-b872-463c-83a5-2b4778bf530a","_cell_guid":"6bc64039-3a17-460b-9058-71a10cd5b9b6","trusted":true,"id":"Fk2nL8Ooq8JL"}},{"cell_type":"markdown","source":["### create the test model\n","create 
# a new instance of the model and load the weights

# %%
class twitter_model:
    """DistilBERT-based binary classifier that flags disaster-related text.

    The constructor rebuilds the network (DistilBERT encoder -> global average
    pooling -> batch normalisation -> two 1024-unit ReLU layers, each followed
    by 20% dropout -> one sigmoid unit) and then restores trained weights from
    the file given as ``model_weights``.
    """

    # Hugging Face checkpoint used for both the tokenizer and the encoder.
    PRETRAINED_NAME = "sacculifer/dimbat_disaster_distilbert"
    # Sequence length the Keras input layers are declared with.
    MAX_LEN = 512

    def __init__(self, model_weights="/kaggle/input/model-nlp-twitter/custom_model.keras"):
        """Build the network and load its weights.

        Args:
            model_weights: path of the saved model file whose weights are
                loaded into the freshly built network.
        """
        import warnings

        # Let TensorFlow grow GPU memory on demand. set_memory_growth can raise
        # RuntimeError once the GPUs are already initialised (for example when
        # this class is created in a kernel that has already run a model), so
        # report the problem instead of aborting construction.
        gpu_devices = tf.config.experimental.list_physical_devices("GPU")
        for device in gpu_devices:
            try:
                tf.config.experimental.set_memory_growth(device, True)
            except RuntimeError as err:
                warnings.warn(f"could not enable GPU memory growth: {err}")

        # define a tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.PRETRAINED_NAME, do_lower_case=True)

        # define the pretrained encoder
        config = AutoConfig.from_pretrained(self.PRETRAINED_NAME)
        transformer = TFDistilBertModel.from_pretrained(self.PRETRAINED_NAME, config=config)

        input_ids = tf.keras.layers.Input(shape=(self.MAX_LEN,), name='input_ids', dtype='int32')
        attention_mask = tf.keras.layers.Input(shape=(self.MAX_LEN,), name='attention_mask', dtype='int32')

        # [0] is the encoder's last hidden state; average it over the sequence axis.
        embeddings = transformer(input_ids, attention_mask=attention_mask)[0]
        pooling = tf.keras.layers.GlobalAveragePooling1D()(embeddings)

        net = tf.keras.layers.BatchNormalization()(pooling)
        net = tf.keras.layers.Dense(1024, activation='relu')(net)
        net = tf.keras.layers.Dropout(0.2)(net)
        net = tf.keras.layers.Dense(1024, activation='relu')(net)
        net = tf.keras.layers.Dropout(0.2)(net)
        net = tf.keras.layers.Dense(1, activation='sigmoid')(net)

        self.model = tf.keras.Model(inputs=(input_ids, attention_mask), outputs=net)
        # The encoder is left trainable (NOT frozen), as in the original code.
        # layers[2] is the encoder: it follows the two input layers.
        self.model.layers[2].trainable = True

        self.model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
            loss=tf.keras.losses.BinaryCrossentropy(),
            metrics=['accuracy']
        )

        # Loads the weights
        self.model.load_weights(model_weights)

    def predict(self, text_input="help there is an flood"):
        """Return the model output for a single piece of text.

        Args:
            text_input: one string to classify.

        Returns:
            The array produced by ``self.model.predict`` for a batch of one
            sample; with the single sigmoid unit this is one probability.
        """
        token = self.tokenizer(
            text_input,
            padding="max_length",
            # Without truncation a text longer than MAX_LEN tokens cannot be
            # packed into the fixed (1, MAX_LEN) tensors built below.
            truncation=True,
            max_length=self.MAX_LEN,
            add_special_tokens=True,
            return_attention_mask=True,
            return_token_type_ids=False
        )

        batch_shape = (1, self.MAX_LEN)
        input_ids_tensor = tf.constant(token['input_ids'], dtype=tf.int32, shape=batch_shape)
        attention_mask_tensor = tf.constant(token['attention_mask'], dtype=tf.int32, shape=batch_shape)
        token_tensor = {'input_ids': input_ids_tensor, 'attention_mask': attention_mask_tensor}
        prediction = self.model.predict(token_tensor)
        return prediction

# %%
def _format_verdict(text, is_disaster):
    """Build the sentence that reports the classification of ``text``."""
    if is_disaster:
        verdict = "means there is a disaster"
    else:
        verdict = "means there is NO disaster"
    return f"the text: '{text}' {verdict}"

# %%
weights_path = "/kaggle/input/model-nlp-twitter/custom_model.keras"
model_test = twitter_model(weights_path)

# %%
input_text = "there is a volcano"
prediction = np.round(model_test.predict(input_text))
# prediction holds a single value, so .item() turns it into a plain number.
disaster = prediction.item() == 1
print(_format_verdict(input_text, disaster))

# %% [markdown]
# ## creating the application

# %%
@st.cache_resource
def _load_twitter_model(weights_path):
    """Build the classifier once and reuse it across Streamlit reruns.

    Streamlit re-executes the script on every widget interaction; without the
    cache the network would be rebuilt and its weights reloaded each time.
    """
    return twitter_model(weights_path)


def main():
    """Streamlit entry point: read a sentence and report the model's verdict."""
    st.header('Twitter disaster detector')
    weights_path = os.path.join(os.getcwd(), "custom_model.keras")
    model_test = _load_twitter_model(weights_path)
    # Show the hint as a placeholder so it is not classified as real input.
    input_text = st.text_input("Please enter your sentence:", placeholder="type a word")
    if not input_text.strip():
        return  # nothing typed yet
    prediction = np.round(model_test.predict(input_text))
    disaster = prediction.item() == 1
    st.write(_format_verdict(input_text, disaster))


if __name__ == '__main__':
    main()