{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"colab":{"provenance":[]}},"nbformat_minor":0,"nbformat":4,"cells":[{"cell_type":"code","source":["# This Python 3 environment comes with many helpful analytics libraries installed\n","# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n","# For example, here's several helpful packages to load\n","\n","import numpy as np # linear algebra\n","import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n","\n","# Input data files are available in the read-only \"../input/\" directory\n","# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n","\n","import os\n","for dirname, _, filenames in os.walk('/kaggle/input'):\n"," for filename in filenames:\n"," print(os.path.join(dirname, filename))\n","\n","# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\"\n","# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session"],"metadata":{"_uuid":"d8e725fd-259f-4b0c-8256-134a9f427df7","_cell_guid":"cf754f24-e571-4113-a5ed-656f1f24bce6","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:39:36.406539Z","iopub.execute_input":"2023-10-27T19:39:36.407090Z","iopub.status.idle":"2023-10-27T19:39:36.775683Z","shell.execute_reply.started":"2023-10-27T19:39:36.407063Z","shell.execute_reply":"2023-10-27T19:39:36.774605Z"},"trusted":true,"id":"aLMiXulAq8IQ","executionInfo":{"status":"ok","timestamp":1698517883348,"user_tz":-180,"elapsed":487,"user":{"displayName":"Jasem Almansour","userId":"17867395552910147411"}}},"execution_count":1,"outputs":[]},{"cell_type":"markdown","source":["### install and import important libraries"],"metadata":{"_uuid":"6793367f-b4a4-401a-acdd-f6b5352e182a","_cell_guid":"305a9ca9-d45a-4ef2-8fdb-5cd4c2995a6f","trusted":true,"id":"97oxHEgXq8IY"}},{"cell_type":"code","source":["pip install streamlit"],"metadata":{"_kg_hide-input":true,"execution":{"iopub.status.busy":"2023-10-27T19:40:46.121550Z","iopub.execute_input":"2023-10-27T19:40:46.122052Z","iopub.status.idle":"2023-10-27T19:41:00.894314Z","shell.execute_reply.started":"2023-10-27T19:40:46.122020Z","shell.execute_reply":"2023-10-27T19:41:00.893204Z"},"trusted":true,"id":"tXq3HLXHq8Ib","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1698517896926,"user_tz":-180,"elapsed":10830,"user":{"displayName":"Jasem Almansour","userId":"17867395552910147411"}},"outputId":"48738f3a-72bd-4975-aa11-9df1128d5324"},"execution_count":2,"outputs":[{"output_type":"stream","name":"stdout","text":["Collecting streamlit\n"," Downloading streamlit-1.28.0-py2.py3-none-any.whl (8.4 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.4/8.4 MB\u001b[0m \u001b[31m23.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: altair<6,>=4.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (4.2.2)\n","Requirement already satisfied: blinker<2,>=1.0.0 in /usr/lib/python3/dist-packages (from streamlit) (1.4)\n","Requirement already satisfied: cachetools<6,>=4.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (5.3.2)\n","Requirement already satisfied: click<9,>=7.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (8.1.7)\n","Requirement already satisfied: importlib-metadata<7,>=1.4 in /usr/local/lib/python3.10/dist-packages (from streamlit) (6.8.0)\n","Requirement already satisfied: numpy<2,>=1.19.3 in /usr/local/lib/python3.10/dist-packages (from streamlit) (1.23.5)\n","Requirement already satisfied: packaging<24,>=16.8 in /usr/local/lib/python3.10/dist-packages (from streamlit) (23.2)\n","Requirement already satisfied: pandas<3,>=1.3.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (1.5.3)\n","Requirement already satisfied: pillow<11,>=7.1.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (9.4.0)\n","Requirement already satisfied: protobuf<5,>=3.20 in /usr/local/lib/python3.10/dist-packages (from streamlit) (3.20.3)\n","Requirement already satisfied: pyarrow>=6.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (9.0.0)\n","Requirement already satisfied: python-dateutil<3,>=2.7.3 in /usr/local/lib/python3.10/dist-packages (from streamlit) (2.8.2)\n","Requirement already satisfied: requests<3,>=2.27 in /usr/local/lib/python3.10/dist-packages (from streamlit) (2.31.0)\n","Requirement already satisfied: rich<14,>=10.14.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (13.6.0)\n","Requirement already satisfied: tenacity<9,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (8.2.3)\n","Requirement already satisfied: toml<2,>=0.10.1 in /usr/local/lib/python3.10/dist-packages (from streamlit) (0.10.2)\n","Requirement already satisfied: typing-extensions<5,>=4.3.0 in /usr/local/lib/python3.10/dist-packages (from streamlit) (4.5.0)\n","Requirement already satisfied: tzlocal<6,>=1.1 in /usr/local/lib/python3.10/dist-packages (from streamlit) (5.2)\n","Collecting validators<1,>=0.2 (from streamlit)\n"," Downloading validators-0.22.0-py3-none-any.whl (26 kB)\n","Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)\n"," Downloading GitPython-3.1.40-py3-none-any.whl (190 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m190.6/190.6 kB\u001b[0m \u001b[31m17.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hCollecting pydeck<1,>=0.8.0b4 (from streamlit)\n"," Downloading pydeck-0.8.1b0-py2.py3-none-any.whl (4.8 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.8/4.8 MB\u001b[0m \u001b[31m51.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: tornado<7,>=6.0.3 in /usr/local/lib/python3.10/dist-packages (from streamlit) (6.3.2)\n","Collecting watchdog>=2.1.5 (from streamlit)\n"," Downloading watchdog-3.0.0-py3-none-manylinux2014_x86_64.whl (82 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m82.1/82.1 kB\u001b[0m \u001b[31m8.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: entrypoints in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit) (0.4)\n","Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit) (3.1.2)\n","Requirement already satisfied: jsonschema>=3.0 in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit) (4.19.1)\n","Requirement already satisfied: toolz in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit) (0.12.0)\n","Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.19,<4,>=3.0.7->streamlit)\n"," Downloading gitdb-4.0.11-py3-none-any.whl (62 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.7/62.7 kB\u001b[0m \u001b[31m6.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata<7,>=1.4->streamlit) (3.17.0)\n","Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas<3,>=1.3.0->streamlit) (2023.3.post1)\n","Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil<3,>=2.7.3->streamlit) (1.16.0)\n","Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.27->streamlit) (3.3.1)\n","Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.27->streamlit) (3.4)\n","Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.27->streamlit) (2.0.7)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.27->streamlit) (2023.7.22)\n","Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich<14,>=10.14.0->streamlit) (3.0.0)\n","Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich<14,>=10.14.0->streamlit) (2.16.1)\n","Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython!=3.1.19,<4,>=3.0.7->streamlit)\n"," Downloading smmap-5.0.1-py3-none-any.whl (24 kB)\n","Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->altair<6,>=4.0->streamlit) (2.1.3)\n","Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit) (23.1.0)\n","Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit) (2023.7.1)\n","Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit) (0.30.2)\n","Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit) (0.10.6)\n","Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich<14,>=10.14.0->streamlit) (0.1.2)\n","Installing collected packages: watchdog, validators, smmap, pydeck, gitdb, gitpython, streamlit\n","Successfully installed gitdb-4.0.11 gitpython-3.1.40 pydeck-0.8.1b0 smmap-5.0.1 streamlit-1.28.0 validators-0.22.0 watchdog-3.0.0\n"]}]},{"cell_type":"code","source":["import matplotlib.pyplot as plt\n","import random\n","import keras\n","import tensorflow as tf\n","\n","from transformers import AutoTokenizer\n","from transformers import TFDistilBertModel, AutoConfig\n","\n","import streamlit as st"],"metadata":{"_uuid":"764b0395-0f2e-4a26-b71b-b3b238e65803","_cell_guid":"93240b3c-c91d-45fc-afdc-6e25ccde9a76","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:14.020793Z","iopub.execute_input":"2023-10-27T19:41:14.021294Z","iopub.status.idle":"2023-10-27T19:41:14.988123Z","shell.execute_reply.started":"2023-10-27T19:41:14.021266Z","shell.execute_reply":"2023-10-27T19:41:14.987315Z"},"trusted":true,"id":"2gWitzGQq8Ic","colab":{"base_uri":"https://localhost:8080/","height":383},"executionInfo":{"status":"error","timestamp":1698517925997,"user_tz":-180,"elapsed":652,"user":{"displayName":"Jasem Almansour","userId":"17867395552910147411"}},"outputId":"0ae4e935-7486-475e-eaba-3f3dcbb01b4e"},"execution_count":4,"outputs":[{"output_type":"error","ename":"ModuleNotFoundError","evalue":"ignored","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)","\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtensorflow\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mtransformers\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mAutoTokenizer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mtransformers\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mTFDistilBertModel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mAutoConfig\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'transformers'","","\u001b[0;31m---------------------------------------------------------------------------\u001b[0;32m\nNOTE: If your import is failing due to a missing package, you can\nmanually install dependencies using either !pip or !apt.\n\nTo view examples of installing some common dependencies, click the\n\"Open Examples\" button below.\n\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n"],"errorDetails":{"actions":[{"action":"open_url","actionText":"Open Examples","url":"/notebooks/snippets/importing_libraries.ipynb"}]}}]},{"cell_type":"markdown","source":["### Preprocessing"],"metadata":{"_uuid":"f61c70f1-311b-4dcd-938c-ff4ab102865c","_cell_guid":"b7308c78-1bfa-4dff-988f-7fcb45c5ee31","trusted":true,"id":"6_-QMa4nq8Id"}},{"cell_type":"code","source":["# Load the dataset files\n","df_train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')\n","df_test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')"],"metadata":{"_uuid":"64c41258-61fe-4193-a5c8-00a1fcdcd7cb","_cell_guid":"c0e5785a-3393-4e7a-a573-0da8c001bfa0","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:14.989959Z","iopub.execute_input":"2023-10-27T19:41:14.990260Z","iopub.status.idle":"2023-10-27T19:41:15.103684Z","shell.execute_reply.started":"2023-10-27T19:41:14.990233Z","shell.execute_reply":"2023-10-27T19:41:15.102798Z"},"trusted":true,"id":"DnPsxqPpq8Id","colab":{"base_uri":"https://localhost:8080/","height":355},"executionInfo":{"status":"error","timestamp":1698517943331,"user_tz":-180,"elapsed":425,"user":{"displayName":"Jasem Almansour","userId":"17867395552910147411"}},"outputId":"ce351468-1e00-439f-db42-94160b65ebcd"},"execution_count":5,"outputs":[{"output_type":"error","ename":"FileNotFoundError","evalue":"ignored","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)","\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Load the dataset files\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdf_train\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'/kaggle/input/nlp-getting-started/train.csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mdf_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'/kaggle/input/nlp-getting-started/test.csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/util/_decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnew_arg_name\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnew_arg_value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 211\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 212\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mF\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/util/_decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 329\u001b[0m \u001b[0mstacklevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfind_stack_level\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 330\u001b[0m )\n\u001b[0;32m--> 331\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 332\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 333\u001b[0m \u001b[0;31m# error: \"Callable[[VarArg(Any), KwArg(Any)], Any]\" has no\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)\u001b[0m\n\u001b[1;32m 948\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkwds_defaults\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 949\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 950\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 951\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 952\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 603\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 604\u001b[0m \u001b[0;31m# Create the parser.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 605\u001b[0;31m \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTextFileReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 606\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 607\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mchunksize\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0miterator\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 1440\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1441\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhandles\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mIOHandles\u001b[0m \u001b[0;34m|\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1442\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mengine\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1443\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1444\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/io/parsers/readers.py\u001b[0m in \u001b[0;36m_make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m 1733\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m\"b\"\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1734\u001b[0m \u001b[0mmode\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;34m\"b\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1735\u001b[0;31m self.handles = get_handle(\n\u001b[0m\u001b[1;32m 1736\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1737\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.10/dist-packages/pandas/io/common.py\u001b[0m in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 854\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mioargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencoding\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;34m\"b\"\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mioargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 855\u001b[0m \u001b[0;31m# Encoding\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 856\u001b[0;31m handle = open(\n\u001b[0m\u001b[1;32m 857\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 858\u001b[0m \u001b[0mioargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/kaggle/input/nlp-getting-started/train.csv'"]}]},{"cell_type":"markdown","source":["let's look at the training data"],"metadata":{"_uuid":"5d1d2327-e567-4704-9961-dba574735645","_cell_guid":"00d3bfa5-4e1b-4db8-be5a-55d9516ddb7f","trusted":true,"id":"zTIeDUw8q8If"}},{"cell_type":"code","source":["print(df_train.columns)\n","print(df_train.shape)\n","df_train.head()"],"metadata":{"_uuid":"e6b3baf4-d084-4f76-9ec0-7f54234541cd","_cell_guid":"65b54a29-f70c-4fee-adca-9e2a9451c2e1","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:19.084998Z","iopub.execute_input":"2023-10-27T19:41:19.085364Z","iopub.status.idle":"2023-10-27T19:41:19.108415Z","shell.execute_reply.started":"2023-10-27T19:41:19.085334Z","shell.execute_reply":"2023-10-27T19:41:19.107466Z"},"trusted":true,"id":"FXivPW8Nq8If","outputId":"55982b8b-b60e-4fa6-d608-ff06718367b5"},"execution_count":null,"outputs":[{"name":"stdout","text":"Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')\n(7613, 5)\n","output_type":"stream"},{"execution_count":6,"output_type":"execute_result","data":{"text/plain":" id keyword location text \\\n0 1 NaN NaN Our Deeds are the Reason of this #earthquake M... \n1 4 NaN NaN Forest fire near La Ronge Sask. Canada \n2 5 NaN NaN All residents asked to 'shelter in place' are ... \n3 6 NaN NaN 13,000 people receive #wildfires evacuation or... \n4 7 NaN NaN Just got sent this photo from Ruby #Alaska as ... \n\n target \n0 1 \n1 1 \n2 1 \n3 1 \n4 1 ","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idkeywordlocationtexttarget
01NaNNaNOur Deeds are the Reason of this #earthquake M...1
14NaNNaNForest fire near La Ronge Sask. Canada1
25NaNNaNAll residents asked to 'shelter in place' are ...1
36NaNNaN13,000 people receive #wildfires evacuation or...1
47NaNNaNJust got sent this photo from Ruby #Alaska as ...1
\n
"},"metadata":{}}]},{"cell_type":"markdown","source":["Now let's look at the testing data"],"metadata":{"_uuid":"515ab512-06b7-4cce-9a95-f2916f1fd4ad","_cell_guid":"c96e4868-5978-4ca3-8272-af81f4f1d56f","trusted":true,"id":"jLhs-rRiq8Ig"}},{"cell_type":"code","source":["print(df_test.columns)\n","print(df_test.shape)\n","df_test.head()"],"metadata":{"_uuid":"1ea066a1-3a43-467a-a70f-91c131d5868c","_cell_guid":"c08cc742-e3c4-4a66-be8f-422965d1c8a8","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:19.914598Z","iopub.execute_input":"2023-10-27T19:41:19.914973Z","iopub.status.idle":"2023-10-27T19:41:19.926242Z","shell.execute_reply.started":"2023-10-27T19:41:19.914942Z","shell.execute_reply":"2023-10-27T19:41:19.925362Z"},"trusted":true,"id":"LSuuIC2jq8Il","outputId":"16e482e3-aa53-421a-ef8d-22ba2bfbc96a"},"execution_count":null,"outputs":[{"name":"stdout","text":"Index(['id', 'keyword', 'location', 'text'], dtype='object')\n(3263, 4)\n","output_type":"stream"},{"execution_count":7,"output_type":"execute_result","data":{"text/plain":" id keyword location text\n0 0 NaN NaN Just happened a terrible car crash\n1 2 NaN NaN Heard about #earthquake is different cities, s...\n2 3 NaN NaN there is a forest fire at spot pond, geese are...\n3 9 NaN NaN Apocalypse lighting. #Spokane #wildfires\n4 11 NaN NaN Typhoon Soudelor kills 28 in China and Taiwan","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
idkeywordlocationtext
00NaNNaNJust happened a terrible car crash
12NaNNaNHeard about #earthquake is different cities, s...
23NaNNaNthere is a forest fire at spot pond, geese are...
39NaNNaNApocalypse lighting. #Spokane #wildfires
411NaNNaNTyphoon Soudelor kills 28 in China and Taiwan
\n
"},"metadata":{}}]},{"cell_type":"markdown","source":["let's check the null values"],"metadata":{"_uuid":"0a7e59be-5a02-438d-8ecb-334ada13bce9","_cell_guid":"1de18dac-da71-4512-9556-7f7d4ebea7f1","trusted":true,"id":"uOCaMj6Uq8Im"}},{"cell_type":"code","source":["df_train.isna().sum()"],"metadata":{"_uuid":"39359b8a-160a-4718-ace0-e289ac71f01e","_cell_guid":"a31bdec4-7cd0-4466-989b-cdcdbc10bee4","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:20.969504Z","iopub.execute_input":"2023-10-27T19:41:20.970222Z","iopub.status.idle":"2023-10-27T19:41:20.982161Z","shell.execute_reply.started":"2023-10-27T19:41:20.970192Z","shell.execute_reply":"2023-10-27T19:41:20.981195Z"},"trusted":true,"id":"jENpYCHVq8In","outputId":"505c10ff-2c1d-452c-d4cb-19fe183fb434"},"execution_count":null,"outputs":[{"execution_count":8,"output_type":"execute_result","data":{"text/plain":"id 0\nkeyword 61\nlocation 2533\ntext 0\ntarget 0\ndtype: int64"},"metadata":{}}]},{"cell_type":"markdown","source":["we can easily see that the colomns \"keyword, location\" doesn't provide any useful information since most of them are None values...\n","so let's Drop them"],"metadata":{"_uuid":"2676e5e8-c49a-4578-9bbc-75cd5c301b6a","_cell_guid":"999b0c97-291d-43e1-9b35-e5e0fd29be82","trusted":true,"id":"dikPAsUwq8In"}},{"cell_type":"code","source":["df_train= df_train.drop(columns=['keyword', 'location'])\n","df_test= df_test.drop(columns=['keyword', 'location'])\n","print(\"training dataset columns:\",df_train.columns)\n","print(\"testing dataset columns:\",df_test.columns)"],"metadata":{"_uuid":"c626b9de-06b9-4d4f-af1c-59bf7499196e","_cell_guid":"27988b26-0c42-4d8b-b427-1e5c3e157e64","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:21.865299Z","iopub.execute_input":"2023-10-27T19:41:21.865625Z","iopub.status.idle":"2023-10-27T19:41:21.883019Z","shell.execute_reply.started":"2023-10-27T19:41:21.865600Z","shell.execute_reply":"2023-10-27T19:41:21.882046Z"},"trusted":true,"id":"f0zEWKd9q8Io","outputId":"58efec41-8701-4295-f73f-d7c7fe6cfe64"},"execution_count":null,"outputs":[{"name":"stdout","text":"training dataset columns: Index(['id', 'text', 'target'], dtype='object')\ntesting dataset columns: Index(['id', 'text'], dtype='object')\n","output_type":"stream"}]},{"cell_type":"markdown","source":["Now let's check if the two target classes are represented equaly or if one class is under-represented."],"metadata":{"_uuid":"866bd007-6ccc-4e9c-aff7-52edd786dde9","_cell_guid":"3859d518-440f-4034-8556-02a7c0b274d3","trusted":true,"id":"TYc--H-_q8Ip"}},{"cell_type":"code","source":["# Count the number of tweets in each class\n","class_counts = df_train['target'].value_counts()\n","print('0 = Not a Disaster Tweet counts, 1 = Disaster Tweet counts')\n","print(class_counts)\n","\n","# Create a bar plot for the Class Distribution\n","plt.figure(figsize=(6, 4))\n","class_counts.plot(kind='bar', color=['green', 'red'])\n","plt.title('Class Distribution in Training Data - \"target\" column')\n","plt.xlabel('Class (0: Not Disaster, 1: Disaster)')\n","plt.ylabel('Count')\n","plt.xticks(rotation=0)\n","plt.show()"],"metadata":{"_uuid":"f01247f3-f5cc-4606-98c9-e5708bb301ea","_cell_guid":"8f319ce4-f9e0-4376-ad1c-d12a42a07dc0","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:23.009894Z","iopub.execute_input":"2023-10-27T19:41:23.010262Z","iopub.status.idle":"2023-10-27T19:41:23.288651Z","shell.execute_reply.started":"2023-10-27T19:41:23.010234Z","shell.execute_reply":"2023-10-27T19:41:23.287774Z"},"trusted":true,"id":"qt02hGGkq8Ip","outputId":"0930901a-aeec-4157-984f-434a7ff78d0d"},"execution_count":null,"outputs":[{"name":"stdout","text":"0 = Not a Disaster Tweet counts, 1 = Disaster Tweet counts\ntarget\n0 4342\n1 3271\nName: count, dtype: int64\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"
","image/png":""},"metadata":{}}]},{"cell_type":"markdown","source":["from the previous histogram we can see that we have 4324 tweets that doesn't indicate a disaster and 3271 tweets that indicates a disaster which is a fare representation for the two classes."],"metadata":{"_uuid":"0f63f459-e975-41b2-9aa9-d303aa018970","_cell_guid":"abea9f31-58e7-4919-852a-fa6800caad21","trusted":true,"id":"f-3PoibEq8Iq"}},{"cell_type":"markdown","source":["let's take a random sample from the dataset \"text\" colomn to check the data:"],"metadata":{"_uuid":"97211c76-c937-4d40-a649-235edc282728","_cell_guid":"f1c9b925-ea8c-4ce8-897b-69883080bcaf","trusted":true,"id":"8pN2pN0tq8Ir"}},{"cell_type":"code","source":["random_sample_text = random.sample(list(df_train['text']), 10)\n","for i, text in enumerate(random_sample_text):\n"," print(f'text n_{i}: {text}\\n')"],"metadata":{"_uuid":"6d1b27f6-e58d-4d4d-94ae-6fff760b6ace","_cell_guid":"beef0338-988f-45b4-ad77-909f7380473b","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:24.727975Z","iopub.execute_input":"2023-10-27T19:41:24.728348Z","iopub.status.idle":"2023-10-27T19:41:24.735998Z","shell.execute_reply.started":"2023-10-27T19:41:24.728318Z","shell.execute_reply":"2023-10-27T19:41:24.735013Z"},"trusted":true,"id":"sLVaE8VQq8Ir","outputId":"88a3e2f1-44f9-448a-9f2b-14e5c5a085f9"},"execution_count":null,"outputs":[{"name":"stdout","text":"text n_0: i be on that hotboy shit\n\ntext n_1: Trial Date Set for Man Charged with Arson Burglary http://t.co/WftCrLz32P\n\ntext n_2: waiting for my chocolate lava cakes to get here ??????\n\ntext n_3: Patient-reported outcomes in long-term survivors of metastatic colorectal cancer - British Journal of Surgery http://t.co/5Yl4DC1Tqt\n\ntext n_4: @Rebelmage2 I'm glad you got away XD But My 'be safe' was in reference to a tornado near calgary and drum heller at around 4 :O\n\ntext n_5: Texas Seeks Comment on Rules for Changes to Windstorm Insurer http://t.co/BZ07c9WthX via @ijournal\n\ntext n_6: @godsfirstson1 and she wrapped his coat around herself. It practically engulfed her.\n\ntext n_7: Kosciusko police investigating pedestrian fatality hit by a train Thursday http://t.co/m5djLLxoZP\n\ntext n_8: One Direction Is my pick for http://t.co/q2eBlOKeVE Fan Army #Directioners http://t.co/eNCmhz6y34 x1441\n\ntext n_9: I'm drowning in hw now and that's w/o going to swim ohlordy\n\n","output_type":"stream"}]},{"cell_type":"markdown","source":["after taking a random sample we can see that the text contains UURL's that doesnt provide any useful information, so let's attempt to remove them...\n","also it's almost that users add this URL's at the end of the tweet.\n","so we can remove the URL's by using regural expressions"],"metadata":{"_uuid":"3be35232-2f02-4877-8e29-3b1d1f06eb90","_cell_guid":"5fdc858c-7e7f-4a7f-bd4c-d829913177c1","trusted":true,"id":"ChsFU4c7q8Is"}},{"cell_type":"code","source":["url_rgex = r'(http\\S*)'\n","\n","df_train['text'] = df_train['text'].str.replace(url_rgex, '', regex=True)\n","df_test['text'] = df_test['text'].str.replace(url_rgex, '', regex=True)"],"metadata":{"_uuid":"8e2300a3-11d4-4cb0-89eb-b9c8336d8ab9","_cell_guid":"6d14a396-a643-4059-a278-627781352814","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:25.925474Z","iopub.execute_input":"2023-10-27T19:41:25.925857Z","iopub.status.idle":"2023-10-27T19:41:25.946391Z","shell.execute_reply.started":"2023-10-27T19:41:25.925827Z","shell.execute_reply":"2023-10-27T19:41:25.945488Z"},"trusted":true,"id":"nSaVmheFq8It"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["random_sample_text = random.sample(list(df_train['text']), 2)\n","for i, text in enumerate(random_sample_text):\n"," print(f'text n_{i}: {text}\\n')"],"metadata":{"_uuid":"0257b086-caaf-46de-82ce-e66f9148548b","_cell_guid":"75011780-41e1-4270-bdc0-3a92f61af657","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:26.780011Z","iopub.execute_input":"2023-10-27T19:41:26.780832Z","iopub.status.idle":"2023-10-27T19:41:26.788250Z","shell.execute_reply.started":"2023-10-27T19:41:26.780787Z","shell.execute_reply":"2023-10-27T19:41:26.787059Z"},"trusted":true,"id":"1ADQwTspq8Iu","outputId":"78b80bb6-e19a-4a2a-b49a-213fcd0bd1c3"},"execution_count":null,"outputs":[{"name":"stdout","text":"text n_0: Obama Declares Disaster for Typhoon-Devastated Saipan: Obama signs disaster declaration for Northern Marians a... \n\ntext n_1: Now Trending in Nigeria: Police charge traditional ruler others with informant‰Ûªs murder \n\n","output_type":"stream"}]},{"cell_type":"markdown","source":["another thing to notice that there are alot of users names and hastags which don't provide a lot of information since users also sometimes use the most famous tags to increase tweets reach and the names doesn't provide alot of valuable information so let's remove them:"],"metadata":{"_uuid":"73d9d07b-6d88-4492-9441-d33a6a444461","_cell_guid":"4d3ba251-6716-47eb-8f38-9c141753cb32","trusted":true,"id":"ctuZ8JRTq8Iu"}},{"cell_type":"code","source":["user_regex = r'(@\\S*)'\n","df_train['text'] = df_train['text'].str.replace(user_regex, '', regex=True)\n","df_test['text'] = df_test['text'].str.replace(user_regex, '', regex=True)"],"metadata":{"_uuid":"c42034b7-e79b-47eb-a3fa-e5b3fce4ae63","_cell_guid":"5921e2fc-f654-41f7-9267-88a173de2e4d","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:28.696734Z","iopub.execute_input":"2023-10-27T19:41:28.697136Z","iopub.status.idle":"2023-10-27T19:41:28.715136Z","shell.execute_reply.started":"2023-10-27T19:41:28.697106Z","shell.execute_reply":"2023-10-27T19:41:28.714127Z"},"trusted":true,"id":"SQcB2dD-q8Iv"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["tags_regex = r'(#)'\n","df_train['text'] = df_train['text'].str.replace(tags_regex, '', regex=True)\n","df_test['text'] = df_test['text'].str.replace(tags_regex, '', regex=True)"],"metadata":{"_uuid":"f63bab01-04fd-454c-b128-589549169a9a","_cell_guid":"70f86e6b-7357-4c27-b9bd-edf6d0fcabf5","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:29.579409Z","iopub.execute_input":"2023-10-27T19:41:29.579802Z","iopub.status.idle":"2023-10-27T19:41:29.597782Z","shell.execute_reply.started":"2023-10-27T19:41:29.579767Z","shell.execute_reply":"2023-10-27T19:41:29.596900Z"},"trusted":true,"id":"-aYahFSNq8Iv"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["let's remove non-english alphabets"],"metadata":{"_uuid":"9888805b-4374-481b-8f32-79e828744829","_cell_guid":"e3538a3f-ac65-4f27-b8ec-b86c73592c9c","trusted":true,"id":"yGqB6QNWq8Iw"}},{"cell_type":"code","source":["error_pattern = r'(‰Û\\S*) | (‰Ûª)'\n","\n","df_train['text'] = df_train['text'].str.replace(error_pattern, '', regex=True)\n","df_test['text'] = df_test['text'].str.replace(error_pattern, '', regex=True)"],"metadata":{"_uuid":"e3107e9a-ff52-4a4c-9230-759c386cc2f3","_cell_guid":"1941cf2b-4f0c-4fbd-b67c-a1e6660b73d9","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:30.954944Z","iopub.execute_input":"2023-10-27T19:41:30.955819Z","iopub.status.idle":"2023-10-27T19:41:31.036663Z","shell.execute_reply.started":"2023-10-27T19:41:30.955785Z","shell.execute_reply":"2023-10-27T19:41:31.035742Z"},"trusted":true,"id":"VZKt2Qf8q8Iw"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["let's check another random sample after all the preprocessing:"],"metadata":{"_uuid":"2c9cb783-c9f0-4d93-a321-49107ff30e28","_cell_guid":"f88d16a1-b988-4c92-93c4-7e4fcfe2a1d7","trusted":true,"id":"y7OwlQ3eq8Ix"}},{"cell_type":"code","source":["random_sample_text = random.sample(list(df_train['text']), 10)\n","for i, text in enumerate(random_sample_text):\n"," print(f'text n_{i}: {text}\\n')"],"metadata":{"_uuid":"1fb72e54-69e1-4fa2-9dad-8a5f997ca85b","_cell_guid":"3a0c12e4-7645-46ac-97ac-f35829c1479d","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:32.387128Z","iopub.execute_input":"2023-10-27T19:41:32.388040Z","iopub.status.idle":"2023-10-27T19:41:32.398047Z","shell.execute_reply.started":"2023-10-27T19:41:32.387980Z","shell.execute_reply":"2023-10-27T19:41:32.396742Z"},"trusted":true,"id":"NETkj9Ajq8Iy","outputId":"b0f86893-d576-4b67-a875-a6b6aa55104e"},"execution_count":null,"outputs":[{"name":"stdout","text":"text n_0: Criminals Who Hijack Lorries And Buses Arrested In Enugu: According to the Nigerian Police Force... Via \n\ntext n_1: in 5 min // Speaker Deck\n\ntext n_2: NBCNightlyNews: Malaysian Officials Say Debris Found on Reunion Island Is From MH370. BillNeelyNBC reports: \n\ntext n_3: 'Nobody remembers who came in second.' Charles Schulz\n\ntext n_4: Heavy smoke pouring out of buildings on fire in Port Coquitlam \n\ntext n_5: Green line service on south side disrupted after CTA train derails passengers evacuated. \n\ntext n_6: MORE-->OSHA officers on siteinvestigating Noranda explosion -KFVS12 News Cape Girardeau Carbondale Poplar Bluff \n\ntext n_7: trapped in its disappearance\n\ntext n_8: lets hope it's a upper class white mass murderer....''' Mmmm\n\ntext n_9: Summer is lovely\n\n","output_type":"stream"}]},{"cell_type":"markdown","source":["**Now this dataset makes more sense**"],"metadata":{"_uuid":"d3134ebc-eaf9-4817-b09a-e1f11b6865db","_cell_guid":"c1f2dc4b-817a-4e51-b3d6-a9dac7820dcc","trusted":true,"id":"UzMYyknqq8Iz"}},{"cell_type":"markdown","source":["### tokanizing and creating the mask"],"metadata":{"_uuid":"f25843b2-d303-44c2-a3be-25131e5c7e89","_cell_guid":"cf16e56b-de11-41c7-ab6e-be973314dec1","trusted":true,"id":"Ak531eHNq8Iz"}},{"cell_type":"markdown","source":["I will be using the tokanizer from huggingface [sacculifer/dimbat_disaster_distilbert](http://huggingface.co/sacculifer/dimbat_disaster_distilbert) because it will serve us well in this task"],"metadata":{"_uuid":"d0a28b79-b94e-423e-a950-f24675147c0f","_cell_guid":"51d24430-4242-4931-a574-6bce4ad090a6","trusted":true,"id":"mewZUuAJq8I0"}},{"cell_type":"code","source":["tokenizer = AutoTokenizer.from_pretrained(\"sacculifer/dimbat_disaster_distilbert\", do_lower_case=True)"],"metadata":{"_uuid":"c78dd8f4-6f13-4974-ba71-10dbd558a532","_cell_guid":"d29eac28-1942-4e58-80ab-6778026cb2f9","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:41:34.502573Z","iopub.execute_input":"2023-10-27T19:41:34.503050Z","iopub.status.idle":"2023-10-27T19:41:35.339038Z","shell.execute_reply.started":"2023-10-27T19:41:34.503006Z","shell.execute_reply":"2023-10-27T19:41:35.338022Z"},"trusted":true,"id":"24yT7-qFq8I0","outputId":"f415002d-e201-435d-af99-ebd5a461b90d","colab":{"referenced_widgets":["3ff5527998da43628ebd563410b4f3fb","c77f453507be4607a43862b8a7338730","6ef2876ce615455396fd0215b2a4171f","f328f49ca29d4e258538071541666cd5"]}},"execution_count":null,"outputs":[{"output_type":"display_data","data":{"text/plain":"Downloading (…)okenizer_config.json: 0%| | 0.00/333 [00:00 {**'loss'**: [0.5624821186065674,\n"," 0.39667385816574097,\n"," 0.29472893476486206,\n"," 0.21065986156463623,\n"," 0.15157125890254974,\n"," 0.11613788455724716,\n"," 0.0892966166138649,\n"," 0.07200820744037628],\n"," **'accuracy'**: [0.7321194410324097,\n"," 0.8330052495002747,\n"," 0.8807414770126343,\n"," 0.9191272854804993,\n"," 0.9465222954750061,\n"," 0.9635826945304871,\n"," 0.9714567065238953,\n"," 0.977854311466217]}"],"metadata":{"_uuid":"284145c1-1eec-4899-9bb2-19c69e11b03f","_cell_guid":"42fef8ca-d698-4059-a98f-90abea1b4c49","trusted":true,"id":"i30MTUP9q8JF"}},{"cell_type":"code","source":["predictions_proba = model.predict(ds_test)"],"metadata":{"_uuid":"a1f7fba1-842f-4a12-adbb-48f557d0a2b9","_cell_guid":"851f878f-9b05-4ace-a7d8-7002a6c6c574","execution":{"iopub.status.busy":"2023-10-27T13:03:59.009556Z","iopub.execute_input":"2023-10-27T13:03:59.010333Z","iopub.status.idle":"2023-10-27T13:04:06.081126Z","shell.execute_reply.started":"2023-10-27T13:03:59.010300Z","shell.execute_reply":"2023-10-27T13:04:06.079878Z"},"jupyter":{"outputs_hidden":false},"trusted":true,"id":"AsSit086q8JG"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["avg_proba = []\n","for x in predictions_proba:\n"," avg_proba.append(np.mean(x))\n","\n","predictions = np.round(avg_proba).astype(np.int32)"],"metadata":{"_uuid":"e7979225-6341-48d1-ba36-cd4617effecc","_cell_guid":"8f5aaa4a-e682-4083-9e20-8ca0e9c47763","execution":{"iopub.status.busy":"2023-10-26T18:44:42.151613Z","iopub.execute_input":"2023-10-26T18:44:42.151905Z","iopub.status.idle":"2023-10-26T18:44:42.205692Z","shell.execute_reply.started":"2023-10-26T18:44:42.151880Z","shell.execute_reply":"2023-10-26T18:44:42.204926Z"},"jupyter":{"outputs_hidden":false},"trusted":true,"id":"y98XHzBFq8JG"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Check model accuracy"],"metadata":{"_uuid":"4ad20fe3-0b02-41b4-90cb-dd7de09a65c2","_cell_guid":"be7b15bb-823d-44b2-a0dc-715ee3291ae0","trusted":true,"id":"_Qd0xuRIq8JH"}},{"cell_type":"code","source":["history_.history"],"metadata":{"_uuid":"d31ad7e7-53bd-43e0-baa3-8495dd3c9d0e","_cell_guid":"514a5af4-6394-4859-94bc-f0f947b54f59","execution":{"iopub.status.busy":"2023-10-26T18:44:42.206688Z","iopub.execute_input":"2023-10-26T18:44:42.206950Z","iopub.status.idle":"2023-10-26T18:44:42.213372Z","shell.execute_reply.started":"2023-10-26T18:44:42.206928Z","shell.execute_reply":"2023-10-26T18:44:42.212428Z"},"jupyter":{"outputs_hidden":false},"trusted":true,"id":"3WZEaT1gq8JJ"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["so the accuracy is 0.9779 which is really good"],"metadata":{"_uuid":"d0da4eb2-385b-49ee-b3e0-dc832b062faf","_cell_guid":"966ac4b4-e3fc-4e30-8549-bf1f19ab4143","trusted":true,"id":"zq0j-zJAq8JK"}},{"cell_type":"markdown","source":["### save model"],"metadata":{"_uuid":"ba04aebe-2ca9-4892-a39c-4429fc684181","_cell_guid":"e649253e-49b4-450e-a3f0-97a5de9b5d0d","trusted":true,"id":"fCp3u4Hfq8JK"}},{"cell_type":"code","source":["model.save(\"/kaggle/working/train/custom_model.keras\")"],"metadata":{"_uuid":"4ccc0bf8-ad6a-483a-8482-f7d43324f810","_cell_guid":"f019ad6b-ce06-46ab-893c-e9aa5f6acf53","execution":{"iopub.status.busy":"2023-10-26T18:49:55.262407Z","iopub.execute_input":"2023-10-26T18:49:55.263367Z","iopub.status.idle":"2023-10-26T18:49:56.546981Z","shell.execute_reply.started":"2023-10-26T18:49:55.263325Z","shell.execute_reply":"2023-10-26T18:49:56.546171Z"},"jupyter":{"outputs_hidden":false},"trusted":true,"id":"ancz52Vzq8JL"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## USE the MODEL"],"metadata":{"_uuid":"0bc6c695-b872-463c-83a5-2b4778bf530a","_cell_guid":"6bc64039-3a17-460b-9058-71a10cd5b9b6","trusted":true,"id":"Fk2nL8Ooq8JL"}},{"cell_type":"markdown","source":["### create the test model\n","create a new instance of the model and load the wieghts"],"metadata":{"_uuid":"004282e9-49ec-427e-b8e1-01e83c65b0bf","_cell_guid":"d6b183e1-1b14-43d8-9dec-bab4d07eb1ca","trusted":true,"id":"Qx3aAQAyq8JM"}},{"cell_type":"code","source":["class twitter_model:\n"," def __init__(self,model_weights=\"/kaggle/input/model-nlp-twitter/custom_model.keras\"):\n"," #activate gpu\n"," gpu_devices = tf.config.experimental.list_physical_devices(\"GPU\")\n"," for device in gpu_devices:\n"," tf.config.experimental.set_memory_growth(device, True)\n","\n"," #define a tokenizer\n"," self.tokenizer = AutoTokenizer.from_pretrained(\"sacculifer/dimbat_disaster_distilbert\", do_lower_case=True)\n","\n"," #define the pretrained model\n"," #model = TFAutoModelForSequenceClassification.from_pretrained(\"sacculifer/dimbat_disaster_distilbert\")\n"," config = AutoConfig.from_pretrained('sacculifer/dimbat_disaster_distilbert')\n"," transformer = TFDistilBertModel.from_pretrained(\"sacculifer/dimbat_disaster_distilbert\", config=config)\n","\n"," input_ids = tf.keras.layers.Input(shape=(512,), name='input_ids', dtype='int32')\n"," attention_mask = tf.keras.layers.Input(shape=(512,), name='attention_mask', dtype='int32')\n","\n"," embeddings = transformer(input_ids, attention_mask=attention_mask)[0]\n"," pooling = tf.keras.layers.GlobalAveragePooling1D()(embeddings)\n","\n"," net = tf.keras.layers.BatchNormalization()(pooling)\n"," net = tf.keras.layers.Dense(1024, activation='relu')(net)\n"," net = tf.keras.layers.Dropout(0.2)(net)\n"," net = tf.keras.layers.Dense(1024, activation='relu')(net)\n"," net = tf.keras.layers.Dropout(0.2)(net)\n"," net = tf.keras.layers.Dense(1, activation='sigmoid')(net)\n","\n"," self.model = tf.keras.Model(inputs=(input_ids, attention_mask), outputs=net)\n"," self.model.layers[2].trainable = True # freeze for transform layers\n","\n"," self.model.compile(\n"," optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),\n"," loss=tf.keras.losses.BinaryCrossentropy(),\n"," metrics=['accuracy']\n"," )\n","\n"," # Loads the weights\n"," self.model.load_weights(model_weights)\n","\n"," def predict(self, text_input=\"help there is an flood\"):\n"," \"\"\"token['input_ids']),token['attention_mask'])\"\"\"\n","\n"," token= self.tokenizer(\n"," text_input,\n"," padding= \"max_length\",\n"," add_special_tokens= True,\n"," return_attention_mask= True,\n"," return_token_type_ids= False\n"," )\n","\n"," input_ids_tensor = tf.constant(token['input_ids'], dtype=tf.int32, shape=(1, 512))\n"," attention_mask_tensor = tf.constant(token['attention_mask'], dtype=tf.int32, shape=(1, 512))\n"," token_tensor={'input_ids': input_ids_tensor, 'attention_mask':attention_mask_tensor}\n"," prediction = self.model.predict(token_tensor)\n"," return prediction"],"metadata":{"_uuid":"b4025af5-d5ec-479d-805d-97ad628fcebc","_cell_guid":"5110913c-fa02-4b07-8104-984393031401","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:43:25.877399Z","iopub.execute_input":"2023-10-27T19:43:25.877825Z","iopub.status.idle":"2023-10-27T19:43:25.891523Z","shell.execute_reply.started":"2023-10-27T19:43:25.877792Z","shell.execute_reply":"2023-10-27T19:43:25.890554Z"},"trusted":true,"id":"l81CwzUqq8JM"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["#directory = os.getcwd()\n","#weights_path= directory+\"/custom_model.keras\"\n","weights_path=\"/kaggle/input/model-nlp-twitter/custom_model.keras\"\n","#print(weights_path, model_weights)\n","model_test= twitter_model(weights_path)"],"metadata":{"_uuid":"7055e896-cdbd-4df3-ae9e-44d284396808","_cell_guid":"5ff8fb50-1c80-48c8-8a87-c0e3eee9103f","jupyter":{"outputs_hidden":false},"trusted":true,"id":"hIu--8zCq8JO"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["input_text=\"there is a volcano\"\n","prediction= np.round(model_test.predict(input_text))\n","disaster= False\n","if prediction==1:\n"," disaster= True\n","if disaster:\n"," print(\"the text: '\",input_text, \"' means there is a disaster\" )\n","else:\n"," print(\"the text: \",input_text, \"means there is NO disaster\" )"],"metadata":{"_uuid":"c0d22cbe-27eb-4e17-9c81-3662e73f56d6","_cell_guid":"1475e514-55c6-4aeb-a0fe-20fd74d68f34","jupyter":{"outputs_hidden":false},"execution":{"iopub.status.busy":"2023-10-27T19:44:03.113207Z","iopub.execute_input":"2023-10-27T19:44:03.113654Z","iopub.status.idle":"2023-10-27T19:44:03.226016Z","shell.execute_reply.started":"2023-10-27T19:44:03.113616Z","shell.execute_reply":"2023-10-27T19:44:03.224830Z"},"trusted":true,"id":"aEU4okWsq8JP","outputId":"f401c67c-c3a1-41a3-e413-cc79b2d97d94"},"execution_count":null,"outputs":[{"name":"stdout","text":"1/1 [==============================] - 0s 58ms/step\nthe text: ' there is a volcano ' means there is a disaster\n","output_type":"stream"}]},{"cell_type":"markdown","source":["## creating the application"],"metadata":{"_uuid":"302ffcff-854c-4496-89d5-92924a6d9fea","_cell_guid":"abc90885-5d0d-4414-b06b-f64281216f53","trusted":true,"id":"LG3--Xngq8JP"}},{"cell_type":"code","source":["def main():\n"," st.header('Twitter disater detector')\n"," directory = os.getcwd()\n"," weights_path= directory+\"/custom_model.keras\"\n"," model_test= twitter_model(weights_path)\n"," input_text=st.text_input(\"Please enter your sentence:\", \"type a word\")\n"," prediction= np.round(model_test.predict(input_text))\n"," disaster= False\n"," if prediction==1:\n"," disaster= True\n"," if disaster:\n"," st.write(\"the text: '\",input_text, \"' means there is a disaster\" )\n"," else:\n"," st.write(\"the text: \",input_text, \"means there is NO disaster\" )\n","\n","\n","\n","if __name__ == '__main__':\n"," main()"],"metadata":{"_uuid":"2dbdd2ab-3fc2-4fb9-8762-5b0a5d5f9316","_cell_guid":"ad6c2251-898e-41e2-a6d4-d89ce8a1ec19","jupyter":{"outputs_hidden":false},"trusted":true,"id":"wJ4CaYC1q8JQ"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":[],"metadata":{"id":"Lkn9jm-aq8JR"},"execution_count":null,"outputs":[]}]}