{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdatasets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m load_dataset\n\u001b[1;32m 3\u001b[0m dataset \u001b[38;5;241m=\u001b[39m load_dataset(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mimsoumyaneel/sentiment-analysis-llama2\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
"File \u001b[0;32m~/Documents/models/twitter_model/.venv/lib/python3.10/site-packages/datasets/__init__.py:18\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# ruff: noqa\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# See the License for the specific language governing permissions and\u001b[39;00m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;66;03m# limitations under the License.\u001b[39;00m\n\u001b[1;32m 16\u001b[0m __version__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m2.18.0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 18\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01marrow_dataset\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Dataset\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01marrow_reader\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ReadInstruction\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbuilder\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ArrowBasedBuilder, BeamBasedBuilder, BuilderConfig, DatasetBuilder, GeneratorBasedBuilder\n",
"File \u001b[0;32m~/Documents/models/twitter_model/.venv/lib/python3.10/site-packages/datasets/arrow_dataset.py:59\u001b[0m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mfsspec\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[0;32m---> 59\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpyarrow\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpa\u001b[39;00m\n\u001b[1;32m 61\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpyarrow\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompute\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpc\u001b[39;00m\n",
"File \u001b[0;32m~/Documents/models/twitter_model/.venv/lib/python3.10/site-packages/pandas/__init__.py:26\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m _hard_dependencies, _dependency, _missing_dependencies\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 25\u001b[0m \u001b[38;5;66;03m# numpy compat\u001b[39;00m\n\u001b[0;32m---> 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompat\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m 27\u001b[0m is_numpy_dev \u001b[38;5;28;01mas\u001b[39;00m _is_numpy_dev, \u001b[38;5;66;03m# pyright: ignore[reportUnusedImport] # noqa: F401\u001b[39;00m\n\u001b[1;32m 28\u001b[0m )\n\u001b[1;32m 29\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m _err: \u001b[38;5;66;03m# pragma: no cover\u001b[39;00m\n\u001b[1;32m 30\u001b[0m _module \u001b[38;5;241m=\u001b[39m _err\u001b[38;5;241m.\u001b[39mname\n",
"File \u001b[0;32m~/Documents/models/twitter_model/.venv/lib/python3.10/site-packages/pandas/compat/__init__.py:27\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompat\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompressors\u001b[39;00m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompat\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m is_numpy_dev\n\u001b[0;32m---> 27\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompat\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpyarrow\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m 28\u001b[0m pa_version_under10p1,\n\u001b[1;32m 29\u001b[0m pa_version_under11p0,\n\u001b[1;32m 30\u001b[0m pa_version_under13p0,\n\u001b[1;32m 31\u001b[0m pa_version_under14p0,\n\u001b[1;32m 32\u001b[0m pa_version_under14p1,\n\u001b[1;32m 33\u001b[0m )\n\u001b[1;32m 35\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m TYPE_CHECKING:\n\u001b[1;32m 36\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_typing\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m F\n",
"File \u001b[0;32m~/Documents/models/twitter_model/.venv/lib/python3.10/site-packages/pandas/compat/pyarrow.py:8\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutil\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mversion\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Version\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m----> 8\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpyarrow\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpa\u001b[39;00m\n\u001b[1;32m 10\u001b[0m _palv \u001b[38;5;241m=\u001b[39m Version(Version(pa\u001b[38;5;241m.\u001b[39m__version__)\u001b[38;5;241m.\u001b[39mbase_version)\n\u001b[1;32m 11\u001b[0m pa_version_under10p1 \u001b[38;5;241m=\u001b[39m _palv \u001b[38;5;241m<\u001b[39m Version(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m10.0.1\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
"File \u001b[0;32m~/Documents/models/twitter_model/.venv/lib/python3.10/site-packages/pyarrow/__init__.py:65\u001b[0m\n\u001b[1;32m 63\u001b[0m _gc_enabled \u001b[38;5;241m=\u001b[39m _gc\u001b[38;5;241m.\u001b[39misenabled()\n\u001b[1;32m 64\u001b[0m _gc\u001b[38;5;241m.\u001b[39mdisable()\n\u001b[0;32m---> 65\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpyarrow\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlib\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01m_lib\u001b[39;00m\n\u001b[1;32m 66\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m _gc_enabled:\n\u001b[1;32m 67\u001b[0m _gc\u001b[38;5;241m.\u001b[39menable()\n",
"File \u001b[0;32m<frozen importlib._bootstrap>:404\u001b[0m, in \u001b[0;36mparent\u001b[0;34m(self)\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"from datasets import load_dataset\n",
"\n",
"dataset = load_dataset(\"imsoumyaneel/sentiment-analysis-llama2\")"
]
},
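{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check: inspect the splits and columns that load_dataset returned\n",
"# (assumes the download above completed successfully)\n",
"print(dataset)\n",
"print(dataset['train'][0])"
]
},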
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from pandas.core.frame import DataFrame as df\n",
"\n",
"train_dataset = df(dataset['train'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sentence</th>\n",
" <th>label</th>\n",
" <th>text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>I'll throw out the garbage .</td>\n",
" <td>neutral</td>\n",
" <td>###Human:\\nyou are a sentiment analist. guess ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>So Dick , how about getting some coffee for to...</td>\n",
" <td>joy</td>\n",
" <td>###Human:\\nyou are a sentiment analist. guess ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Come on , you can at least try a little , besi...</td>\n",
" <td>neutral</td>\n",
" <td>###Human:\\nyou are a sentiment analist. guess ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>What ’ s wrong with that ? Cigarette is the th...</td>\n",
" <td>anger</td>\n",
" <td>###Human:\\nyou are a sentiment analist. guess ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Not for me , Dick .</td>\n",
" <td>neutral</td>\n",
" <td>###Human:\\nyou are a sentiment analist. guess ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>598293</th>\n",
" <td>You got banned for participating in a brigade.</td>\n",
" <td>sadness</td>\n",
" <td>###Human:\\nyou are a sentiment analist. guess ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>598294</th>\n",
" <td>A joke is subjective pal, second of all you ne...</td>\n",
" <td>joy</td>\n",
" <td>###Human:\\nyou are a sentiment analist. guess ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>598295</th>\n",
" <td>Well, I'm glad you're out of all that now. How...</td>\n",
" <td>joy</td>\n",
" <td>###Human:\\nyou are a sentiment analist. guess ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>598296</th>\n",
" <td>Everyone likes [NAME].</td>\n",
" <td>love</td>\n",
" <td>###Human:\\nyou are a sentiment analist. guess ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>598297</th>\n",
" <td>The FDA has plenty to criticize. But like here...</td>\n",
" <td>anger</td>\n",
" <td>###Human:\\nyou are a sentiment analist. guess ...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>598298 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" sentence label \\\n",
"0 I'll throw out the garbage . neutral \n",
"1 So Dick , how about getting some coffee for to... joy \n",
"2 Come on , you can at least try a little , besi... neutral \n",
"3 What ’ s wrong with that ? Cigarette is the th... anger \n",
"4 Not for me , Dick . neutral \n",
"... ... ... \n",
"598293 You got banned for participating in a brigade. sadness \n",
"598294 A joke is subjective pal, second of all you ne... joy \n",
"598295 Well, I'm glad you're out of all that now. How... joy \n",
"598296 Everyone likes [NAME]. love \n",
"598297 The FDA has plenty to criticize. But like here... anger \n",
"\n",
" text \n",
"0 ###Human:\\nyou are a sentiment analist. guess ... \n",
"1 ###Human:\\nyou are a sentiment analist. guess ... \n",
"2 ###Human:\\nyou are a sentiment analist. guess ... \n",
"3 ###Human:\\nyou are a sentiment analist. guess ... \n",
"4 ###Human:\\nyou are a sentiment analist. guess ... \n",
"... ... \n",
"598293 ###Human:\\nyou are a sentiment analist. guess ... \n",
"598294 ###Human:\\nyou are a sentiment analist. guess ... \n",
"598295 ###Human:\\nyou are a sentiment analist. guess ... \n",
"598296 ###Human:\\nyou are a sentiment analist. guess ... \n",
"598297 ###Human:\\nyou are a sentiment analist. guess ... \n",
"\n",
"[598298 rows x 3 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# imports for model creation\n",
"import tensorflow as tf\n",
"from keras import layers\n",
"from keras import losses\n",
"import keras\n",
"from tensorflow.keras.preprocessing.text import Tokenizer\n",
"from tensorflow.keras.preprocessing.sequence import pad_sequences"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Tokenization of dataset\n",
"tokenizer = Tokenizer()\n",
"tokenizer.fit_on_texts(train_dataset['sentence'])\n",
"\n",
"vocab_size = len(tokenizer.word_index) + 1 # our dataset vocab size (space split)\n",
"max_length = 200 # max words in a sentence\n",
"embedding_dim = 50 # TODO: need to adjust accordingly\n",
"\n",
"X = tokenizer.texts_to_sequences(train_dataset['sentence'])\n",
"X = pad_sequences(X, maxlen=max_length, padding='post')"
]
},
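{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal illustration of the preprocessing above, using the first sentence of\n",
"# the dataset: texts_to_sequences maps each known word to its integer index,\n",
"# and pad_sequences right-pads with zeros (padding='post') up to max_length\n",
"example = tokenizer.texts_to_sequences([\"I'll throw out the garbage .\"])\n",
"print(example)\n",
"print(pad_sequences(example, maxlen=max_length, padding='post').shape)"
]
},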
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Encode the lables\n",
"labels = train_dataset['label'].map({'neutral': '1', 'joy': '2', 'sadness': '3', 'anger': '4', 'fear': '5', 'love': '6', 'surprise': '7'}).astype('float32').values"
]
},
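{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check: every raw label should be covered by the mapping above;\n",
"# .map() turns an unseen label into NaN, which the astype() would reject\n",
"print(train_dataset['label'].value_counts())"
]
},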
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Build the model\n",
"model = keras.Sequential([\n",
" keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_shape=(max_length,)),\n",
" keras.layers.GlobalAveragePooling1D(),\n",
" keras.layers.Dense(16, activation='relu'),\n",
" keras.layers.Dense(1, activation='sigmoid')\n",
"])"
]
},
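{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Inspect the architecture; most trainable parameters sit in the Embedding\n",
"# layer (vocab_size x embedding_dim weights)\n",
"model.summary()"
]
},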
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Compile the model\n",
"model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# split the dataset into train and test\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42, shuffle=True)"
]
},
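{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Quick shape check: 70% of the rows go to training, 30% to the held-out test set\n",
"print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)"
]
},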
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# train the model\n",
"model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Evaluate the model\n",
"loss, accuracy = model.evaluate(X_test, y_test)\n",
"accuracy"
]
},
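{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal inference sketch (assumes the 0-indexed label order defined above):\n",
"# reuse the fitted tokenizer and the same padding, then take the argmax over\n",
"# the 7 class probabilities\n",
"import numpy as np\n",
"\n",
"label_names = ['neutral', 'joy', 'sadness', 'anger', 'fear', 'love', 'surprise']\n",
"sample = pad_sequences(tokenizer.texts_to_sequences([\"I'll throw out the garbage .\"]),\n",
"                       maxlen=max_length, padding='post')\n",
"pred = model.predict(sample)\n",
"print(label_names[int(np.argmax(pred[0]))])"
]
},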
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# save the model\n",
"try:\n",
" model.save(\"../models/sentimental-analysis-llama2.keras\")\n",
"except FileNotFoundError:\n",
" os.mkdir(\"../models\")\n",
" model.save(\"../models/sentimental-analysis-llama2.keras\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}