{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdatasets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m load_dataset\n\u001b[1;32m 3\u001b[0m dataset \u001b[38;5;241m=\u001b[39m load_dataset(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mimsoumyaneel/sentiment-analysis-llama2\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", "File \u001b[0;32m~/Documents/models/twitter_model/.venv/lib/python3.10/site-packages/datasets/__init__.py:18\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# ruff: noqa\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# See the License for the specific language governing permissions and\u001b[39;00m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;66;03m# limitations under the License.\u001b[39;00m\n\u001b[1;32m 16\u001b[0m __version__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m2.18.0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 18\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01marrow_dataset\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Dataset\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01marrow_reader\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ReadInstruction\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbuilder\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ArrowBasedBuilder, BeamBasedBuilder, BuilderConfig, DatasetBuilder, GeneratorBasedBuilder\n", "File \u001b[0;32m~/Documents/models/twitter_model/.venv/lib/python3.10/site-packages/datasets/arrow_dataset.py:59\u001b[0m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mfsspec\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[0;32m---> 59\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpyarrow\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpa\u001b[39;00m\n\u001b[1;32m 61\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpyarrow\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompute\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpc\u001b[39;00m\n", "File \u001b[0;32m~/Documents/models/twitter_model/.venv/lib/python3.10/site-packages/pandas/__init__.py:26\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m _hard_dependencies, _dependency, _missing_dependencies\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 25\u001b[0m \u001b[38;5;66;03m# numpy compat\u001b[39;00m\n\u001b[0;32m---> 
26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompat\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m 27\u001b[0m is_numpy_dev \u001b[38;5;28;01mas\u001b[39;00m _is_numpy_dev, \u001b[38;5;66;03m# pyright: ignore[reportUnusedImport] # noqa: F401\u001b[39;00m\n\u001b[1;32m 28\u001b[0m )\n\u001b[1;32m 29\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m _err: \u001b[38;5;66;03m# pragma: no cover\u001b[39;00m\n\u001b[1;32m 30\u001b[0m _module \u001b[38;5;241m=\u001b[39m _err\u001b[38;5;241m.\u001b[39mname\n", "File \u001b[0;32m~/Documents/models/twitter_model/.venv/lib/python3.10/site-packages/pandas/compat/__init__.py:27\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompat\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompressors\u001b[39;00m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompat\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m is_numpy_dev\n\u001b[0;32m---> 27\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompat\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpyarrow\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m 28\u001b[0m pa_version_under10p1,\n\u001b[1;32m 29\u001b[0m pa_version_under11p0,\n\u001b[1;32m 30\u001b[0m pa_version_under13p0,\n\u001b[1;32m 31\u001b[0m pa_version_under14p0,\n\u001b[1;32m 32\u001b[0m pa_version_under14p1,\n\u001b[1;32m 33\u001b[0m )\n\u001b[1;32m 35\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m TYPE_CHECKING:\n\u001b[1;32m 36\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_typing\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m F\n", "File \u001b[0;32m~/Documents/models/twitter_model/.venv/lib/python3.10/site-packages/pandas/compat/pyarrow.py:8\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutil\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mversion\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Version\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m----> 8\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpyarrow\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpa\u001b[39;00m\n\u001b[1;32m 10\u001b[0m _palv \u001b[38;5;241m=\u001b[39m Version(Version(pa\u001b[38;5;241m.\u001b[39m__version__)\u001b[38;5;241m.\u001b[39mbase_version)\n\u001b[1;32m 11\u001b[0m pa_version_under10p1 \u001b[38;5;241m=\u001b[39m _palv \u001b[38;5;241m<\u001b[39m Version(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m10.0.1\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", "File \u001b[0;32m~/Documents/models/twitter_model/.venv/lib/python3.10/site-packages/pyarrow/__init__.py:65\u001b[0m\n\u001b[1;32m 63\u001b[0m _gc_enabled \u001b[38;5;241m=\u001b[39m _gc\u001b[38;5;241m.\u001b[39misenabled()\n\u001b[1;32m 64\u001b[0m _gc\u001b[38;5;241m.\u001b[39mdisable()\n\u001b[0;32m---> 65\u001b[0m 
\u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpyarrow\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlib\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01m_lib\u001b[39;00m\n\u001b[1;32m 66\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m _gc_enabled:\n\u001b[1;32m 67\u001b[0m _gc\u001b[38;5;241m.\u001b[39menable()\n", "File \u001b[0;32m:404\u001b[0m, in \u001b[0;36mparent\u001b[0;34m(self)\u001b[0m\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ "from datasets import load_dataset\n", "\n", "dataset = load_dataset(\"imsoumyaneel/sentiment-analysis-llama2\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from pandas.core.frame import DataFrame as df\n", "\n", "train_dataset = df(dataset['train'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
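 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Optional sanity check: load_dataset returns a DatasetDict, so printing it\n", "# shows the available splits and their row counts.\n", "print(dataset)" ] },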
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sentencelabeltext
0I'll throw out the garbage .neutral###Human:\\nyou are a sentiment analist. guess ...
1So Dick , how about getting some coffee for to...joy###Human:\\nyou are a sentiment analist. guess ...
2Come on , you can at least try a little , besi...neutral###Human:\\nyou are a sentiment analist. guess ...
3What ’ s wrong with that ? Cigarette is the th...anger###Human:\\nyou are a sentiment analist. guess ...
4Not for me , Dick .neutral###Human:\\nyou are a sentiment analist. guess ...
............
598293You got banned for participating in a brigade.sadness###Human:\\nyou are a sentiment analist. guess ...
598294A joke is subjective pal, second of all you ne...joy###Human:\\nyou are a sentiment analist. guess ...
598295Well, I'm glad you're out of all that now. How...joy###Human:\\nyou are a sentiment analist. guess ...
598296Everyone likes [NAME].love###Human:\\nyou are a sentiment analist. guess ...
598297The FDA has plenty to criticize. But like here...anger###Human:\\nyou are a sentiment analist. guess ...
\n", "


\n", "
" ], "text/plain": [ " sentence label \\\n", "0 I'll throw out the garbage . neutral \n", "1 So Dick , how about getting some coffee for to... joy \n", "2 Come on , you can at least try a little , besi... neutral \n", "3 What ’ s wrong with that ? Cigarette is the th... anger \n", "4 Not for me , Dick . neutral \n", "... ... ... \n", "598293 You got banned for participating in a brigade. sadness \n", "598294 A joke is subjective pal, second of all you ne... joy \n", "598295 Well, I'm glad you're out of all that now. How... joy \n", "598296 Everyone likes [NAME]. love \n", "598297 The FDA has plenty to criticize. But like here... anger \n", "\n", " text \n", "0 ###Human:\\nyou are a sentiment analist. guess ... \n", "1 ###Human:\\nyou are a sentiment analist. guess ... \n", "2 ###Human:\\nyou are a sentiment analist. guess ... \n", "3 ###Human:\\nyou are a sentiment analist. guess ... \n", "4 ###Human:\\nyou are a sentiment analist. guess ... \n", "... ... \n", "598293 ###Human:\\nyou are a sentiment analist. guess ... \n", "598294 ###Human:\\nyou are a sentiment analist. guess ... \n", "598295 ###Human:\\nyou are a sentiment analist. guess ... \n", "598296 ###Human:\\nyou are a sentiment analist. guess ... \n", "598297 ###Human:\\nyou are a sentiment analist. guess ... \n", "\n", "[598298 rows x 3 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# imports for model creation\n", "import tensorflow as tf\n", "from keras import layers\n", "from keras import losses\n", "import keras\n", "from tensorflow.keras.preprocessing.text import Tokenizer\n", "from tensorflow.keras.preprocessing.sequence import pad_sequences" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Tokenization of dataset\n", "tokenizer = Tokenizer()\n", "tokenizer.fit_on_texts(train_dataset['sentence'])\n", "\n", "vocab_size = len(tokenizer.word_index) + 1 # our dataset vocab size (space split)\n", "max_length = 200 # max words in a sentence\n", "embedding_dim = 50 # TODO: need to adjust accordingly\n", "\n", "X = tokenizer.texts_to_sequences(train_dataset['sentence'])\n", "X = pad_sequences(X, maxlen=max_length, padding='post')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Encode the lables\n", "labels = train_dataset['label'].map({'neutral': '1', 'joy': '2', 'sadness': '3', 'anger': '4', 'fear': '5', 'love': '6', 'surprise': '7'}).astype('float32').values" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Build the model\n", "model = keras.Sequential([\n", " keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_shape=(max_length,)),\n", " keras.layers.GlobalAveragePooling1D(),\n", " keras.layers.Dense(16, activation='relu'),\n", " keras.layers.Dense(1, activation='sigmoid')\n", "])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Compile the model\n", "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# split the dataset into train and test\n", "from sklearn.model_selection import train_test_split\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42, 
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# train the model\n", "model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Evaluate the model\n", "loss, accuracy = model.evaluate(X_test, y_test)\n", "accuracy" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# save the model, creating the target directory if it does not exist yet\n", "import os\n", "\n", "os.makedirs(\"../models\", exist_ok=True)\n", "model.save(\"../models/sentimental-analysis-llama2.keras\")" ] }
 ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 2 }