girolamodiceglie committed on
Commit
e7c153b
1 Parent(s): 66d06d9

Upload sentiment.ipynb

Files changed (1)
  1. sentiment.ipynb +438 -0
sentiment.ipynb ADDED
@@ -0,0 +1,438 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Data pre-processing: the function takes a string and returns its pre-processed form\n",
+ "from nltk.tokenize import word_tokenize\n",
+ "from nltk.corpus import stopwords\n",
+ "from nltk.stem import WordNetLemmatizer\n",
+ "import string\n",
+ "\n",
+ "def preprocess_text(text):\n",
+ "    # Lowercase and tokenize the text\n",
+ "    tokens = word_tokenize(text.lower())\n",
+ "    # Remove Italian stop words\n",
+ "    filtered_tokens = [token for token in tokens if token not in stopwords.words('italian')]\n",
+ "    # Lemmatization\n",
+ "    lemmatizer = WordNetLemmatizer()\n",
+ "    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]\n",
+ "    # Join the lemmatized tokens back into a single string\n",
+ "    processed_text = ' '.join(lemmatized_tokens)\n",
+ "    # Remove punctuation\n",
+ "    return processed_text.translate(str.maketrans('','', string.punctuation))"
+ ]
+ },
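+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Minimal usage sketch of preprocess_text on a sample Italian sentence;\n",
+ "# it assumes the NLTK 'punkt', 'stopwords', 'wordnet' and 'omw-1.4' resources are already available.\n",
+ "example = preprocess_text('Questo film è davvero bello, lo consiglio a tutti!')\n",
+ "print(example)"
+ ]
+ },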
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import nltk\n",
+ "# Download the NLTK resources used by preprocess_text\n",
+ "nltk.download(['punkt', 'stopwords', 'wordnet', 'omw-1.4'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the dataframe\n",
+ "import pandas as pd\n",
+ "\n",
+ "# NGT dataset\n",
+ "df_ngt = pd.read_csv('ngt_sentiment_dataset/ngt_lang_dataset.csv')\n",
+ "\n",
+ "print(df_ngt.describe())\n",
+ "\n",
+ "X_ngt = df_ngt.text.apply(preprocess_text)\n",
+ "y_ngt = df_ngt.tag\n",
+ "\n",
+ "print(X_ngt[0])\n",
+ "print(y_ngt[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Text vectorization via tokenization\n",
+ "from keras.preprocessing.text import Tokenizer\n",
+ "from keras.preprocessing.sequence import pad_sequences\n",
+ "import numpy as np\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "maxlen = 100\n",
+ "max_words = 10000\n",
+ "\n",
+ "tokenizer = Tokenizer(num_words=max_words)\n",
+ "tokenizer.fit_on_texts(X_ngt)\n",
+ "sequences = tokenizer.texts_to_sequences(X_ngt)\n",
+ "word_index = tokenizer.word_index\n",
+ "print('Found %s unique tokens' % len(word_index))\n",
+ "\n",
+ "X_ngt = pad_sequences(sequences, maxlen=maxlen)\n",
+ "\n",
+ "y_ngt = np.asarray(y_ngt)\n",
+ "\n",
+ "indices = np.arange(X_ngt.shape[0])\n",
+ "\n",
+ "np.random.shuffle(indices)\n",
+ "X_ngt = X_ngt[indices]\n",
+ "y_ngt = y_ngt[indices]\n",
+ "\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X_ngt, y_ngt, test_size=0.2, shuffle=True)"
+ ]
+ },
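+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch of how a new sentence maps to a padded sequence with the tokenizer fitted above;\n",
+ "# the sentence is only an illustrative example, words unseen by fit_on_texts simply get no index.\n",
+ "demo = tokenizer.texts_to_sequences([preprocess_text('Una frase di esempio')])\n",
+ "print(pad_sequences(demo, maxlen=maxlen))"
+ ]
+ },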
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "sentence = input(\"Enter the sentence: \")\n",
+ "\n",
+ "preprocess_text(sentence)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "df_train = pd.read_csv('recensioni_train.csv')\n",
+ "df_test = pd.read_csv('recensioni_test.csv')\n",
+ "\n",
+ "X_train = df_train['text'].apply(preprocess_text)\n",
+ "X_test = df_test['text'].apply(preprocess_text)\n",
+ "\n",
+ "tags_train = df_train['tag']\n",
+ "tags_test = df_test['tag']\n",
+ "\n",
+ "y_train = []\n",
+ "y_test = []\n",
+ "\n",
+ "# Train: map 'pos' to 1 and everything else to 0\n",
+ "for e in tags_train:\n",
+ "    if e=='pos':\n",
+ "        y_train.append(1)\n",
+ "    else:\n",
+ "        y_train.append(0)\n",
+ "\n",
+ "# Test: same mapping for the test labels\n",
+ "for e in tags_test:\n",
+ "    if e=='pos':\n",
+ "        y_test.append(1)\n",
+ "    else:\n",
+ "        y_test.append(0)\n",
+ "\n"
+ ]
+ },
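+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Equivalent, more compact pandas sketch for the same 'pos'/'neg' to 1/0 mapping;\n",
+ "# the loops above already produce the same labels, this is only an alternative.\n",
+ "y_train = (tags_train == 'pos').astype(int).tolist()\n",
+ "y_test = (tags_test == 'pos').astype(int).tolist()"
+ ]
+ },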
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#######################\n",
+ "\n",
+ "tokenizer_train = Tokenizer(num_words=10000)\n",
+ "tokenizer_train.fit_on_texts(X_train)\n",
+ "sequences_train = tokenizer_train.texts_to_sequences(X_train)\n",
+ "word_index_train = tokenizer_train.word_index\n",
+ "print('Found %s unique tokens' % len(word_index_train))\n",
+ "\n",
+ "print(X_train[0])\n",
+ "print(y_train[0])\n",
+ "\n",
+ "#######################\n",
+ "\n",
+ "tokenizer_test = Tokenizer(num_words=10000)\n",
+ "tokenizer_test.fit_on_texts(X_test)\n",
+ "sequences_test = tokenizer_test.texts_to_sequences(X_test)\n",
+ "word_index_test = tokenizer_test.word_index\n",
+ "print('Found %s unique tokens' % len(word_index_test))\n",
+ "\n",
+ "print(X_test[0])\n",
+ "print(y_test[0])"
+ ]
+ },
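+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Alternative sketch: fit a single tokenizer on the training texts and reuse it on the test texts,\n",
+ "# so both splits share one word index (the two separate tokenizers above assign different indices\n",
+ "# to the same words). The *_shared names are illustrative only.\n",
+ "shared_tokenizer = Tokenizer(num_words=10000)\n",
+ "shared_tokenizer.fit_on_texts(X_train)\n",
+ "sequences_train_shared = shared_tokenizer.texts_to_sequences(X_train)\n",
+ "sequences_test_shared = shared_tokenizer.texts_to_sequences(X_test)"
+ ]
+ },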
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# NGT dataset\n",
+ "\n",
+ "tokenizer_ngt = Tokenizer(num_words=10000)\n",
+ "tokenizer_ngt.fit_on_texts(X_ngt)\n",
+ "sequences_ngt = tokenizer_ngt.texts_to_sequences(X_ngt)\n",
+ "word_index_ngt = tokenizer_ngt.word_index\n",
+ "print('Found %s unique tokens' % len(word_index_ngt))\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# NGT dataset\n",
+ "\n",
+ "X_ngt = pad_sequences(sequences_ngt, maxlen=maxlen)\n",
+ "y_ngt = np.asarray(y_ngt)\n",
+ "indices_ngt = np.arange(X_ngt.shape[0])\n",
+ "\n",
+ "\n",
+ "np.random.shuffle(indices_ngt)\n",
+ "X_ngt = X_ngt[indices_ngt]\n",
+ "y_ngt = y_ngt[indices_ngt]\n",
+ "\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X_ngt, y_ngt, test_size=0.2)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(X_train[0])\n",
+ "print(y_train[0])\n",
+ "\n",
+ "print(X_test[0])\n",
+ "print(y_test[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Pad train and test to the same fixed length so the model sees a consistent input shape\n",
+ "X_train = pad_sequences(sequences_train, maxlen=maxlen)\n",
+ "y_train = np.asarray(y_train)\n",
+ "indices_train = np.arange(X_train.shape[0])\n",
+ "\n",
+ "\n",
+ "X_test = pad_sequences(sequences_test, maxlen=maxlen)\n",
+ "y_test = np.asarray(y_test)\n",
+ "indices_test = np.arange(X_test.shape[0])\n",
+ "\n",
+ "print(indices_train)\n",
+ "print(X_train[0])\n",
+ "print(y_train[0])\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "np.random.shuffle(indices_train)\n",
+ "X_train = X_train[indices_train]\n",
+ "y_train = y_train[indices_train]\n",
+ "\n",
+ "\n",
+ "np.random.shuffle(indices_test)\n",
+ "X_test = X_test[indices_test]\n",
+ "y_test = y_test[indices_test]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(X_train.shape)\n",
+ "\n",
+ "print(X_train.dtype)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from keras.models import Sequential\n",
+ "from keras.layers import Dense\n",
+ "\n",
+ "model = Sequential()\n",
+ "\n",
+ "model.add(Dense(512, activation='relu'))\n",
+ "model.add(Dense(32, activation='relu'))\n",
+ "model.add(Dense(1, activation='sigmoid'))\n",
+ "\n",
+ "#model.compile(optimizer='SGD', loss='binary_crossentropy', metrics=['acc'])\n",
+ "#model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])\n",
+ "model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])\n",
+ "\n",
+ "history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))"
+ ]
+ },
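+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Quick sketch: evaluate the trained model on the held-out split used above as validation data.\n",
+ "test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)\n",
+ "print('Test loss:', test_loss)\n",
+ "print('Test accuracy:', test_acc)"
+ ]
+ },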
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.summary()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Plot the training results\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "acc = history.history['acc']\n",
+ "val_acc = history.history['val_acc']\n",
+ "loss = history.history['loss']\n",
+ "val_loss = history.history['val_loss']\n",
+ "\n",
+ "epochs = range(1, len(acc) + 1)\n",
+ "\n",
+ "plt.plot(epochs, acc, 'bo', label='Training acc')\n",
+ "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n",
+ "plt.title('Training and validation accuracy')\n",
+ "plt.legend()\n",
+ "\n",
+ "plt.figure()\n",
+ "\n",
+ "plt.plot(epochs, loss, 'bo', label='Training loss')\n",
+ "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n",
+ "plt.title('Training and validation loss')\n",
+ "plt.legend()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "# Save the model\n",
+ "\n",
+ "model.save('binary.keras')\n",
+ "\n"
+ ]
+ },
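+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: reload the file saved above to check that it round-trips correctly.\n",
+ "from keras.models import load_model\n",
+ "reloaded = load_model('binary.keras')\n",
+ "reloaded.summary()"
+ ]
+ },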
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# NGT dataset\n",
+ "# model.add(Dense(512, activation='relu'))\n",
+ "# model.add(Dense(8, activation='relu'))\n",
+ "# model.add(Dense(1, activation='sigmoid'))\n",
+ "\n",
+ "\n",
+ "# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])\n",
+ "\n",
+ "# history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))\n",
+ "\n",
+ "\n",
+ "# Epoch 10/10\n",
+ "# 100/100 [==============================] - 0s 3ms/step - loss: 0.6099 - acc: 0.6712 - val_loss: 0.6311 - val_acc: 0.6525\n",
+ "\n",
+ "\n",
+ "################################################\n",
+ "\n",
+ "\n",
+ "# Other dataset\n",
+ "# model.add(Dense(512, activation='relu'))\n",
+ "# model.add(Dense(32, activation='relu'))\n",
+ "# model.add(Dense(1, activation='sigmoid'))\n",
+ "\n",
+ "# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])\n",
+ "\n",
+ "# history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))\n",
+ "\n",
+ "# Epoch 5/5\n",
+ "# 63/63 [==============================] - 0s 3ms/step - loss: 0.5344 - acc: 0.7185 - val_loss: 0.5255 - val_acc: 0.7525"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1/1 [==============================] - 0s 51ms/step\n",
+ "NEGATIVO 58 %\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Test\n",
+ "\n",
+ "# Load the saved model\n",
+ "from keras.models import load_model\n",
+ "from keras.preprocessing.sequence import pad_sequences\n",
+ "from keras.preprocessing.text import Tokenizer\n",
+ "\n",
+ "loaded_model = load_model('sentiment_dfngt.keras')\n",
+ "\n",
+ "sentence = input(\"Enter the sentence: \")\n",
+ "sequence = preprocess_text(sentence)\n",
+ "# Reuse the tokenizer fitted on the NGT texts; a fresh Tokenizer() here would have an empty vocabulary\n",
+ "sequence = tokenizer.texts_to_sequences([sequence])\n",
+ "test = pad_sequences(sequence, maxlen=100)\n",
+ "yhat = loaded_model.predict(test)\n",
+ "\n",
+ "score = float(yhat[0][0])\n",
+ "threshold = 0.5\n",
+ "\n",
+ "if score > threshold:\n",
+ "    print('POSITIVO', int(score*100), '%')\n",
+ "else:\n",
+ "    print('NEGATIVO', int((1-score)*100), '%')"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.5"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }