File size: 6,020 Bytes

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Text pre-processing: takes a raw string and returns its normalized form\n",
    "from functools import lru_cache\n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "import string\n",
    "\n",
    "# Translation table that deletes every ASCII punctuation character.\n",
    "_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)\n",
    "\n",
    "\n",
    "@lru_cache(maxsize=None)\n",
    "def _stopword_set(language):\n",
    "    \"\"\"Return the NLTK stop words for `language` as a frozenset, loaded once.\"\"\"\n",
    "    return frozenset(stopwords.words(language))\n",
    "\n",
    "\n",
    "def preprocess_text(text):\n",
    "    \"\"\"Normalize a sentence before it is vectorized.\n",
    "\n",
    "    Steps: lowercase and tokenize, drop Italian stop words, lemmatize each\n",
    "    token, strip punctuation, and join the surviving tokens with single spaces.\n",
    "\n",
    "    Args:\n",
    "        text: Raw input string.\n",
    "\n",
    "    Returns:\n",
    "        The cleaned text as one space-separated string. Tokens that are empty\n",
    "        after punctuation removal are dropped instead of leaving stray spaces.\n",
    "    \"\"\"\n",
    "    tokens = word_tokenize(text.lower())\n",
    "    # Cached set: avoids re-reading the stop-word corpus for every token.\n",
    "    stop_words = _stopword_set('italian')\n",
    "    lemmatizer = WordNetLemmatizer()\n",
    "    # NOTE(review): WordNet is an English lexicon, so Italian tokens most likely\n",
    "    # pass through unchanged - confirm whether an Italian lemmatizer is wanted.\n",
    "    cleaned_tokens = []\n",
    "    for token in tokens:\n",
    "        if token in stop_words:\n",
    "            continue\n",
    "        token = lemmatizer.lemmatize(token).translate(_PUNCTUATION_TABLE)\n",
    "        if token:\n",
    "            cleaned_tokens.append(token)\n",
    "    return ' '.join(cleaned_tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the dataset into a DataFrame\n",
    "import pandas as pd\n",
    "\n",
    "# NGT dataset\n",
    "df_ngt = pd.read_csv('ngt_sentiment_dataset/ngt_lang_dataset.csv')\n",
    "\n",
    "print(df_ngt.describe())\n",
    "\n",
    "# Features are the pre-processed 'text' column; labels are the 'tag' column\n",
    "X_ngt = df_ngt['text'].apply(preprocess_text)\n",
    "y_ngt = df_ngt['tag']\n",
    "\n",
    "# Sanity check: first pre-processed sample followed by its label\n",
    "print(X_ngt[0], y_ngt[0], sep='\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Vectorize the text: fit a tokenizer, encode to integer sequences, pad, split\n",
    "from keras.preprocessing.text import Tokenizer\n",
    "from keras.preprocessing.sequence import pad_sequences\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "maxlen = 100       # length every sequence is padded or truncated to\n",
    "max_words = 10000  # vocabulary cap passed to the tokenizer as num_words\n",
    "seed = 42          # fixed seed so the train/test split is reproducible\n",
    "\n",
    "tokenizer = Tokenizer(num_words=max_words)\n",
    "tokenizer.fit_on_texts(X_ngt)\n",
    "sequences = tokenizer.texts_to_sequences(X_ngt)\n",
    "word_index = tokenizer.word_index\n",
    "print('Found %s unique tokens' % len(word_index))\n",
    "\n",
    "# Bind the arrays to new names instead of overwriting X_ngt / y_ngt, so this\n",
    "# cell can be re-run without feeding padded integer arrays to the tokenizer.\n",
    "X_padded = pad_sequences(sequences, maxlen=maxlen)\n",
    "y_labels = np.asarray(y_ngt)\n",
    "\n",
    "# train_test_split already shuffles; a separate manual shuffle is redundant.\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X_padded, y_labels, test_size=0.2, shuffle=True, random_state=seed\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the first sample and its label from the training and the test split\n",
    "print(X_train[0], y_train[0], sep='\\n')\n",
    "print(X_test[0], y_test[0], sep='\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from keras.models import Sequential\n",
    "from keras.layers import Dense, Embedding, Flatten\n",
    "\n",
    "embedding_dim = 32  # size of the learned vector for each token id\n",
    "\n",
    "model = Sequential()\n",
    "\n",
    "# The inputs are integer token ids, which are arbitrary vocabulary indices\n",
    "# rather than magnitudes. The Embedding layer maps each id to a learned dense\n",
    "# vector, and Flatten concatenates those vectors for the dense layers.\n",
    "model.add(Embedding(max_words, embedding_dim))\n",
    "model.add(Flatten())\n",
    "model.add(Dense(512, activation='relu'))\n",
    "model.add(Dense(32, activation='relu'))\n",
    "model.add(Dense(1, activation='sigmoid'))\n",
    "\n",
    "# Binary cross-entropy pairs with the single sigmoid output unit.\n",
    "model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])\n",
    "\n",
    "# NOTE(review): the test split doubles as validation data here - confirm that\n",
    "# no separate held-out evaluation is expected.\n",
    "history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print a summary of the model's layers\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot the training history\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "recorded = history.history\n",
    "acc, val_acc = recorded['acc'], recorded['val_acc']\n",
    "loss, val_loss = recorded['loss'], recorded['val_loss']\n",
    "\n",
    "# One x position per recorded epoch, counting from 1\n",
    "epochs = range(1, len(acc) + 1)\n",
    "\n",
    "# Each entry: training curve, validation curve, their legend labels, figure title\n",
    "panels = (\n",
    "    (acc, val_acc, 'Training acc', 'Validation acc', 'Training and validation accuracy'),\n",
    "    (loss, val_loss, 'Training loss', 'Validation loss', 'Training and validation loss'),\n",
    ")\n",
    "\n",
    "for panel_number, (train_curve, val_curve, train_label, val_label, title) in enumerate(panels):\n",
    "    if panel_number:\n",
    "        # Every panel after the first is drawn on its own new figure\n",
    "        plt.figure()\n",
    "    plt.plot(epochs, train_curve, 'bo', label=train_label)\n",
    "    plt.plot(epochs, val_curve, 'b', label=val_label)\n",
    "    plt.title(title)\n",
    "    plt.legend()\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the model to disk\n",
    "model.save('model.keras')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Interactive test: classify a sentence typed by the user\n",
    "\n",
    "# Load the saved model\n",
    "from keras.models import load_model\n",
    "from keras.preprocessing.sequence import pad_sequences\n",
    "from keras.preprocessing.text import Tokenizer\n",
    "\n",
    "loaded_model = load_model('model.keras')\n",
    "\n",
    "sentence = input(\"Enter the sentence: \")\n",
    "cleaned_sentence = preprocess_text(sentence)\n",
    "# Reuse the tokenizer fitted in the vectorization cell: a fresh Tokenizer()\n",
    "# has an empty vocabulary and would encode every sentence as an empty sequence,\n",
    "# making the prediction independent of the input.\n",
    "sequence = tokenizer.texts_to_sequences([cleaned_sentence])\n",
    "# Pad to the same length that was used for the training sequences.\n",
    "test = pad_sequences(sequence, maxlen=maxlen)\n",
    "yhat = loaded_model.predict(test)\n",
    "\n",
    "# One sentence and one output unit: take the single predicted value as a plain\n",
    "# float instead of relying on implicit array-to-scalar conversion.\n",
    "positive_probability = float(yhat[0][0])\n",
    "\n",
    "threshold = 0.5\n",
    "\n",
    "if positive_probability > threshold:\n",
    "    print('POSITIVO', int(positive_probability * 100), '%')\n",
    "else:\n",
    "    print('NEGATIVO', int((1 - positive_probability) * 100), '%')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "2.7.18"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}