{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Italian sentiment classifier (NGT dataset)\n",
    "\n",
    "Pre-processes the text with NLTK, encodes it as padded integer sequences with a Keras `Tokenizer`, trains a small dense network, and finally classifies a sentence typed by the user.\n",
    "\n",
    "Prerequisites: the NLTK data used by `word_tokenize`, the `stopwords` corpus and `wordnet` must already be installed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports in one place so a fresh kernel can run the notebook top to bottom.\n",
    "import string\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from keras.layers import Dense\n",
    "from keras.models import Sequential, load_model\n",
    "from keras.preprocessing.sequence import pad_sequences\n",
    "from keras.preprocessing.text import Tokenizer\n",
    "from keras.utils import set_random_seed\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "from nltk.tokenize import word_tokenize\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration: everything a reader might want to tune.\n",
    "SEED = 42\n",
    "# Seed the random number generators so the split and the training run are repeatable.\n",
    "set_random_seed(SEED)\n",
    "\n",
    "DATASET_PATH = 'ngt_sentiment_dataset/ngt_lang_dataset.csv'\n",
    "MODEL_PATH = 'model.keras'\n",
    "\n",
    "maxlen = 100       # every sequence is padded/truncated to this many tokens\n",
    "max_words = 10000  # vocabulary cap passed to the tokenizer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Text pre-processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Built once: stopwords.words() returns a new list on every call and list\n",
    "# membership is linear, so evaluating it per token is needlessly slow.\n",
    "ITALIAN_STOPWORDS = frozenset(stopwords.words('italian'))\n",
    "# NOTE(review): WordNetLemmatizer is backed by the English WordNet, so it\n",
    "# presumably leaves most Italian words unchanged - confirm this is intended.\n",
    "LEMMATIZER = WordNetLemmatizer()\n",
    "PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)\n",
    "\n",
    "\n",
    "def preprocess_text(text):\n",
    "    \"\"\"Return the pre-processed form of ``text``.\n",
    "\n",
    "    Steps: lower-case and tokenize, drop Italian stop words, lemmatize each\n",
    "    remaining token, join the tokens with single spaces and finally strip\n",
    "    every ASCII punctuation character from the joined string.\n",
    "\n",
    "    Args:\n",
    "        text: the raw sentence as a ``str``.\n",
    "\n",
    "    Returns:\n",
    "        The cleaned sentence as a ``str``.\n",
    "    \"\"\"\n",
    "    tokens = word_tokenize(text.lower())\n",
    "    kept_tokens = [token for token in tokens if token not in ITALIAN_STOPWORDS]\n",
    "    lemmatized_tokens = [LEMMATIZER.lemmatize(token) for token in kept_tokens]\n",
    "    # Punctuation is removed after joining, so a token made only of\n",
    "    # punctuation leaves an extra space behind (same as before).\n",
    "    return ' '.join(lemmatized_tokens).translate(PUNCTUATION_TABLE)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NGT dataset: the 'text' column holds the sentences, 'tag' holds the labels.\n",
    "df_ngt = pd.read_csv(DATASET_PATH)\n",
    "\n",
    "print(df_ngt.describe())\n",
    "\n",
    "X_ngt = df_ngt.text.apply(preprocess_text)\n",
    "y_ngt = df_ngt.tag\n",
    "\n",
    "print(X_ngt[0])\n",
    "print(y_ngt[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Vectorize the text and split into train / test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Turn each cleaned sentence into a sequence of integer word indices.\n",
    "tokenizer = Tokenizer(num_words=max_words)\n",
    "tokenizer.fit_on_texts(X_ngt)\n",
    "sequences = tokenizer.texts_to_sequences(X_ngt)\n",
    "word_index = tokenizer.word_index\n",
    "print('Found %s unique tokens' % len(word_index))\n",
    "\n",
    "# New names instead of overwriting X_ngt / y_ngt, so re-running this cell does\n",
    "# not try to fit the tokenizer on already-encoded arrays.\n",
    "X_padded = pad_sequences(sequences, maxlen=maxlen)\n",
    "y_labels = np.asarray(y_ngt)\n",
    "\n",
    "# train_test_split already shuffles; random_state makes the split repeatable.\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X_padded, y_labels, test_size=0.2, shuffle=True, random_state=SEED\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: one encoded sample and its label from each split.\n",
    "print(X_train[0])\n",
    "print(y_train[0])\n",
    "\n",
    "print(X_test[0])\n",
    "print(y_test[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): the Dense layers receive the raw integer word indices as if\n",
    "# they were numeric features; an Embedding layer in front is the usual way to\n",
    "# feed token ids to a network. Architecture left unchanged here.\n",
    "model = Sequential()\n",
    "\n",
    "model.add(Dense(512, activation='relu'))\n",
    "model.add(Dense(32, activation='relu'))\n",
    "model.add(Dense(1, activation='sigmoid'))\n",
    "\n",
    "model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])\n",
    "\n",
    "# The held-out test split doubles as the validation set during training.\n",
    "history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training curves"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot accuracy and loss per epoch for the training and validation data.\n",
    "acc = history.history['acc']\n",
    "val_acc = history.history['val_acc']\n",
    "loss = history.history['loss']\n",
    "val_loss = history.history['val_loss']\n",
    "\n",
    "epochs = range(1, len(acc) + 1)\n",
    "\n",
    "plt.plot(epochs, acc, 'bo', label='Training acc')\n",
    "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n",
    "plt.title('Training and validation accuracy')\n",
    "plt.legend()\n",
    "\n",
    "plt.figure()\n",
    "\n",
    "plt.plot(epochs, loss, 'bo', label='Training loss')\n",
    "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n",
    "plt.title('Training and validation loss')\n",
    "plt.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save the model and try it on a sentence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the trained model to disk.\n",
    "model.save(MODEL_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Interactive test: classify a sentence typed by the user.\n",
    "loaded_model = load_model(MODEL_PATH)\n",
    "\n",
    "sentence = input(\"Enter the sentence: \")\n",
    "cleaned_sentence = preprocess_text(sentence)\n",
    "# Encode with the tokenizer fitted on the training corpus. A brand-new\n",
    "# Tokenizer() has an empty vocabulary, so it would map every sentence to an\n",
    "# empty sequence and the model would always see an all-zero input.\n",
    "encoded_sentence = tokenizer.texts_to_sequences([cleaned_sentence])\n",
    "test = pad_sequences(encoded_sentence, maxlen=maxlen)\n",
    "yhat = loaded_model.predict(test)\n",
    "\n",
    "threshold = 0.5\n",
    "# predict() returns an array with one row and one column; take the scalar out\n",
    "# explicitly instead of relying on implicit array-to-scalar conversion.\n",
    "positive_probability = float(yhat[0][0])\n",
    "\n",
    "if positive_probability > threshold:\n",
    "    print('POSITIVO', int(positive_probability * 100), '%')\n",
    "else:\n",
    "    print('NEGATIVO', int((1 - positive_probability) * 100), '%')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "2.7.18"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}