In [None]:
# Data preprocessing: preprocess_text receives a raw string and returns its preprocessed form.
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

def preprocess_text(text):
    """Normalize a raw sentence into a cleaned, space-joined token string.

    Steps: lowercase and tokenize the text, drop Italian stop words,
    lemmatize each remaining token, join the tokens with single spaces and
    finally strip every ASCII punctuation character from the joined string.

    Args:
        text: The raw input string.

    Returns:
        The processed string. Tokens made only of punctuation become empty
        after the final step, so the result may contain consecutive spaces.
    """
    # Lowercase before tokenizing so stop-word matching is case-insensitive.
    tokens = word_tokenize(text.lower())
    # Build the stop-word set once per call instead of re-reading the list
    # for every token; set membership is also a constant-time lookup.
    italian_stopwords = set(stopwords.words('italian'))
    filtered_tokens = [token for token in tokens if token not in italian_stopwords]
    # NOTE(review): WordNetLemmatizer is backed by the English WordNet, so it
    # presumably leaves most Italian tokens unchanged -- confirm whether an
    # Italian-aware lemmatizer/stemmer is intended here.
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    processed_text = ' '.join(lemmatized_tokens)
    # Remove punctuation from the joined text (string.punctuation is ASCII only).
    return processed_text.translate(str.maketrans('', '', string.punctuation))

In [None]:
# Load the dataframe
import pandas as pd

# NGT dataset
df_ngt = pd.read_csv('ngt_sentiment_dataset/ngt_lang_dataset.csv')

# Descriptive statistics of the loaded frame
print(df_ngt.describe())

# Inputs: the preprocessed 'text' column; targets: the 'tag' column
X_ngt = df_ngt['text'].apply(preprocess_text)
y_ngt = df_ngt['tag']

# Show the first preprocessed text and its tag, one per line
print(X_ngt[0], y_ngt[0], sep='\n')

In [None]:
# Text vectorization: turn each preprocessed text into a fixed-length
# sequence of integer word indices, then split into train and test sets.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.model_selection import train_test_split

maxlen = 100       # every sequence is padded/truncated to this length
max_words = 10000  # vocabulary cap passed to the tokenizer as num_words
random_seed = 42   # fixed seed so the train/test split is reproducible

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_ngt)
sequences = tokenizer.texts_to_sequences(X_ngt)
word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

# NOTE: X_ngt / y_ngt are rebound here from the text/label columns to arrays,
# so re-running this cell requires re-running the data-loading cell first.
X_ngt = pad_sequences(sequences, maxlen=maxlen)
# NOTE(review): the binary cross-entropy loss used later assumes the tags
# are numeric 0/1 -- confirm against the dataset.
y_ngt = np.asarray(y_ngt)

# train_test_split already shuffles the rows, so no separate manual shuffle
# is needed; random_state makes that shuffle repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_ngt, y_ngt, test_size=0.2, shuffle=True, random_state=random_seed
)

In [None]:
# Show the first sample and label of the training split, then of the test
# split, each on its own line.
print(
    X_train[0],
    y_train[0],
    X_test[0],
    y_test[0],
    sep='\n',
)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, GlobalAveragePooling1D

embedding_dim = 64  # size of the learned vector for each word index

model = Sequential()

# The inputs are integer word indices (categorical ids), not numeric
# magnitudes, so they are mapped to learned vectors before the Dense stack
# instead of being fed to it directly. input_dim=max_words matches the
# num_words cap given to the tokenizer.
model.add(Embedding(max_words, embedding_dim))
# Average the word vectors over the sequence axis into one vector per sample.
model.add(GlobalAveragePooling1D())
model.add(Dense(512, activation='relu'))
model.add(Dense(32, activation='relu'))
# Single sigmoid unit: the output is read as the probability of the positive class.
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# NOTE(review): the test split doubles as validation data, so the validation
# metrics are not measured on an independent hold-out set.
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

In [None]:
# Print a summary of the model's layers.
model.summary()

In [None]:
# Plot the training results
import matplotlib.pyplot as plt

# Pull the per-epoch metric lists recorded by model.fit
acc, val_acc, loss, val_loss = (
    history.history[key] for key in ('acc', 'val_acc', 'loss', 'val_loss')
)

# Epochs are numbered from 1 on the x axis
epochs = range(1, len(acc) + 1)

# Accuracy curves on the current axes (dots = training, line = validation)
accuracy_axes = plt.gca()
accuracy_axes.plot(epochs, acc, 'bo', label='Training acc')
accuracy_axes.plot(epochs, val_acc, 'b', label='Validation acc')
accuracy_axes.set_title('Training and validation accuracy')
accuracy_axes.legend()

# Loss curves on a second, new figure
loss_axes = plt.figure().gca()
loss_axes.plot(epochs, loss, 'bo', label='Training loss')
loss_axes.plot(epochs, val_loss, 'b', label='Validation loss')
loss_axes.set_title('Training and validation loss')
loss_axes.legend()

plt.show()

In [None]:

# Save the trained model to 'model.keras' so it can be reloaded later with load_model.
model.save('model.keras')

In [None]:
# Test: classify a sentence typed by the user with the saved model

# Load model
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

loaded_model = load_model('model.keras')

sentence = input("Enter the sentence: ")
sequence = preprocess_text(sentence)
# Encode with the tokenizer that was fitted on the training texts. A freshly
# created Tokenizer() has an empty vocabulary, so it would map every sentence
# to an empty sequence (all padding) and the prediction would ignore the input.
# NOTE: to run this cell in a fresh kernel, the fitted tokenizer has to be
# persisted and reloaded alongside the model.
sequence = tokenizer.texts_to_sequences([sequence])
# Pad to the same length used for the training sequences.
test = pad_sequences(sequence, maxlen=maxlen)
yhat = loaded_model.predict(test)

# One input row and a single output unit: take the lone value as a Python
# float instead of relying on implicit array-to-scalar conversion.
positive_prob = float(yhat[0][0])

threshold = 0.5

if positive_prob > threshold:
    print('POSITIVO', int(positive_prob * 100), '%')
else:
    print('NEGATIVO', int((1 - positive_prob) * 100), '%')