girolamodiceglie committed on
Commit
e7c153b
1 Parent(s): 66d06d9

Upload sentiment.ipynb

Files changed (1)
  1. sentiment.ipynb +438 -0
sentiment.ipynb ADDED
@@ -0,0 +1,438 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Data pre-processing: the function takes a string and returns its pre-processed form\n",
+ "from nltk.tokenize import word_tokenize\n",
+ "from nltk.corpus import stopwords\n",
+ "from nltk.stem import WordNetLemmatizer\n",
+ "import string\n",
+ "\n",
+ "def preprocess_text(text):\n",
+ "    # Lowercase and tokenize the text\n",
+ "    tokens = word_tokenize(text.lower())\n",
+ "    # Remove Italian stop words\n",
+ "    filtered_tokens = [token for token in tokens if token not in stopwords.words('italian')]\n",
+ "    # Lemmatization\n",
+ "    lemmatizer = WordNetLemmatizer()\n",
+ "    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]\n",
+ "    # Join the lemmatized tokens back into a single string\n",
+ "    processed_text = ' '.join(lemmatized_tokens)\n",
+ "    # Remove punctuation\n",
+ "    return processed_text.translate(str.maketrans('','', string.punctuation))"
+ ]
+ },
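+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Minimal usage sketch of preprocess_text on a sample Italian sentence;\n",
+ "# it assumes the NLTK 'punkt', 'stopwords', 'wordnet' and 'omw-1.4' resources are already available.\n",
+ "example = preprocess_text('Questo film è davvero bello, lo consiglio a tutti!')\n",
+ "print(example)"
+ ]
+ },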
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import nltk\n",
+ "# Download the NLTK resources used by preprocess_text\n",
+ "nltk.download(['punkt', 'stopwords', 'wordnet', 'omw-1.4'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the dataframe\n",
+ "import pandas as pd\n",
+ "\n",
+ "# NGT dataset\n",
+ "df_ngt = pd.read_csv('ngt_sentiment_dataset/ngt_lang_dataset.csv')\n",
+ "\n",
+ "print(df_ngt.describe())\n",
+ "\n",
+ "X_ngt = df_ngt.text.apply(preprocess_text)\n",
+ "y_ngt = df_ngt.tag\n",
+ "\n",
+ "print(X_ngt[0])\n",
+ "print(y_ngt[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Text vectorization via tokenization\n",
+ "from keras.preprocessing.text import Tokenizer\n",
+ "from keras.preprocessing.sequence import pad_sequences\n",
+ "import numpy as np\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "maxlen = 100\n",
+ "max_words = 10000\n",
+ "\n",
+ "tokenizer = Tokenizer(num_words=max_words)\n",
+ "tokenizer.fit_on_texts(X_ngt)\n",
+ "sequences = tokenizer.texts_to_sequences(X_ngt)\n",
+ "word_index = tokenizer.word_index\n",
+ "print('Found %s unique tokens' % len(word_index))\n",
+ "\n",
+ "X_ngt = pad_sequences(sequences, maxlen=maxlen)\n",
+ "\n",
+ "y_ngt = np.asarray(y_ngt)\n",
+ "\n",
+ "indices = np.arange(X_ngt.shape[0])\n",
+ "\n",
+ "np.random.shuffle(indices)\n",
+ "X_ngt = X_ngt[indices]\n",
+ "y_ngt = y_ngt[indices]\n",
+ "\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X_ngt, y_ngt, test_size=0.2, shuffle=True)"
+ ]
+ },
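+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch of how a new sentence maps to a padded sequence with the tokenizer fitted above;\n",
+ "# the sentence is only an illustrative example, words unseen by fit_on_texts simply get no index.\n",
+ "demo = tokenizer.texts_to_sequences([preprocess_text('Una frase di esempio')])\n",
+ "print(pad_sequences(demo, maxlen=maxlen))"
+ ]
+ },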
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "sentence = input(\"Enter the sentence: \")\n",
+ "\n",
+ "preprocess_text(sentence)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "df_train = pd.read_csv('recensioni_train.csv')\n",
+ "df_test = pd.read_csv('recensioni_test.csv')\n",
+ "\n",
+ "X_train = df_train['text'].apply(preprocess_text)\n",
+ "X_test = df_test['text'].apply(preprocess_text)\n",
+ "\n",
+ "tags_train = df_train['tag']\n",
+ "tags_test = df_test['tag']\n",
+ "\n",
+ "y_train = []\n",
+ "y_test = []\n",
+ "\n",
+ "# Train: map 'pos' to 1 and everything else to 0\n",
+ "for e in tags_train:\n",
+ "    if e=='pos':\n",
+ "        y_train.append(1)\n",
+ "    else:\n",
+ "        y_train.append(0)\n",
+ "\n",
+ "# Test: same mapping for the test labels\n",
+ "for e in tags_test:\n",
+ "    if e=='pos':\n",
+ "        y_test.append(1)\n",
+ "    else:\n",
+ "        y_test.append(0)\n",
+ "\n"
+ ]
+ },
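+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Equivalent, more compact pandas sketch for the same 'pos'/'neg' to 1/0 mapping;\n",
+ "# the loops above already produce the same labels, this is only an alternative.\n",
+ "y_train = (tags_train == 'pos').astype(int).tolist()\n",
+ "y_test = (tags_test == 'pos').astype(int).tolist()"
+ ]
+ },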
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#######################\n",
+ "\n",
+ "tokenizer_train = Tokenizer(num_words=10000)\n",
+ "tokenizer_train.fit_on_texts(X_train)\n",
+ "sequences_train = tokenizer_train.texts_to_sequences(X_train)\n",
+ "word_index_train = tokenizer_train.word_index\n",
+ "print('Found %s unique tokens' % len(word_index_train))\n",
+ "\n",
+ "print(X_train[0])\n",
+ "print(y_train[0])\n",
+ "\n",
+ "#######################\n",
+ "\n",
+ "tokenizer_test = Tokenizer(num_words=10000)\n",
+ "tokenizer_test.fit_on_texts(X_test)\n",
+ "sequences_test = tokenizer_test.texts_to_sequences(X_test)\n",
+ "word_index_test = tokenizer_test.word_index\n",
+ "print('Found %s unique tokens' % len(word_index_test))\n",
+ "\n",
+ "print(X_test[0])\n",
+ "print(y_test[0])"
+ ]
+ },
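+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Alternative sketch: fit a single tokenizer on the training texts and reuse it on the test texts,\n",
+ "# so both splits share one word index (the two separate tokenizers above assign different indices\n",
+ "# to the same words). The *_shared names are illustrative only.\n",
+ "shared_tokenizer = Tokenizer(num_words=10000)\n",
+ "shared_tokenizer.fit_on_texts(X_train)\n",
+ "sequences_train_shared = shared_tokenizer.texts_to_sequences(X_train)\n",
+ "sequences_test_shared = shared_tokenizer.texts_to_sequences(X_test)"
+ ]
+ },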
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# NGT dataset\n",
+ "\n",
+ "tokenizer_ngt = Tokenizer(num_words=10000)\n",
+ "tokenizer_ngt.fit_on_texts(X_ngt)\n",
+ "sequences_ngt = tokenizer_ngt.texts_to_sequences(X_ngt)\n",
+ "word_index_ngt = tokenizer_ngt.word_index\n",
+ "print('Found %s unique tokens' % len(word_index_ngt))\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# NGT dataset\n",
+ "\n",
+ "X_ngt = pad_sequences(sequences_ngt, maxlen=maxlen)\n",
+ "y_ngt = np.asarray(y_ngt)\n",
+ "indices_ngt = np.arange(X_ngt.shape[0])\n",
+ "\n",
+ "\n",
+ "np.random.shuffle(indices_ngt)\n",
+ "X_ngt = X_ngt[indices_ngt]\n",
+ "y_ngt = y_ngt[indices_ngt]\n",
+ "\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X_ngt, y_ngt, test_size=0.2)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(X_train[0])\n",
+ "print(y_train[0])\n",
+ "\n",
+ "print(X_test[0])\n",
+ "print(y_test[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Pad train and test to the same fixed length so the model sees a consistent input shape\n",
+ "X_train = pad_sequences(sequences_train, maxlen=maxlen)\n",
+ "y_train = np.asarray(y_train)\n",
+ "indices_train = np.arange(X_train.shape[0])\n",
+ "\n",
+ "\n",
+ "X_test = pad_sequences(sequences_test, maxlen=maxlen)\n",
+ "y_test = np.asarray(y_test)\n",
+ "indices_test = np.arange(X_test.shape[0])\n",
+ "\n",
+ "print(indices_train)\n",
+ "print(X_train[0])\n",
+ "print(y_train[0])\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "np.random.shuffle(indices_train)\n",
+ "X_train = X_train[indices_train]\n",
+ "y_train = y_train[indices_train]\n",
+ "\n",
+ "\n",
+ "np.random.shuffle(indices_test)\n",
+ "X_test = X_test[indices_test]\n",
+ "y_test = y_test[indices_test]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(X_train.shape)\n",
+ "\n",
+ "print(X_train.dtype)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from keras.models import Sequential\n",
+ "from keras.layers import Dense\n",
+ "\n",
+ "model = Sequential()\n",
+ "\n",
+ "model.add(Dense(512, activation='relu'))\n",
+ "model.add(Dense(32, activation='relu'))\n",
+ "model.add(Dense(1, activation='sigmoid'))\n",
+ "\n",
+ "#model.compile(optimizer='SGD', loss='binary_crossentropy', metrics=['acc'])\n",
+ "#model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])\n",
+ "model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])\n",
+ "\n",
+ "history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))"
+ ]
+ },
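+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Quick sketch: evaluate the trained model on the held-out split used above as validation data.\n",
+ "test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)\n",
+ "print('Test loss:', test_loss)\n",
+ "print('Test accuracy:', test_acc)"
+ ]
+ },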
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.summary()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Plot the training results\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "acc = history.history['acc']\n",
+ "val_acc = history.history['val_acc']\n",
+ "loss = history.history['loss']\n",
+ "val_loss = history.history['val_loss']\n",
+ "\n",
+ "epochs = range(1, len(acc) + 1)\n",
+ "\n",
+ "plt.plot(epochs, acc, 'bo', label='Training acc')\n",
+ "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n",
+ "plt.title('Training and validation accuracy')\n",
+ "plt.legend()\n",
+ "\n",
+ "plt.figure()\n",
+ "\n",
+ "plt.plot(epochs, loss, 'bo', label='Training loss')\n",
+ "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n",
+ "plt.title('Training and validation loss')\n",
+ "plt.legend()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "# Save the model\n",
+ "\n",
+ "model.save('binary.keras')\n",
+ "\n"
+ ]
+ },
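+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: reload the file saved above to check that it round-trips correctly.\n",
+ "from keras.models import load_model\n",
+ "reloaded = load_model('binary.keras')\n",
+ "reloaded.summary()"
+ ]
+ },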
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# NGT dataset\n",
+ "# model.add(Dense(512, activation='relu'))\n",
+ "# model.add(Dense(8, activation='relu'))\n",
+ "# model.add(Dense(1, activation='sigmoid'))\n",
+ "\n",
+ "\n",
+ "# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])\n",
+ "\n",
+ "# history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))\n",
+ "\n",
+ "\n",
+ "# Epoch 10/10\n",
+ "# 100/100 [==============================] - 0s 3ms/step - loss: 0.6099 - acc: 0.6712 - val_loss: 0.6311 - val_acc: 0.6525\n",
+ "\n",
+ "\n",
+ "################################################\n",
+ "\n",
+ "\n",
+ "# Other dataset\n",
+ "# model.add(Dense(512, activation='relu'))\n",
+ "# model.add(Dense(32, activation='relu'))\n",
+ "# model.add(Dense(1, activation='sigmoid'))\n",
+ "\n",
+ "# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])\n",
+ "\n",
+ "# history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))\n",
+ "\n",
+ "# Epoch 5/5\n",
+ "# 63/63 [==============================] - 0s 3ms/step - loss: 0.5344 - acc: 0.7185 - val_loss: 0.5255 - val_acc: 0.7525"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1/1 [==============================] - 0s 51ms/step\n",
+ "NEGATIVO 58 %\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Test\n",
+ "\n",
+ "# Load the saved model\n",
+ "from keras.models import load_model\n",
+ "from keras.preprocessing.sequence import pad_sequences\n",
+ "from keras.preprocessing.text import Tokenizer\n",
+ "\n",
+ "loaded_model = load_model('sentiment_dfngt.keras')\n",
+ "\n",
+ "sentence = input(\"Enter the sentence: \")\n",
+ "sequence = preprocess_text(sentence)\n",
+ "# Reuse the tokenizer fitted on the NGT texts; a fresh Tokenizer() here would have an empty vocabulary\n",
+ "sequence = tokenizer.texts_to_sequences([sequence])\n",
+ "test = pad_sequences(sequence, maxlen=100)\n",
+ "yhat = loaded_model.predict(test)\n",
+ "\n",
+ "score = float(yhat[0][0])\n",
+ "threshold = 0.5\n",
+ "\n",
+ "if score > threshold:\n",
+ "    print('POSITIVO', int(score*100), '%')\n",
+ "else:\n",
+ "    print('NEGATIVO', int((1-score)*100), '%')"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.5"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }