girolamodiceglie committed
Commit bc21e8a • 1 Parent(s): e7c153b

update

Browse files
- sentiment.ipynb +5 -224

sentiment.ipynb CHANGED
@@ -2,7 +2,7 @@
  "cells": [
  {
  "cell_type": "code",
- "execution_count":
+ "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -26,16 +26,6 @@
  "    return processed_text.translate(str.maketrans('','', string.punctuation))"
  ]
  },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import nltk\n",
- "nltk.download()"
- ]
- },
  {
  "cell_type": "code",
  "execution_count": null,
@@ -97,171 +87,13 @@
  "metadata": {},
  "outputs": [],
  "source": [
- "\n",
- "sentence = input(\"Enter the sentence: \")\n",
- "\n",
- "preprocess_text(sentence)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "\n",
- "df_train = pd.read_csv('recensioni_train.csv')\n",
- "df_test = pd.read_csv('recensioni_test.csv')\n",
- "\n",
- "X_train = df_train['text'].apply(preprocess_text)\n",
- "X_test = df_test['text'].apply(preprocess_text)\n",
- "\n",
- "tags_train = df_train['tag']\n",
- "tags_test = df_test['tag']\n",
- "\n",
- "y_train = []\n",
- "y_test = []\n",
- "\n",
- "#Train\n",
- "for e in tags_train:\n",
- "    if e=='pos':\n",
- "        y_train.append(1)\n",
- "    else:\n",
- "        y_train.append(0)\n",
- "\n",
- "#Test\n",
- "for e in tags_test:\n",
- "    if e=='pos':\n",
- "        y_test.append(1)\n",
- "    else:\n",
- "        y_test.append(0)\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#######################\n",
- "\n",
- "tokenizer_train = Tokenizer(num_words=10000)\n",
- "tokenizer_train.fit_on_texts(X_train)\n",
- "sequences_train = tokenizer_train.texts_to_sequences(X_train)\n",
- "word_index_train = tokenizer_train.word_index\n",
- "print('Found %s unique tokens' % len(word_index_train))\n",
- "\n",
  "print(X_train[0])\n",
  "print(y_train[0])\n",
  "\n",
- "#######################\n",
- "\n",
- "tokenizer_test = Tokenizer(num_words=10000)\n",
- "tokenizer_test.fit_on_texts(X_test)\n",
- "sequences_test = tokenizer_test.texts_to_sequences(X_test)\n",
- "word_index_test = tokenizer_test.word_index\n",
- "print('Found %s unique tokens' % len(word_index_test))\n",
- "\n",
  "print(X_test[0])\n",
  "print(y_test[0])"
  ]
  },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#Dataset NGT\n",
- "\n",
- "tokenizer_ngt = Tokenizer(num_words=10000)\n",
- "tokenizer_ngt.fit_on_texts(X_ngt)\n",
- "sequences_ngt = tokenizer_ngt.texts_to_sequences(X_ngt)\n",
- "word_index_ngt = tokenizer_ngt.word_index\n",
- "print('Found %s unique tokens' % len(word_index_ngt))\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#Dataset NGT\n",
- "\n",
- "X_ngt = pad_sequences(sequences_ngt)\n",
- "y_ngt = np.asarray(y_ngt)\n",
- "indices_ngt = np.arange(X_ngt.shape[0])\n",
- "\n",
- "\n",
- "np.random.shuffle(indices_ngt)\n",
- "X_ngt = X_ngt[indices_ngt]\n",
- "y_ngt = y_ngt[indices_ngt]\n",
- "\n",
- "X_train, X_test, y_train, y_test = train_test_split(X_ngt, y_ngt, test_size=0.2)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "print(X_train[0])\n",
- "print(y_train[0])\n",
- "\n",
- "print(X_test[0])\n",
- "print(y_test[0])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "X_train = pad_sequences(sequences_train)\n",
- "y_train = np.asarray(y_train)\n",
- "indices_train = np.arange(X_train.shape[0])\n",
- "\n",
- "\n",
- "X_test = pad_sequences(sequences_test)\n",
- "y_test = np.asarray(y_test)\n",
- "indices_test = np.arange(X_test.shape[0])\n",
- "\n",
- "print(indices_train)\n",
- "print(X_train[0])\n",
- "print(y_train[0])\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "np.random.shuffle(indices_train)\n",
- "X_train = X_train[indices_train]\n",
- "y_train = y_train[indices_train]\n",
- "\n",
- "\n",
- "np.random.shuffle(indices_test)\n",
- "X_test = X_train[indices_test]\n",
- "y_test = y_train[indices_test]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "X_train.shape\n",
- "\n",
- "print(X_train.dtype)\n"
- ]
- },
  {
  "cell_type": "code",
  "execution_count": null,
@@ -277,8 +109,6 @@
  "model.add(Dense(32, activation='relu'))\n",
  "model.add(Dense(1, activation='sigmoid'))\n",
  "\n",
- "#model.compile(optimizer='SGD', loss='binary_crossentropy', metrics=['acc'])\n",
- "#model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])\n",
  "model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])\n",
  "\n",
  "history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))"
@@ -331,9 +161,7 @@
  "source": [
  "\n",
  "#Salvataggio del modello\n",
- "
- "model.save('binary.keras')\n",
- "\n"
+ "model.save('model.keras')"
  ]
  },
  {
@@ -341,62 +169,15 @@
  "execution_count": null,
  "metadata": {},
  "outputs": [],
- "source": [
- "# Dataset ngt\n",
- "# model.add(Dense(512, activation='relu'))\n",
- "# model.add(Dense(8, activation='relu'))\n",
- "# model.add(Dense(1, activation='sigmoid'))\n",
- "\n",
- "\n",
- "# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])\n",
- "\n",
- "# history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))\n",
- "\n",
- "\n",
- "# Epoch 10/10\n",
- "# 100/100 [==============================] - 0s 3ms/step - loss: 0.6099 - acc: 0.6712 - val_loss: 0.6311 - val_acc: 0.6525\n",
- "\n",
- "\n",
- "################################################\n",
- "\n",
- "\n",
- "# Altro dataset\n",
- "# model.add(Dense(512, activation='relu'))\n",
- "# model.add(Dense(32, activation='relu'))\n",
- "# model.add(Dense(1, activation='sigmoid'))\n",
- "\n",
- "# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])\n",
- "\n",
- "# history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))\n",
- "\n",
- "# Epoch 5/5\n",
- "# 63/63 [==============================] - 0s 3ms/step - loss: 0.5344 - acc: 0.7185 - val_loss: 0.5255 - val_acc: 0.7525"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "1/1 [==============================] - 0s 51ms/step\n",
- "NEGATIVO 58 %\n"
- ]
- }
- ],
  "source": [
  "#Test\n",
  "\n",
- "#
+ "#Load model\n",
  "from keras.models import load_model\n",
  "from keras.preprocessing.sequence import pad_sequences\n",
  "from keras.preprocessing.text import Tokenizer\n",
- "from keras.preprocessing.text import Tokenizer\n",
  "\n",
- "loaded_model = load_model('
+ "loaded_model = load_model('model.keras')\n",
  "\n",
  "sentence = input(\"Enter the sentence: \")\n",
  "sequence = preprocess_text(sentence)\n",
@@ -429,7 +210,7 @@
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
- "version": "
+ "version": "2.7.18"
  },
  "orig_nbformat": 4
  },
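For orientation, the inference cell this commit tidies up follows the flow sketched below. This is a minimal sketch under assumptions, not the notebook's exact code: preprocess_text, the Tokenizer settings, and 'model.keras' come from the diff above, while the stand-in corpus, the maxlen handling via input_shape, and the POSITIVO/NEGATIVO confidence formatting are illustrative guesses (the diff only shows the removed output "NEGATIVO 58 %").

import string

from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer


def preprocess_text(text):
    # Lowercase and strip punctuation, mirroring the notebook's helper.
    return text.lower().translate(str.maketrans('', '', string.punctuation))


# Hypothetical stand-in corpus; the notebook fits its Tokenizer on X_train.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(["great movie loved it", "terrible plot bad acting"])

loaded_model = load_model('model.keras')  # the file saved by the cell above

sentence = preprocess_text(input("Enter the sentence: "))
sequence = tokenizer.texts_to_sequences([sentence])
# Pad to the input width the model was trained with (assumed recoverable
# from input_shape; the notebook's actual maxlen is not shown in the diff).
padded = pad_sequences(sequence, maxlen=loaded_model.input_shape[1])

prob = float(loaded_model.predict(padded)[0][0])  # sigmoid output in [0, 1]
label = "POSITIVO" if prob >= 0.5 else "NEGATIVO"
confidence = prob if prob >= 0.5 else 1.0 - prob
print(label, round(confidence * 100), "%")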