Hossein-NK committed on
Commit
b08551c
1 Parent(s): 950d5e7

Upload Tweet_Financial_News_Classification.ipynb

Tweet_Financial_News_Classification.ipynb ADDED
@@ -0,0 +1,549 @@
+ {
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ },
+ "accelerator": "TPU"
+ },
+ "cells": [
+ {
+ "cell_type": "code",
+ "source": [
+ "import warnings\n",
+ "warnings.filterwarnings('ignore')\n",
+ "\n",
+ "import transformers\n",
+ "from packaging import version\n",
+ "\n",
+ "transformers_version = transformers.__version__\n",
+ "\n",
+ "# Compare versions numerically; plain string comparison misorders e.g. '4.100' vs '4.31'\n",
+ "if version.parse(transformers_version) > version.parse('4.31.1'):\n",
+ "    !pip uninstall -y transformers\n",
+ "    !pip install transformers==4.31\n",
+ "else:\n",
+ "    print(\"transformers version:\", transformers.__version__)"
+ ],
+ "metadata": {
+ "id": "2RcFPIqQJ6CY",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "8030dedf-b9f5-4687-ef87-1c5a4d8ee9b9"
+ },
+ "execution_count": 1,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Found existing installation: transformers 4.31.0\n",
+ "Uninstalling transformers-4.31.0:\n",
+ " Would remove:\n",
+ " /usr/local/bin/transformers-cli\n",
+ " /usr/local/lib/python3.10/dist-packages/transformers-4.31.0.dist-info/*\n",
+ " /usr/local/lib/python3.10/dist-packages/transformers/*\n",
+ "Proceed (Y/n)? n\n",
+ "\u001b[33mWARNING: Ignoring invalid distribution -ransformers (/usr/local/lib/python3.10/dist-packages)\u001b[0m\u001b[33m\n",
+ "\u001b[0mRequirement already satisfied: transformers==4.31 in /usr/local/lib/python3.10/dist-packages (4.31.0)\n",
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (3.13.4)\n",
+ "Requirement already satisfied: huggingface-hub<1.0,>=0.14.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (0.20.3)\n",
+ "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (1.25.2)\n",
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (24.0)\n",
+ "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (6.0.1)\n",
+ "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (2023.12.25)\n",
+ "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (2.31.0)\n",
+ "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (0.13.3)\n",
+ "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (0.4.3)\n",
+ "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers==4.31) (4.66.2)\n",
+ "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers==4.31) (2023.6.0)\n",
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers==4.31) (4.11.0)\n",
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.31) (3.3.2)\n",
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.31) (3.7)\n",
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.31) (2.0.7)\n",
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.31) (2024.2.2)\n",
+ "\u001b[33mWARNING: Ignoring invalid distribution -ransformers (/usr/local/lib/python3.10/dist-packages)\u001b[0m\u001b[33m\n",
+ "\u001b[0m"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import tensorflow as tf\n",
+ "print(\"TensorFlow version:\", tf.__version__)\n",
+ "\n",
+ "import keras\n",
+ "print(\"Keras version:\", keras.__version__)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "b_0OPx3WukSi",
+ "outputId": "0d205aa3-33b4-4a34-9055-d670cc5ac049"
+ },
+ "execution_count": 2,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "TensorFlow version: 2.15.0\n",
+ "Keras version: 2.15.0\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "id": "WkzyTQGqzbPS",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "9bc0c671-8557-4b3c-a120-0237d7f96253"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Mounted at /content/drive\n"
+ ]
+ }
+ ],
+ "source": [
+ "from google.colab import drive\n",
+ "drive.mount('/content/drive')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Loading the Data ###"
+ ],
+ "metadata": {
+ "id": "BKn5EaROLKeX"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "# Load the CSV files into memory\n",
+ "train_path = '/content/drive/MyDrive/dataset/Twitter_Financial_News_Sentiment/train.csv'\n",
+ "test_path = '/content/drive/MyDrive/dataset/Twitter_Financial_News_Sentiment/test.csv'\n",
+ "\n",
+ "train_df = pd.read_csv(train_path, usecols=['text', 'label'])\n",
+ "test_df = pd.read_csv(test_path, usecols=['text', 'label'])"
+ ],
+ "metadata": {
+ "id": "QztIz9VOKLuV"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Show a few training examples"
+ ],
+ "metadata": {
+ "id": "hn5ONAwkNeFS"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "train_df.head()"
+ ],
+ "metadata": {
+ "id": "zwYzU-dANpJ-"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Import the matplotlib library\n",
+ "from matplotlib import pyplot as plt\n",
+ "\n",
+ "# Histogram of the \"label\" column in the training dataset\n",
+ "train_df['label'].plot(kind='hist', title='Label')\n",
+ "plt.gca().spines[['top', 'right']].set_visible(False)"
+ ],
+ "metadata": {
+ "id": "2M1XLsAeN2GN"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "test_df.head()"
+ ],
+ "metadata": {
+ "id": "g5_oGvo1NvON"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Print the shape of the datasets\n",
+ "print(f'train_df shape: {train_df.shape}')\n",
+ "print(f'test_df shape: {test_df.shape}')"
+ ],
+ "metadata": {
+ "id": "kCFupI1FQlMF"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Removing the Special Characters ###"
+ ],
+ "metadata": {
+ "id": "zRcmc15aSNx6"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!pip install text_hammer\n",
+ "\n",
+ "import text_hammer as th\n",
+ "\n",
+ "def text_processing(df, col_name):\n",
+ "    \"\"\"\n",
+ "    Process text data in a DataFrame column by performing the following operations:\n",
+ "\n",
+ "    1. Remove emails from the text.\n",
+ "    2. Remove accented characters from the text.\n",
+ "    3. Remove URLs from the text.\n",
+ "\n",
+ "    Parameters:\n",
+ "    df (DataFrame): Input DataFrame containing text data.\n",
+ "    col_name (str): Name of the column in the DataFrame containing text data.\n",
+ "\n",
+ "    Returns:\n",
+ "    DataFrame: Processed DataFrame with text data after applying the specified operations.\n",
+ "    \"\"\"\n",
+ "\n",
+ "    # Lowercasing is left out: the uncased BERT tokenizer lowercases its input anyway\n",
+ "    # df[col_name] = df[col_name].apply(lambda x: str(x).lower())\n",
+ "    df[col_name] = df[col_name].apply(lambda x: th.remove_emails(x))\n",
+ "    df[col_name] = df[col_name].apply(lambda x: th.remove_accented_chars(x))\n",
+ "    df[col_name] = df[col_name].apply(lambda x: th.remove_urls(x))\n",
+ "\n",
+ "    return df\n",
+ "\n",
+ "train_df = text_processing(train_df, 'text')\n",
+ "# Apply the same cleaning to the test set\n",
+ "test_df = text_processing(test_df, 'text')\n"
+ ],
+ "metadata": {
+ "id": "YEMq7SUiS28e"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Print the first ten samples after cleaning the data\n",
+ "train_df['text'].iloc[0:10]"
+ ],
+ "metadata": {
+ "id": "VD92IEhPZQHm"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Loading the Pre-Trained BERT Model ###"
+ ],
+ "metadata": {
+ "id": "YfH0H1W6c0Bb"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from transformers import AutoTokenizer, TFBertModel\n",
+ "tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n",
+ "bert = TFBertModel.from_pretrained('bert-base-uncased')\n"
+ ],
+ "metadata": {
+ "id": "ejMMzCOecze9"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Tokenize a single tweet to inspect the tokenizer output\n",
+ "tokenizer(train_df['text'].iloc[0])"
+ ],
+ "metadata": {
+ "id": "PVWkIfE5gLOV"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Longest tweet in the training set, counted in whitespace-separated words\n",
+ "max_len = max([len(x.split()) for x in train_df.text])\n",
+ "print(f'Max length of tweets: {max_len} words')"
+ ],
+ "metadata": {
+ "id": "dGANUQVdhHH7"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
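+ {
+ "cell_type": "markdown",
+ "source": [
+ "A quick sanity check for the `max_length=36` used when tokenizing below (a minimal sketch, assuming `tokenizer` and `train_df` from the cells above): if nearly all tweets fit within 36 BERT tokens, truncation loses very little text."
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import numpy as np\n",
+ "\n",
+ "# Number of BERT tokens per tweet, special tokens ([CLS], [SEP]) included\n",
+ "token_lens = [len(tokenizer.encode(t)) for t in train_df.text]\n",
+ "\n",
+ "# If these percentiles are at or below 36, the chosen max_length is safe\n",
+ "print('95th percentile:', np.percentile(token_lens, 95))\n",
+ "print('99th percentile:', np.percentile(token_lens, 99))"
+ ],
+ "metadata": {},
+ "execution_count": null,
+ "outputs": []
+ },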
+ {
+ "cell_type": "code",
+ "source": [
+ "x_train = tokenizer(\n",
+ "    text=train_df.text.tolist(),\n",
+ "    padding=True,\n",
+ "    max_length=36,\n",
+ "    truncation=True,\n",
+ "    return_tensors='tf')\n",
+ "\n",
+ "print(x_train)"
+ ],
+ "metadata": {
+ "id": "q9b4iDZ0jW5-"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(x_train['input_ids'].shape)\n",
+ "print(x_train['attention_mask'].shape)"
+ ],
+ "metadata": {
+ "id": "PUMeXfO8lgNd"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(train_df.label.value_counts())"
+ ],
+ "metadata": {
+ "id": "RMM1QI3DlpmD"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "y_train = train_df.label.values\n",
+ "y_train\n"
+ ],
+ "metadata": {
+ "id": "4zFkagLml80z"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Building the Model Architecture ###"
+ ],
+ "metadata": {
+ "id": "fFQNe5Cimwxn"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from keras import layers\n",
+ "\n",
+ "max_length = 36\n",
+ "\n",
+ "input_ids = layers.Input(shape=(max_length,), dtype=tf.int32, name=\"input_ids\")\n",
+ "input_mask = layers.Input(shape=(max_length,), dtype=tf.int32, name=\"attention_mask\")\n",
+ "\n",
+ "# Index 1 of the BERT output is the pooler_output; index 0 is the last hidden state\n",
+ "embeddings = bert(input_ids, attention_mask=input_mask)[1]\n",
+ "\n",
+ "out = layers.Dropout(0.1)(embeddings)\n",
+ "out = layers.Dense(128, activation='relu')(out)\n",
+ "out = layers.Dropout(0.1)(out)\n",
+ "out = layers.Dense(32, activation='relu')(out)\n",
+ "\n",
+ "# Three-way softmax over the sentiment classes\n",
+ "y = layers.Dense(3, activation='softmax')(out)\n",
+ "\n",
+ "model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)\n",
+ "\n",
+ "# Freeze the pre-trained BERT layer so only the classification head is trained\n",
+ "model.layers[2].trainable = False"
+ ],
+ "metadata": {
+ "id": "DE1XbnVomwMc"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "model.summary()"
+ ],
+ "metadata": {
+ "id": "GuxGCjYjrTyY"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from keras.optimizers import Adam\n",
+ "\n",
+ "optimizer = Adam(\n",
+ "    learning_rate=6e-06,  # learning rate recommended for BERT, taken from the Hugging Face website\n",
+ "    epsilon=1e-08,\n",
+ "    weight_decay=0.01)\n",
+ "\n",
+ "# Compile the model\n",
+ "model.compile(\n",
+ "    optimizer=optimizer,\n",
+ "    loss='sparse_categorical_crossentropy',\n",
+ "    metrics=[\"sparse_categorical_accuracy\"])"
+ ],
+ "metadata": {
+ "id": "FyyNrAAf7QMP"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "train_history = model.fit(\n",
+ "    x={'input_ids': x_train['input_ids'], 'attention_mask': x_train['attention_mask']},\n",
+ "    y=y_train,\n",
+ "    validation_split=0.1,\n",
+ "    epochs=3,\n",
+ "    batch_size=32)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "bEnttT2rA8Yw",
+ "outputId": "644c03fd-0cc0-40ff-8108-e059e3a4a0dd"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Epoch 1/3\n",
+ "118/269 [============>.................] - ETA: 10:10 - loss: 0.9140 - sparse_categorical_accuracy: 0.6261"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "#### TESTING PHASE\n",
+ "In this phase we make predictions on the test set with the trained model."
+ ],
+ "metadata": {
+ "id": "hgiDVRwSBtCN"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Tokenize the test set the same way as the training set\n",
+ "x_test = tokenizer(\n",
+ "    text=test_df.text.tolist(),\n",
+ "    padding=True,\n",
+ "    max_length=36,\n",
+ "    truncation=True,\n",
+ "    return_tensors='tf')"
+ ],
+ "metadata": {
+ "id": "xaKYd2PRBySe"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "y_test = test_df.label.values\n",
+ "y_test"
+ ],
+ "metadata": {
+ "id": "OpvHTg3atflb"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Predict class probabilities for the test set\n",
+ "predicted = model.predict({'input_ids': x_test['input_ids'], 'attention_mask': x_test['attention_mask']})"
+ ],
+ "metadata": {
+ "id": "nWgCdpKvCSWm"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.metrics import confusion_matrix\n",
+ "import seaborn as sns\n",
+ "import numpy as np\n",
+ "\n",
+ "# Convert the softmax probabilities to predicted class labels\n",
+ "# (the model has three classes, so rounding a single probability would be wrong)\n",
+ "y_pred = np.argmax(predicted, axis=1)\n",
+ "\n",
+ "# Generate the confusion matrix\n",
+ "cm = confusion_matrix(test_df['label'], y_pred)\n",
+ "\n",
+ "# Create a heatmap of the confusion matrix\n",
+ "sns.heatmap(cm, annot=True, fmt=\"d\", cmap=\"Blues\")\n",
+ "plt.xlabel(\"Predicted Label\")\n",
+ "plt.ylabel(\"True Label\")\n",
+ "plt.title(\"Confusion Matrix\")\n",
+ "plt.show()"
+ ],
+ "metadata": {
+ "id": "-BICUoNs_8qI"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
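+ {
+ "cell_type": "markdown",
+ "source": [
+ "As a final check, a minimal evaluation sketch: assuming `predicted` and `test_df` from the cells above, it reports overall accuracy and per-class precision/recall/F1 via scikit-learn's `classification_report`."
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.metrics import accuracy_score, classification_report\n",
+ "import numpy as np\n",
+ "\n",
+ "# Collapse softmax probabilities to class labels, as in the confusion-matrix cell\n",
+ "y_pred = np.argmax(predicted, axis=1)\n",
+ "\n",
+ "# Overall accuracy plus per-class precision, recall and F1\n",
+ "print('Accuracy:', accuracy_score(test_df['label'], y_pred))\n",
+ "print(classification_report(test_df['label'], y_pred))"
+ ],
+ "metadata": {},
+ "execution_count": null,
+ "outputs": []
+ }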
+ ]
+ }