MarieAngeA13 committed on
Commit
21605a9
·
1 Parent(s): 6550819

Upload Sentiment_analysis_with_bert.py

Browse files
Files changed (1) hide show
  1. Sentiment_analysis_with_bert.py +523 -0
Sentiment_analysis_with_bert.py ADDED
@@ -0,0 +1,523 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ !pip install -q -U watermark
2
+
3
+ !pip install -qq transformers
4
+
5
+
6
+ import transformers
7
+ from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
8
+ import torch
9
+
10
+ import numpy as np
11
+ import pandas as pd
12
+ import seaborn as sns
13
+ from pylab import rcParams
14
+ import matplotlib.pyplot as plt
15
+ from matplotlib import rc
16
+ from sklearn.model_selection import train_test_split
17
+ from sklearn.metrics import confusion_matrix, classification_report
18
+ from collections import defaultdict
19
+ from textwrap import wrap
20
+
21
+ from torch import nn, optim
22
+ from torch.utils.data import Dataset, DataLoader
23
+ import torch.nn.functional as F
24
+
25
+
26
+
27
+ sns.set(style='whitegrid', palette='muted', font_scale=1.2)
28
+
29
+ HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
30
+
31
+ sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
32
+
33
+ rcParams['figure.figsize'] = 12, 8
34
+
35
+ RANDOM_SEED = 42
36
+ np.random.seed(RANDOM_SEED)
37
+ torch.manual_seed(RANDOM_SEED)
38
+
39
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
40
+
41
+
42
+ !gdown --id 1S6qMioqPJjyBLpLVz4gmRTnJHnjitnuV
43
+ !gdown --id 1zdmewp7ayS4js4VtrJEHzAheSW-5NBZv
44
+
45
+ df = pd.read_csv("reviews.csv")
46
+
47
+
48
+ sns.countplot(x='score', data = df)
49
+ plt.xlabel('review score');
50
+
51
def to_sentiment(rating):
    """Map a review score to a sentiment class index.

    ``rating`` is coerced with ``int()`` first. Scores of 2 or lower map to
    0, a score of 3 maps to 1, and anything higher maps to 2.
    """
    score = int(rating)
    if score > 3:
        return 2
    if score == 3:
        return 1
    return 0
59
+
60
# Class labels, indexed by the integer that to_sentiment() returns.
class_names = ['negative', 'neutral', 'positive']

# Derive the sentiment class of every review from its score.
df['sentiment'] = df['score'].apply(to_sentiment)
print(df['sentiment'])

# Distribution of the derived sentiment classes.
ax = sns.countplot(data=df, x='sentiment')
plt.xlabel('review sentiment')
ax.set_xticklabels(class_names)
69
+
70
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Walk through the tokenizer on one sample sentence.
sample_txt = 'When was I last outside? I am stuck at home for 2 weeks.'

tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print(f' Sentence: {sample_txt}')
print(f' Tokens: {tokens}')
print(f'Token IDs: {token_ids}')

# The bare expressions below only display a value when run as notebook
# cells; they have no visible effect when the file runs as a script.
tokenizer.sep_token, tokenizer.sep_token_id

tokenizer.cls_token, tokenizer.cls_token_id

tokenizer.pad_token, tokenizer.pad_token_id

tokenizer.unk_token, tokenizer.unk_token_id

encoding = tokenizer.encode_plus(
    sample_txt,
    max_length=32,
    add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
    return_token_type_ids=False,
    # `pad_to_max_length=True` is deprecated; request padding and
    # truncation to `max_length` explicitly instead.
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',  # Return PyTorch tensors
)

encoding.keys()

print(len(encoding['input_ids'][0]))
encoding['input_ids'][0]

print(len(encoding['attention_mask'][0]))
encoding['attention_mask']

tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])
110
+
111
# Token count of every review, capped at 512 tokens per text.
# `truncation=True` is passed explicitly because a bare `max_length` relies
# on the tokenizer's implicit fallback, which warns on every call.
# `str()` mirrors GPReviewDataset, which also coerces each review to a string.
token_lens = [
    len(tokenizer.encode(str(txt), max_length=512, truncation=True))
    for txt in df.content
]

# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases;
# kept as-is so the rendered figure does not change.
sns.distplot(token_lens)
plt.xlim([0, 256])
plt.xlabel('Token count')

# Maximum sequence length used for the encodings built further down.
MAX_LEN = 160
122
+
123
class GPReviewDataset(Dataset):
    """Torch ``Dataset`` that tokenizes one review per item.

    Args:
        reviews: Indexable sequence of review texts.
        targets: Indexable sequence of class labels, aligned with ``reviews``.
        tokenizer: Object exposing a Hugging Face style ``encode_plus`` method.
        max_len: Length every encoded review is padded or truncated to.
    """

    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Return the encoded review at index ``item`` together with its label.

        Returns:
            dict with keys ``review_text`` (str), ``input_ids`` and
            ``attention_mask`` (flattened tensors from the tokenizer) and
            ``targets`` (long tensor holding the label).
        """
        review = str(self.reviews[item])
        target = self.targets[item]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            # `pad_to_max_length=True` is deprecated; pad and truncate to
            # `max_length` explicitly so every item has the same length.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            # The tokenizer returns a leading batch axis; drop it per item.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long),
        }
154
+
155
# Hold out 10% of the rows, then split that hold-out in half into the
# validation and test sets. Both splits use the same random seed.
df_train, df_holdout = train_test_split(
    df,
    test_size=0.1,
    random_state=RANDOM_SEED,
)
df_val, df_test = train_test_split(
    df_holdout,
    test_size=0.5,
    random_state=RANDOM_SEED,
)

df_train.shape, df_val.shape, df_test.shape
159
+
160
def create_data_loader(df, tokenizer, max_len, batch_size,
                       shuffle=False, num_workers=4):
    """Build a ``DataLoader`` over the reviews contained in ``df``.

    Args:
        df: DataFrame with a ``content`` column (review texts) and a
            ``sentiment`` column (class labels).
        tokenizer: Tokenizer handed to ``GPReviewDataset``.
        max_len: Sequence length handed to ``GPReviewDataset``.
        batch_size: Number of items per batch.
        shuffle: Forwarded to ``DataLoader``. Defaults to ``False``, which
            is what this function always did before the parameter existed.
        num_workers: Forwarded to ``DataLoader``. Defaults to 4, the value
            that used to be hard-coded.

    Returns:
        A ``torch.utils.data.DataLoader`` yielding dict batches.
    """
    ds = GPReviewDataset(
        reviews=df.content.to_numpy(),
        targets=df.sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )
173
+
174
BATCH_SIZE = 16

# One loader per split, created in train / validation / test order.
train_data_loader, val_data_loader, test_data_loader = (
    create_data_loader(split, tokenizer, MAX_LEN, BATCH_SIZE)
    for split in (df_train, df_val, df_test)
)

# Pull a single training batch to inspect what the loader produces.
data = next(iter(train_data_loader))
data.keys()

for key in ('input_ids', 'attention_mask', 'targets'):
    print(data[key].shape)

# Run the bare BERT encoder on the sample encoding built earlier;
# return_dict=False makes the model return a plain tuple.
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

last_hidden_state, pooled_output = bert_model(
    input_ids=encoding['input_ids'],
    attention_mask=encoding['attention_mask'],
    return_dict=False,
)

last_hidden_state.shape

bert_model.config.hidden_size

pooled_output.shape
200
+
201
class SentimentClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification layer."""

    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        # Maps BERT's pooled representation to one score per class.
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class scores for the given encoded batch."""
        bert_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        dropped = self.drop(bert_output["pooler_output"])
        return self.out(dropped)
217
+
218
# Build the classifier and move it to the selected device.
model = SentimentClassifier(len(class_names)).to(device)

# Smoke-test the untrained model on the sample batch drawn earlier.
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)

print(input_ids.shape)       # batch size x seq length
print(attention_mask.shape)  # batch size x seq length

F.softmax(model(input_ids, attention_mask), dim=1)


EPOCHS = 6

# NOTE(review): `AdamW` here is the one imported from `transformers`;
# confirm the pinned transformers version still ships it.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# The scheduler is stepped once per batch, so it spans every batch of
# every epoch.
total_steps = EPOCHS * len(train_data_loader)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

loss_fn = nn.CrossEntropyLoss().to(device)
242
+
243
def train_epoch(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples
):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of dict batches with the keys ``input_ids``,
            ``attention_mask`` and ``targets``.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Denominator used for the accuracy.

    Returns:
        Tuple ``(accuracy, mean_loss)``. ``accuracy`` is a double tensor
        holding the number of correct predictions divided by
        ``n_examples``; ``mean_loss`` is the mean of the per-batch losses,
        or NaN when the loader yielded no batches.
    """
    model = model.train()

    losses = []
    # A tensor accumulator (rather than the int 0) keeps `.double()` below
    # valid even when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # The predicted class is the index of the highest score.
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    mean_loss = np.mean(losses) if losses else float("nan")
    return correct_predictions.double() / n_examples, mean_loss
280
+
281
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of dict batches with the keys ``input_ids``,
            ``attention_mask`` and ``targets``.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        Tuple ``(accuracy, mean_loss)``. ``accuracy`` is a double tensor
        holding the number of correct predictions divided by
        ``n_examples``; ``mean_loss`` is the mean of the per-batch losses,
        or NaN when the loader yielded no batches.
    """
    model = model.eval()

    losses = []
    # A tensor accumulator (rather than the int 0) keeps `.double()` below
    # valid even when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            # The predicted class is the index of the highest score.
            _, preds = torch.max(outputs, dim=1)

            loss = loss_fn(outputs, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    mean_loss = np.mean(losses) if losses else float("nan")
    return correct_predictions.double() / n_examples, mean_loss
305
+
306
# Training loop. The notebook export had commented this whole cell out
# because of its `%%time` magic, which left `history` undefined for the code
# that follows. The magic is dropped and the loop restored.
history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):

    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(df_train)
    )

    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        len(df_val)
    )

    print(f'Val loss {val_loss} accuracy {val_acc}')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    # Save the weights whenever validation accuracy improves.
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc
348
+
349
# The accuracies in `history` are tensors; convert them to NumPy values on
# the CPU before plotting.
print(history['train_acc'])

list_of_train_accuracy = [acc.cpu().numpy() for acc in history['train_acc']]
list_of_train_accuracy

print(history['val_acc'])

list_of_val_accuracy = [acc.cpu().numpy() for acc in history['val_acc']]
list_of_val_accuracy

# Accuracy per epoch for both splits.
plt.plot(list_of_train_accuracy, label='train accuracy')
plt.plot(list_of_val_accuracy, label='validation accuracy')

plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1])

# Accuracy on the held-out test split; the loss is discarded.
test_acc, _ = eval_model(
    model,
    test_data_loader,
    loss_fn,
    device,
    len(df_test),
)

print('\n')
print('Test Accuracy : ', test_acc.item())
378
+
379
def get_predictions(model, data_loader, device=None):
    """Run ``model`` over ``data_loader`` and collect its predictions.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of dict batches with the keys ``review_text``,
            ``input_ids``, ``attention_mask`` and ``targets``.
        device: Device the batch tensors are moved to. When omitted, the
            device of the model's first parameter is used (CPU for a model
            without parameters), so the function no longer depends on a
            module-level ``device`` variable.

    Returns:
        Tuple ``(review_texts, predictions, prediction_probs, real_values)``:
        the list of review texts followed by CPU tensors with the predicted
        class indices, the softmax probabilities and the true labels.

    Raises:
        RuntimeError: If ``data_loader`` yields no batches (nothing to
            concatenate).
    """
    model = model.eval()

    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    review_texts = []
    predictions = []
    prediction_probs = []
    real_values = []

    with torch.no_grad():
        for d in data_loader:

            texts = d["review_text"]
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            # The predicted class is the index of the highest score.
            _, preds = torch.max(outputs, dim=1)

            probs = F.softmax(outputs, dim=1)

            review_texts.extend(texts)
            # Keep whole batches and concatenate once at the end instead of
            # splitting every batch into per-row tensors and re-stacking.
            predictions.append(preds)
            prediction_probs.append(probs)
            real_values.append(targets)

    predictions = torch.cat(predictions).cpu()
    prediction_probs = torch.cat(prediction_probs).cpu()
    real_values = torch.cat(real_values).cpu()
    return review_texts, predictions, prediction_probs, real_values
412
+
413
# Collect the predictions for the test split and print per-class metrics.
y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(model, test_data_loader)

report = classification_report(y_test, y_pred, target_names=class_names)
print(report)
419
+
420
def show_confusion_matrix(confusion_matrix):
    """Draw ``confusion_matrix`` as an annotated heatmap with labelled axes."""
    heatmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")

    # Keep the y tick labels horizontal and tilt the x tick labels by 30 degrees.
    for axis, angle in ((heatmap.yaxis, 0), (heatmap.xaxis, 30)):
        axis.set_ticklabels(axis.get_ticklabels(), rotation=angle, ha='right')

    plt.ylabel('True sentiment')
    plt.xlabel('Predicted sentiment')
426
+
427
# Confusion matrix of the test predictions, labelled with the class names.
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, columns=class_names, index=class_names)
show_confusion_matrix(df_cm)

# Inspect one test example: its text, true label and predicted probabilities.
idx = 2

review_text = y_review_texts[idx]
true_sentiment = y_test[idx]
pred_df = pd.DataFrame({
    'class_names': class_names,
    'values': y_pred_probs[idx],
})

wrapped_review = "\n".join(wrap(review_text))
print(wrapped_review)
print()
print(f'True sentiment: {class_names[true_sentiment]}')

sns.barplot(x='values', y='class_names', data=pred_df, orient='h')
plt.ylabel('sentiment')
plt.xlabel('probability')
plt.xlim([0, 1])
448
+
449
# Classify a comment typed in by the user.
review_text = input("Enter a comment for sentiment analysis: ")

encoded_review = tokenizer.encode_plus(
    review_text,
    max_length=MAX_LEN,
    add_special_tokens=True,
    return_token_type_ids=False,
    # `pad_to_max_length=True` is deprecated; pad and truncate to
    # `max_length` explicitly.
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

input_ids = encoded_review['input_ids'].to(device)
attention_mask = encoded_review['attention_mask'].to(device)

# Inference only: no gradient tracking needed.
with torch.no_grad():
    output = model(input_ids, attention_mask)
_, prediction = torch.max(output, dim=1)

print(f'Review text: {review_text}')
# `.item()` turns the one-element tensor into a plain int list index.
print(f'Sentiment : {class_names[prediction.item()]}')
469
+
470
def suggest_improved_text(review_text, model, tokenizer):
    """Return a more positive variant of ``review_text`` when it is not positive.

    The sentiment of the original text is analysed first. When it is
    'negative' or 'neutral', the text is passed on to
    ``generate_improved_text``; otherwise it is returned unchanged.

    Args:
        review_text: Text to analyse.
        model: Classifier forwarded to ``analyze_sentiment``.
        tokenizer: Tokenizer forwarded to ``analyze_sentiment``.

    Returns:
        The text returned by ``generate_improved_text`` for a negative or
        neutral review, otherwise ``review_text`` itself.
    """
    # Analyse the sentiment of the original text.
    sentiment = analyze_sentiment(review_text, model, tokenizer)

    if sentiment not in ('negative', 'neutral'):
        return review_text

    # analyze_sentiment already ran the model, so map the label back to its
    # class index instead of encoding and classifying the text a second
    # time. (The earlier version also passed an undefined name `text` here,
    # which raised NameError.)
    predicted_sentiment = class_names.index(sentiment)
    return generate_improved_text(review_text, predicted_sentiment)
497
+
498
def analyze_sentiment(review_text, model, tokenizer):
    """Return the predicted sentiment label for ``review_text``.

    Args:
        review_text: Text to classify.
        model: Classifier called as ``model(input_ids, attention_mask)``.
            Its train/eval mode is left untouched, so the caller should
            have it in eval mode.
        tokenizer: Tokenizer exposing a Hugging Face style ``encode_plus``.

    Returns:
        The entry of ``class_names`` at the index of the highest score.
    """
    encoded_input = tokenizer.encode_plus(
        review_text,
        max_length=MAX_LEN,
        add_special_tokens=True,
        return_token_type_ids=False,
        # `pad_to_max_length=True` is deprecated; pad and truncate to
        # `max_length` explicitly.
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = encoded_input['input_ids'].to(device)
    attention_mask = encoded_input['attention_mask'].to(device)

    # Inference only: no gradient tracking needed.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
    _, predicted_sentiment = torch.max(outputs, dim=1)

    # `.item()` turns the one-element tensor into a plain int list index.
    return class_names[predicted_sentiment.item()]
515
def generate_improved_text(review_text, predicted_sentiment):
    """Append a fixed list of positive words to reviews predicted as class 0.

    For any other ``predicted_sentiment`` the text is returned unchanged.
    """
    positive_words = ("marvellous", "fantastic", "excellent", "admirable", "formidable")

    if predicted_sentiment == 0:
        return " ".join([review_text, *positive_words])
    return review_text