In [None]:
# Location of the pickled fine-tuned classifier on the mounted Google Drive.
model_file_path = (
    "/content/drive/MyDrive/sentiment_analysis_BERT_finetune/"
    "sentiment_analysis_finetune_bert.pkl"
)

In [None]:
# Install Transformers and gradio
!pip install transformers
!pip install gradio

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/7.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/7.4 MB[0m [31m13.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━[0m [32m5.5/7.4 MB[0m [31m80.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m7.4/7.4 MB[0m [31m94.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m66.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=

In [None]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by libraries —
# consider narrowing the filter to specific categories.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive at /content/drive (Colab only) so that the saved model
# under /content/drive/MyDrive can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Import all required packages
import numpy as np
import pandas as pd
import seaborn as sns
import re
import torch
import random
import torch.nn as nn
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
import pickle
from tqdm import tqdm
import gradio as gr

In [None]:
# Select the compute device: use the GPU when one is available, otherwise fall
# back to the CPU so the notebook still runs on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

The next cell uses the BERT tokenizer to break text into tokens and map them to numerical IDs. First, the pretrained checkpoint name, 'bert-base-cased', is set, and the matching tokenizer is loaded.

Next, the sample text "originally gave this a 2 star" is passed through the tokenizer. This splits the text into a sequence of tokens, the basic units that BERT operates on, and each token is then converted to its ID in the vocabulary.

In [None]:
# Pretrained checkpoint shared by the tokenizer and the classifier below.
MODEL_NAME = 'bert-base-cased'
tokenizer = transformers.BertTokenizer.from_pretrained(MODEL_NAME)

# Tokenize one sample sentence and look up the vocabulary ID of each token.
sample_text = "originally gave this a 2 star"
tokens = tokenizer.tokenize(sample_text)
ids = tokenizer.convert_tokens_to_ids(tokens)

# Show the raw text, its tokens and their IDs, separated by rule lines.
separator = '=' * 60
print(sample_text, separator, tokens, separator, ids, sep='\n')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

originally gave this a 2 star
['originally', 'gave', 'this', 'a', '2', 'star']
[2034, 1522, 1142, 170, 123, 2851]


The code prints out several special tokens used by the tokenizer along with their corresponding numerical IDs. Here's a breakdown of what each line does:

**`print(tokenizer.sep_token, tokenizer.sep_token_id)`:**

Prints the separator token used by the tokenizer (often used to separate two segments of text, like question and answer).
Prints the corresponding numerical ID of the separator token.

**`print(tokenizer.cls_token, tokenizer.cls_token_id)`:**

Prints the classification token used by the tokenizer (often used at the beginning of input text).
Prints the corresponding numerical ID of the classification token.

**`print(tokenizer.unk_token, tokenizer.unk_token_id)`:**

Prints the unknown token used by the tokenizer (represents out-of-vocabulary or unrecognized words).
Prints the corresponding numerical ID of the unknown token.

**`print(tokenizer.pad_token, tokenizer.pad_token_id)`:**

Prints the padding token used by the tokenizer (used to pad sequences to a uniform length).
Prints the corresponding numerical ID of the padding token.

In [None]:
# Print each special token next to its vocabulary ID:
# separator, classification, unknown and padding, in that order.
for special in ('sep', 'cls', 'unk', 'pad'):
    print(getattr(tokenizer, f'{special}_token'), getattr(tokenizer, f'{special}_token_id'))

[SEP] 102
[CLS] 101
[UNK] 100
[PAD] 0


In [None]:
# Fixed sequence length used for tokenizer calls: passed as `max_length` to
# encode_plus, with padding up to this length.
MAX_LEN = 250

In [None]:
class GPReviewDataset(Dataset):
    """
    PyTorch Dataset that tokenizes review texts on access for text classification.

    Args:
        reviews (sequence): Indexable collection of review texts (list, NumPy array, ...).
        targets (sequence): Indexable collection of integer class labels, aligned with ``reviews``.
        tokenizer: Object exposing a Hugging Face style ``encode_plus`` method.
        max_len (int): Fixed length every tokenized review is padded or truncated to.

    Attributes:
        reviews (sequence): The review texts, stored as given.
        targets (sequence): The class labels, stored as given.
        tokenizer: The tokenizer used in ``__getitem__``.
        max_len (int): Fixed length of the tokenized sequences.
    """

    def __init__(self, reviews, targets, tokenizer, max_len):
        """
        Stores the data and tokenization settings; no tokenization happens here.
        """
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """
        Returns the number of samples (reviews) in the dataset.
        """
        return len(self.reviews)

    def __getitem__(self, item):
        """
        Tokenizes and returns one sample.

        Args:
            item (int): Index of the sample to retrieve.

        Returns:
            dict: ``review_text`` (str), ``input_ids`` and ``attention_mask``
            (flattened 1-D tensors), and ``targets`` (scalar ``torch.long`` tensor).
        """
        # str() guards against non-string entries (e.g. NaN read from a CSV).
        review = str(self.reviews[item])
        target = self.targets[item]

        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`,
        # and `truncation=True` makes the truncation of over-long reviews explicit
        # instead of relying on the tokenizer's warning-emitting default.
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'review_text': review,
            # return_tensors='pt' yields a leading batch dimension; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

In [None]:
from torch.utils.data import DataLoader

def create_data_loader(df, tokenizer, max_len, batch_size):
    """
    Wraps a review DataFrame in a batched DataLoader.

    Args:
        df (pandas.DataFrame): Frame with a ``Text`` column (review texts) and a
            ``sentiment`` column (labels).
        tokenizer: Tokenizer instance handed to GPReviewDataset.
        max_len (int): Maximum length for tokenized sequences.
        batch_size (int): Number of samples per batch.

    Returns:
        DataLoader: Loader yielding batches built from GPReviewDataset samples.
    """
    # Pull both columns out as NumPy arrays and wrap them in the dataset.
    dataset = GPReviewDataset(
        df.Text.to_numpy(),
        df.sentiment.to_numpy(),
        tokenizer,
        max_len,
    )

    # num_workers=0 loads batches in the main process (kept for Windows compatibility).
    loader_options = {'batch_size': batch_size, 'num_workers': 0}
    return DataLoader(dataset, **loader_options)


In [None]:
# Load the pretrained base BERT model for MODEL_NAME.
# NOTE(review): `bert_model` is not referenced by any other cell visible in this
# notebook (SentimentClassifier loads its own copy) — confirm this cell is still needed.
bert_model = BertModel.from_pretrained(MODEL_NAME)

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [None]:
class SentimentClassifier(nn.Module):
    """
    BERT encoder followed by dropout and a linear classification layer.

    Args:
        n_classes (int): Number of output classes.

    Attributes:
        bert: Pretrained BERT model loaded from MODEL_NAME.
        drop: Dropout layer (p=0.3) applied to BERT's pooled output.
        out: Linear layer mapping the pooled output to ``n_classes`` scores.
    """

    def __init__(self, n_classes):
        """
        Builds the encoder, the dropout layer and the classification layer.

        Args:
            n_classes (int): Number of output classes.
        """
        super().__init__()
        # Attribute names are kept as-is: a saved model refers to them by name.
        self.bert = BertModel.from_pretrained(MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        hidden_size = self.bert.config.hidden_size
        self.out = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """
        Runs the encoder and the classification head.

        Args:
            input_ids (tensor): Token IDs of the input sequences.
            attention_mask (tensor): Attention mask passed to BERT alongside the IDs.

        Returns:
            tensor: Output of the linear layer, one score per class for each
            input sequence. No softmax is applied here.
        """
        pooled = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).pooler_output
        return self.out(self.drop(pooled))

In [None]:
class_names = ['negative', 'neutral', 'positive']

In [None]:
def softmax(array_, axis=None):
    """
    Numerically stable softmax of an array.

    Args:
        array_ (array-like): Input scores.
        axis (int or None): Axis to normalize over. The default, None, normalizes
            over all elements together, so the whole result sums to 1.

    Returns:
        numpy.ndarray: Array of the same shape as the input with non-negative
        entries that sum to 1 along ``axis``. An empty input yields an empty array.
    """
    values = np.asarray(array_)
    if values.size == 0:
        # Nothing to normalize; avoid calling max() on an empty array.
        return np.exp(values)

    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps exp() from overflowing to inf (and the ratio from becoming NaN).
    shifted = values - np.max(values, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=axis, keepdims=True)

In [None]:
def sentiment_analyse(review_text):
    """
    Classifies the sentiment of a single review with the loaded model.

    Relies on the notebook-level ``tokenizer``, ``model``, ``device``, ``MAX_LEN``,
    ``class_names`` and ``softmax``.

    Args:
        review_text (str): The input review text.

    Returns:
        str: Sentence naming the predicted class and the confidence in percent.
    """
    # Tokenize to a fixed length. `padding='max_length'` replaces the deprecated
    # `pad_to_max_length=True`; `truncation=True` makes the truncation of
    # over-long reviews explicit and silences the tokenizer warning.
    encoded_review = tokenizer.encode_plus(
        review_text,
        max_length=MAX_LEN,
        add_special_tokens=True,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)

    # Inference only: eval mode disables dropout so predictions are deterministic,
    # and no_grad avoids building an autograd graph.
    model.eval()
    with torch.no_grad():
        output = model(input_ids, attention_mask)

    # Single-review batch: take the index of the highest score as a plain int.
    _, prediction = torch.max(output, dim=1)
    predicted_class = class_names[prediction.item()]

    # Convert the scores to probabilities and report the largest as a percentage.
    confidence = np.max(softmax(output.detach().cpu().numpy())) * 100

    return f"The given review is {predicted_class} with {confidence:.2f}% confidence."


In [None]:


# Open and load the saved checkpoint using pickle. The loaded object is indexed
# with the 'model' key in the next cell.
# SECURITY: pickle.load can execute arbitrary code contained in the file —
# only load checkpoint files from a trusted source.
with open(model_file_path, "rb") as f:
    model = pickle.load(f)

In [None]:
# Unwrap the checkpoint: the pickle holds a dict whose 'model' entry is the classifier.
# The isinstance guard keeps this cell safe to re-run once `model` is already unwrapped.
if isinstance(model, dict):
    model = model['model']

# Place the classifier on the inference device and switch off dropout for prediction.
model = model.to(device).eval()

In [None]:
# Smoke test: classify one short review with the loaded model
sentiment_analyse("good product")


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


'The given review is positive with 99.50% confidence.'

In [None]:
# Text shown under the title of the demo page.
demo_description = "Description: This demo uses a sentiment analysis model to predict the sentiment of food reviews from Amazon. The model is developed by Vivek and is finetuned on the Bert Base model from Hugging Face. The input to the demo is a review of a fine food product from Amazon. The output of the demo is the sentiment of the review (positive, Neutral or negative) and the level of confidence in the prediction."

# Clickable sample inputs offered to the user (one single-field row each).
example_reviews = [
    ["This product is amazing! I love it."],
    ["I'm very disappointed with this item. It's not what I expected."],
]

# Build the Gradio interface: one text box in, one text box out.
demo = gr.Interface(
    fn=sentiment_analyse,
    inputs="text",
    outputs="text",
    title="Amazon Fine Foods Sentiment Analysis",
    description=demo_description,
    examples=example_reviews,
)

# Start the app; share=True requests a public share link in addition to the local one.
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://e32185bc57884e7abf.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


