# Sentiment Analysis

## Imports, constants and setup

In [15]:
from collections import Counter

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from tqdm.notebook import tqdm
from wordcloud import WordCloud

from app.constants import CACHE_DIR
from app.data import load_data, tokenize
from app.model import _get_vectorizer

In [2]:
# Register tqdm's pandas integration; this is what makes `progress_apply`
# (used in the exploration cells below) available on pandas objects.
tqdm.pandas()

In [3]:
# Seed passed as `random_state` throughout the notebook to keep runs reproducible
SEED = 42
# joblib on-disk cache located at the project's CACHE_DIR; verbose=0 keeps it quiet
CACHE = joblib.Memory(CACHE_DIR, verbose=0)

## Data loading

In [4]:
# Load the "test" data and gather texts and labels in one pandas DataFrame
text_data, label_data = load_data("test")
dataset = pd.DataFrame(
    data={
        "text": text_data,
        "sentiment": label_data,
    },
)
dataset.head()

Unnamed: 0,text,sentiment
0,"MC, happy mother`s day to your mom ;).. love yah",1
1,A year from now is graduation....i am pretty s...,0
2,Great for organising my work life balance,1
3,remember the guy who 1st #tweetbud you! ~> _2...,1
4,She! Maybe that was our first mistake. Not e...,0


In [5]:
# Run the project tokenizer over every raw text (progress bars enabled)
raw_texts = dataset["text"].tolist()
tokens = tokenize(raw_texts, batch_size=1024, n_jobs=2, show_progress=True)

# Store the tokens as one space-separated string per document
dataset["tokens"] = tokens.apply(" ".join)
dataset.head()

Cleaning: 100%|██████████| 3276/3276 [00:02<00:00, 1205.57doc/s]
Lemmatization: 100%|██████████| 3276/3276 [00:06<00:00, 508.76doc/s] 


Unnamed: 0,text,sentiment,tokens
0,"MC, happy mother`s day to your mom ;).. love yah",1,happy mother day mom love yah
1,A year from now is graduation....i am pretty s...,0,year graduationi pretty sure ready
2,Great for organising my work life balance,1,great organise work life balance
3,remember the guy who 1st #tweetbud you! ~> _2...,1,remember guy help flwrs smile
4,She! Maybe that was our first mistake. Not e...,0,maybe mistake cool brown nose moment


## Data exploration

### Sentiment distribution

In [None]:
# Bar chart of how many samples carry each sentiment label
_, ax = plt.subplots(figsize=(6, 4))

# value_counts() orders its result by frequency (most common first), so the bar
# order would depend on which class happens to be larger. Sorting by label
# (0, then 1) guarantees the bars match the hard-coded tick labels below.
sentiment_counts = dataset["sentiment"].value_counts().sort_index()
sentiment_counts.plot(kind="bar", ax=ax)

# Label 0 is shown as "Negative" and label 1 as "Positive"
ax.set_xticklabels(["Negative", "Positive"], rotation=0)
ax.set_xlabel("Sentiment")
ax.set_ylabel("Count")

plt.show()

### Word cloud (before tokenization)

In [None]:
# Count how often every lower-cased, whitespace-separated word occurs in the raw text
word_freq = Counter()
raw_words = dataset["text"].str.lower().str.split()
raw_words.progress_apply(word_freq.update)

# Keep only the 100 most frequent words
common_words = word_freq.most_common(100)

# Build the word cloud from those frequencies (seeded for a reproducible layout)
wrd_cloud = WordCloud(width=800, height=400, random_state=SEED)
wrd_cloud = wrd_cloud.generate_from_frequencies(dict(common_words))

# Draw the cloud on a large figure with the axes hidden
plt.figure(figsize=(20, 20))
plt.imshow(wrd_cloud, interpolation="bilinear")
plt.axis("off")
plt.show()

### Word cloud (after tokenization)

In [None]:
# Tally every token of the tokenized documents
token_freq = Counter()
tokenized_docs = dataset["tokens"].str.split()
tokenized_docs.progress_apply(token_freq.update)

# Restrict the cloud to the 100 most frequent tokens
common_tokens = token_freq.most_common(100)
token_weights = dict(common_tokens)

# Build the word cloud from those frequencies (seeded for a reproducible layout)
tkn_cloud = WordCloud(width=800, height=400, random_state=SEED).generate_from_frequencies(token_weights)

# Draw the cloud on a large figure with the axes hidden
plt.figure(figsize=(20, 20))
plt.imshow(tkn_cloud, interpolation="bilinear")
plt.axis("off")
plt.show()

### Token association

In [None]:
# One word cloud per sentiment class, stacked vertically
_, ax = plt.subplots(2, 1, figsize=(20, 20))

for i, sentiment in enumerate(["Negative", "Positive"]):
    # Select the tokenized documents whose label equals the class index
    class_tokens = dataset.loc[dataset["sentiment"] == i, "tokens"]

    # Count the tokens of this class and keep the 100 most frequent ones
    freq = Counter()
    class_tokens.str.split().progress_apply(freq.update)
    most_common = freq.most_common(100)

    cloud = WordCloud(width=800, height=400, random_state=SEED).generate_from_frequencies(dict(most_common))

    # Draw the cloud on this class's panel, titled with the class name
    panel = ax[i]
    panel.imshow(cloud, interpolation="bilinear")
    panel.axis("off")
    panel.set_title(sentiment)

plt.show()

### Token frequency

In [None]:
_, ax = plt.subplots(figsize=(6, 4))

# Overlay two histograms of the number of whitespace-separated items per
# document: raw text first, tokenized text second.
for column, label in (("text", "Before Tokenization"), ("tokens", "After Tokenization")):
    lengths = dataset[column].str.split().str.len()
    lengths.plot(kind="hist", bins=50, ax=ax, alpha=0.5, label=label)

ax.set_xlabel("Number of tokens")
ax.set_ylabel("Count")
ax.legend()
plt.show()

### Vocabulary size

In [None]:
# Distinct words in the raw text versus distinct tokens after tokenization
vocab_before = len(word_freq)
vocab_after = len(token_freq)
print(f"Vocabulary size before tokenization: {vocab_before}")
print(f"Vocabulary size after tokenization: {vocab_after}")

## Vectorization

The `count` vectorizer is a simple vectorizer that counts the number of times each token appears in a document. The `tfidf` vectorizer starts from the same counts as `count` but additionally weights them by the inverse document frequency, so tokens that occur in many documents contribute less. The `hashing` vectorizer is a memory-efficient vectorizer that uses a hash function to map tokens to feature indices. Because the `hashing` vectorizer does not store the vocabulary in memory, it can vectorize large datasets.

In [6]:
# One vectorizer per representation, keyed by name.
# NOTE(review): the second positional argument of the count/tfidf calls is
# presumably the same `n_features` that the hashing call passes by keyword;
# confirm against `_get_vectorizer`.
vectorizers = dict(
    hashing=_get_vectorizer("hashing", n_features=2**20),
    count=_get_vectorizer("count", 20_000),
    tfidf=_get_vectorizer("tfidf", 20_000),
)

In [7]:
# Every document is handed to the vectorizers as a list of tokens
token_list = dataset["tokens"].str.split().tolist()

# Fit each vectorizer and transform the documents in one step.
# NOTE(review): the vectorizers are fitted on every row of the dataset here,
# i.e. before any train/test split is made.
X = {}
for name, vectorizer in vectorizers.items():
    X[name] = vectorizer.fit_transform(token_list)

# Display the shape of each vectorized representation
for name, data in X.items():
    print(f"{name}: {data.shape}")

hashing: (3276, 1048576)
count: (3276, 1084)
tfidf: (3276, 1084)


In [12]:
# Print the first 10 feature names of the fitted count vectorizer
count_features = vectorizers["count"].get_feature_names_out()
features = count_features[:10]
print(features)

['ability' 'able' 'absolutely' 'access' 'accomplish' 'account' 'ace'
 'active' 'activity' 'actually']


## Classification

In [14]:
# Hyper-parameter grids that are used by more than one entry below
c_grid = np.logspace(-3, 3, 20)
forest_grid = np.arange(10, 500, 50)

# Voting ensemble over logistic regression, linear SVM and random forest;
# its grid addresses the members through their "<name>__<param>" keys.
voting = VotingClassifier(
    estimators=[
        ("lr", LogisticRegression(max_iter=1000, random_state=SEED)),
        ("svc", LinearSVC(max_iter=10000, dual=False, random_state=SEED)),
        ("rf", RandomForestClassifier(random_state=SEED)),
    ],
)
voting_grid = {
    "lr__C": c_grid,
    "svc__C": c_grid,
    "rf__n_estimators": forest_grid,
}

# Define classifiers as (estimator, parameter grid) pairs
classifiers = [
    (LogisticRegression(max_iter=1000, random_state=SEED), {"C": c_grid}),
    (LinearSVC(max_iter=10000, dual=False, random_state=SEED), {"C": c_grid}),
    (KNeighborsClassifier(), {"n_neighbors": np.arange(1, 10)}),
    (DecisionTreeClassifier(random_state=SEED), {"max_depth": np.arange(1, 10)}),
    (RandomForestClassifier(random_state=SEED), {"n_estimators": forest_grid}),
    (GradientBoostingClassifier(random_state=SEED), {"n_estimators": np.arange(100, 500, 25)}),
    (voting, voting_grid),
]

In [17]:
# Split the data into training and testing sets.
#
# The token lists are split first and every vectorizer is then fitted on the
# training rows only; the test rows are merely transformed. Splitting matrices
# that were fitted on the whole dataset would let vocabulary / IDF statistics
# from the test rows leak into the features used for training and evaluation.
tokens_train, tokens_test, y_train, y_test = train_test_split(
    token_list,
    dataset["sentiment"],
    test_size=0.2,
    random_state=SEED,
)

# Maps vectorizer name -> (X_train, X_test, y_train, y_test). The labels are
# shared because all representations use the same row split.
# Note: this refits the objects stored in `vectorizers` on the training rows.
X_split = {}
for name, vectorizer in vectorizers.items():
    X_train = vectorizer.fit_transform(tokens_train)
    X_test = vectorizer.transform(tokens_test)
    X_split[name] = (X_train, X_test, y_train, y_test)

In [18]:
# Cross-validation strategy: 5 stratified folds, shuffled with the shared seed
kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=SEED,
)

### Search

## Evaluation