{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Sentiment Analysis" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Imports, constants and setup" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from __future__ import annotations\n", "\n", "from collections import Counter\n", "\n", "import joblib\n", "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "from tqdm.notebook import tqdm\n", "from wordcloud import WordCloud\n", "\n", "from app.constants import CACHE_DIR\n", "from app.data import load_data, tokenize" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "tqdm.pandas()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "SEED = 42\n", "CACHE = joblib.Memory(CACHE_DIR, verbose=0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data loading" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textsentiment
0Cooking microwave pizzas, yummy1
1I love the humor, I just reworded it. Like sa...1
2That sucks to hear. I hate days like that0
3Umm yeah. That`s probably a pretty good note ...1
4That would panic me a little! Maybe you can ...0
\n", "
" ], "text/plain": [ " text sentiment\n", "0 Cooking microwave pizzas, yummy 1\n", "1 I love the humor, I just reworded it. Like sa... 1\n", "2 That sucks to hear. I hate days like that 0\n", "3 Umm yeah. That`s probably a pretty good note ... 1\n", "4 That would panic me a little! Maybe you can ... 0" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load data and convert to pandas DataFrame\n", "text_data, label_data = load_data(\"test\")\n", "dataset = pd.DataFrame({\"text\": text_data, \"sentiment\": label_data})\n", "dataset.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Cleaning: 100%|██████████| 19583/19583 [00:06<00:00, 3243.25doc/s]\n", "Lemmatization: 100%|██████████| 19583/19583 [01:08<00:00, 284.25doc/s]\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textsentimenttokens
0Cooking microwave pizzas, yummy1cook microwave pizza yummy
1I love the humor, I just reworded it. Like sa...1love humor reword like say group therapy inste...
2That sucks to hear. I hate days like that0suck hear hate day like
3Umm yeah. That`s probably a pretty good note ...1umm yeah probably pretty good note self eeeeee...
4That would panic me a little! Maybe you can ...0panic little maybe read orbitron gym like dowh...
\n", "
" ], "text/plain": [ " text sentiment \\\n", "0 Cooking microwave pizzas, yummy 1 \n", "1 I love the humor, I just reworded it. Like sa... 1 \n", "2 That sucks to hear. I hate days like that 0 \n", "3 Umm yeah. That`s probably a pretty good note ... 1 \n", "4 That would panic me a little! Maybe you can ... 0 \n", "\n", " tokens \n", "0 cook microwave pizza yummy \n", "1 love humor reword like say group therapy inste... \n", "2 suck hear hate day like \n", "3 umm yeah probably pretty good note self eeeeee... \n", "4 panic little maybe read orbitron gym like dowh... " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Tokenize text data\n", "tokens = tokenize(dataset[\"text\"].tolist(), batch_size=1024, n_jobs=2, show_progress=True)\n", "dataset[\"tokens\"] = tokens.apply(\" \".join)\n", "dataset.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data exploration" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Sentiment distribution" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "_, ax = plt.subplots(figsize=(6, 4))\n", "\n", "dataset[\"sentiment\"].value_counts().plot(kind=\"bar\", ax=ax)\n", "ax.set_xticklabels([\"Negative\", \"Positive\"], rotation=0)\n", "ax.set_xlabel(\"Sentiment\")\n", "ax.set_ylabel(\"Count\")\n", "\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Word cloud (before tokenization)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a885e681eaf14751b11088566e643a3e", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/19583 [00:00" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Gather all the unique words in the dataset\n", "word_freq = Counter()\n", "dataset[\"text\"].str.lower().str.split().progress_apply(word_freq.update)\n", "\n", "# Now get the most common words\n", "common_words = word_freq.most_common(100)\n", "\n", "# Create a word cloud of the most common words\n", "wrd_cloud = WordCloud(width=800, height=400, random_state=SEED).generate_from_frequencies(dict(common_words))\n", "\n", "# Display the word cloud\n", "plt.figure(figsize=(20, 20))\n", "plt.imshow(wrd_cloud, interpolation=\"bilinear\")\n", "plt.axis(\"off\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Word cloud (after tokenization)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "04f9a50519654e7188f59c62645572ff", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/19583 [00:00" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Gather all the unique tokens in the dataset\n", "token_freq = Counter()\n", "dataset[\"tokens\"].str.split().progress_apply(token_freq.update)\n", "\n", "# Now get the most common tokens\n", "common_tokens = 
token_freq.most_common(100)\n", "\n", "# Create a word cloud of the most common tokens\n", "tkn_cloud = WordCloud(width=800, height=400, random_state=SEED).generate_from_frequencies(dict(common_tokens))\n", "\n", "# Display the word cloud\n", "plt.figure(figsize=(20, 20))\n", "plt.imshow(tkn_cloud, interpolation=\"bilinear\")\n", "plt.axis(\"off\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Token association" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "283ee0b586574489bf14a8ef0105ef78", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/9105 [00:00" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "_, axes = plt.subplots(2, 1, figsize=(20, 20))\n", "\n", "# One panel per class: the list position doubles as the value matched against the sentiment column\n", "for label, (panel, title) in enumerate(zip(axes, [\"Negative\", \"Positive\"])):\n", "    class_tokens = dataset.loc[dataset[\"sentiment\"] == label, \"tokens\"]\n", "\n", "    # Tally token occurrences for this class only\n", "    token_counts = Counter()\n", "    class_tokens.str.split().progress_apply(token_counts.update)\n", "    top_tokens = dict(token_counts.most_common(100))\n", "\n", "    cloud = WordCloud(width=800, height=400, random_state=SEED).generate_from_frequencies(top_tokens)\n", "    panel.imshow(cloud, interpolation=\"bilinear\")\n", "    panel.axis(\"off\")\n", "    panel.set_title(title)\n", "\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Vectorization" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Classification" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Evaluation" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 2 }