diff --git "a/experiment.ipynb" "b/experiment.ipynb" new file mode 100644--- /dev/null +++ "b/experiment.ipynb" @@ -0,0 +1,507 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Sentiment Analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Imports, constants and setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from __future__ import annotations\n", + "\n", + "from collections import Counter\n", + "\n", + "import joblib\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "from tqdm.notebook import tqdm\n", + "from wordcloud import WordCloud\n", + "\n", + "from app.constants import CACHE_DIR\n", + "from app.data import load_data, tokenize" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "tqdm.pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "SEED = 42\n", + "CACHE = joblib.Memory(CACHE_DIR, verbose=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data loading" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textsentiment
0Cooking microwave pizzas, yummy1
1I love the humor, I just reworded it. Like sa...1
2That sucks to hear. I hate days like that0
3Umm yeah. That`s probably a pretty good note ...1
4That would panic me a little! Maybe you can ...0
\n", + "
" + ], + "text/plain": [ + " text sentiment\n", + "0 Cooking microwave pizzas, yummy 1\n", + "1 I love the humor, I just reworded it. Like sa... 1\n", + "2 That sucks to hear. I hate days like that 0\n", + "3 Umm yeah. That`s probably a pretty good note ... 1\n", + "4 That would panic me a little! Maybe you can ... 0" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load data and convert to pandas DataFrame\n", + "text_data, label_data = load_data(\"test\")\n", + "dataset = pd.DataFrame({\"text\": text_data, \"sentiment\": label_data})\n", + "dataset.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Cleaning: 100%|██████████| 19583/19583 [00:06<00:00, 3243.25doc/s]\n", + "Lemmatization: 100%|██████████| 19583/19583 [01:08<00:00, 284.25doc/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textsentimenttokens
0Cooking microwave pizzas, yummy1cook microwave pizza yummy
1I love the humor, I just reworded it. Like sa...1love humor reword like say group therapy inste...
2That sucks to hear. I hate days like that0suck hear hate day like
3Umm yeah. That`s probably a pretty good note ...1umm yeah probably pretty good note self eeeeee...
4That would panic me a little! Maybe you can ...0panic little maybe read orbitron gym like dowh...
\n", + "
" + ], + "text/plain": [ + " text sentiment \\\n", + "0 Cooking microwave pizzas, yummy 1 \n", + "1 I love the humor, I just reworded it. Like sa... 1 \n", + "2 That sucks to hear. I hate days like that 0 \n", + "3 Umm yeah. That`s probably a pretty good note ... 1 \n", + "4 That would panic me a little! Maybe you can ... 0 \n", + "\n", + " tokens \n", + "0 cook microwave pizza yummy \n", + "1 love humor reword like say group therapy inste... \n", + "2 suck hear hate day like \n", + "3 umm yeah probably pretty good note self eeeeee... \n", + "4 panic little maybe read orbitron gym like dowh... " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Tokenize text data\n", + "tokens = tokenize(dataset[\"text\"].tolist(), batch_size=1024, n_jobs=2, show_progress=True)\n", + "dataset[\"tokens\"] = tokens.apply(\" \".join)\n", + "dataset.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data exploration" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sentiment distribution" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "_, ax = plt.subplots(figsize=(6, 4))\n", + "\n", + "dataset[\"sentiment\"].value_counts().plot(kind=\"bar\", ax=ax)\n", + "ax.set_xticklabels([\"Negative\", \"Positive\"], rotation=0)\n", + "ax.set_xlabel(\"Sentiment\")\n", + "ax.set_ylabel(\"Count\")\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Word cloud (before tokenization)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a885e681eaf14751b11088566e643a3e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/19583 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Gather all the unique words in the dataset\n", + "word_freq = Counter()\n", + "dataset[\"text\"].str.lower().str.split().progress_apply(word_freq.update)\n", + "\n", + "# Now get the most common words\n", + "common_words = word_freq.most_common(100)\n", + "\n", + "# Create a word cloud of the most common words\n", + "wrd_cloud = WordCloud(width=800, height=400, random_state=SEED).generate_from_frequencies(dict(common_words))\n", + "\n", + "# Display the word cloud\n", + "plt.figure(figsize=(20, 20))\n", + "plt.imshow(wrd_cloud, interpolation=\"bilinear\")\n", + "plt.axis(\"off\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Word cloud (after tokenization)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "04f9a50519654e7188f59c62645572ff", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/19583 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Gather all the unique tokens 
in the dataset\n", + "token_freq = Counter()\n", + "dataset[\"tokens\"].str.split().progress_apply(token_freq.update)\n", + "\n", + "# Keep only the 100 most frequent tokens\n", + "common_tokens = token_freq.most_common(100)\n", + "\n", + "# Build the word cloud from those token frequencies\n", + "tkn_cloud = WordCloud(width=800, height=400, random_state=SEED).generate_from_frequencies(dict(common_tokens))\n", + "\n", + "# Render the cloud on a large figure with the axes hidden\n", + "plt.figure(figsize=(20, 20))\n", + "plt.imshow(tkn_cloud, interpolation=\"bilinear\")\n", + "plt.axis(\"off\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Token association" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "283ee0b586574489bf14a8ef0105ef78", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/9105 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# One word cloud per sentiment label: index 0 is drawn as Negative, index 1 as Positive\n", + "_, ax = plt.subplots(2, 1, figsize=(20, 20))\n", + "\n", + "for i, sentiment in enumerate([\"Negative\", \"Positive\"]):\n", + "    # Token frequencies restricted to the rows carrying this sentiment label\n", + "    freq = Counter()\n", + "    dataset.loc[dataset[\"sentiment\"] == i, \"tokens\"].str.split().progress_apply(freq.update)\n", + "    most_common = freq.most_common(100)\n", + "\n", + "    cloud = WordCloud(width=800, height=400, random_state=SEED).generate_from_frequencies(dict(most_common))\n", + "    ax[i].imshow(cloud, interpolation=\"bilinear\")\n", + "    ax[i].axis(\"off\")\n", + "    ax[i].set_title(sentiment)\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Vectorization" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Classification" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluation" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": 
"python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}