diff --git "a/experiment.ipynb" "b/experiment.ipynb"
new file mode 100644
--- /dev/null
+++ "b/experiment.ipynb"
@@ -0,0 +1,507 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Sentiment Analysis"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Imports, constants and setup"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from __future__ import annotations\n",
+ "\n",
+ "from collections import Counter\n",
+ "\n",
+ "import joblib\n",
+ "import matplotlib.pyplot as plt\n",
+ "import pandas as pd\n",
+ "from tqdm.notebook import tqdm\n",
+ "from wordcloud import WordCloud\n",
+ "\n",
+ "from app.constants import CACHE_DIR\n",
+ "from app.data import load_data, tokenize"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tqdm.pandas()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "SEED = 42\n",
+ "CACHE = joblib.Memory(CACHE_DIR, verbose=0)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Data loading"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
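+ "<div>\n",
+ "<style scoped>\n",
+ "    .dataframe tbody tr th:only-of-type {\n",
+ "        vertical-align: middle;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe tbody tr th {\n",
+ "        vertical-align: top;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe thead th {\n",
+ "        text-align: right;\n",
+ "    }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ "  <thead>\n",
+ "    <tr style=\"text-align: right;\">\n",
+ "      <th></th>\n",
+ "      <th>text</th>\n",
+ "      <th>sentiment</th>\n",
+ "    </tr>\n",
+ "  </thead>\n",
+ "  <tbody>\n",
+ "    <tr>\n",
+ "      <th>0</th>\n",
+ "      <td>Cooking microwave pizzas, yummy</td>\n",
+ "      <td>1</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>1</th>\n",
+ "      <td>I love the humor, I just reworded it. Like sa...</td>\n",
+ "      <td>1</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>2</th>\n",
+ "      <td>That sucks to hear. I hate days like that</td>\n",
+ "      <td>0</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>3</th>\n",
+ "      <td>Umm yeah. That`s probably a pretty good note ...</td>\n",
+ "      <td>1</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>4</th>\n",
+ "      <td>That would panic me a little! Maybe you can ...</td>\n",
+ "      <td>0</td>\n",
+ "    </tr>\n",
+ "  </tbody>\n",
+ "</table>\n",
+ "</div>"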
+ ],
+ "text/plain": [
+ " text sentiment\n",
+ "0 Cooking microwave pizzas, yummy 1\n",
+ "1 I love the humor, I just reworded it. Like sa... 1\n",
+ "2 That sucks to hear. I hate days like that 0\n",
+ "3 Umm yeah. That`s probably a pretty good note ... 1\n",
+ "4 That would panic me a little! Maybe you can ... 0"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Load the raw texts and their sentiment labels, then combine them into a pandas DataFrame\n",
+ "text_data, label_data = load_data(\"test\")\n",
+ "dataset = pd.DataFrame({\"text\": text_data, \"sentiment\": label_data})\n",
+ "dataset.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Cleaning: 100%|██████████| 19583/19583 [00:06<00:00, 3243.25doc/s]\n",
+ "Lemmatization: 100%|██████████| 19583/19583 [01:08<00:00, 284.25doc/s]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
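+ "<div>\n",
+ "<style scoped>\n",
+ "    .dataframe tbody tr th:only-of-type {\n",
+ "        vertical-align: middle;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe tbody tr th {\n",
+ "        vertical-align: top;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe thead th {\n",
+ "        text-align: right;\n",
+ "    }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ "  <thead>\n",
+ "    <tr style=\"text-align: right;\">\n",
+ "      <th></th>\n",
+ "      <th>text</th>\n",
+ "      <th>sentiment</th>\n",
+ "      <th>tokens</th>\n",
+ "    </tr>\n",
+ "  </thead>\n",
+ "  <tbody>\n",
+ "    <tr>\n",
+ "      <th>0</th>\n",
+ "      <td>Cooking microwave pizzas, yummy</td>\n",
+ "      <td>1</td>\n",
+ "      <td>cook microwave pizza yummy</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>1</th>\n",
+ "      <td>I love the humor, I just reworded it. Like sa...</td>\n",
+ "      <td>1</td>\n",
+ "      <td>love humor reword like say group therapy inste...</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>2</th>\n",
+ "      <td>That sucks to hear. I hate days like that</td>\n",
+ "      <td>0</td>\n",
+ "      <td>suck hear hate day like</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>3</th>\n",
+ "      <td>Umm yeah. That`s probably a pretty good note ...</td>\n",
+ "      <td>1</td>\n",
+ "      <td>umm yeah probably pretty good note self eeeeee...</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>4</th>\n",
+ "      <td>That would panic me a little! Maybe you can ...</td>\n",
+ "      <td>0</td>\n",
+ "      <td>panic little maybe read orbitron gym like dowh...</td>\n",
+ "    </tr>\n",
+ "  </tbody>\n",
+ "</table>\n",
+ "</div>"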
+ ],
+ "text/plain": [
+ " text sentiment \\\n",
+ "0 Cooking microwave pizzas, yummy 1 \n",
+ "1 I love the humor, I just reworded it. Like sa... 1 \n",
+ "2 That sucks to hear. I hate days like that 0 \n",
+ "3 Umm yeah. That`s probably a pretty good note ... 1 \n",
+ "4 That would panic me a little! Maybe you can ... 0 \n",
+ "\n",
+ " tokens \n",
+ "0 cook microwave pizza yummy \n",
+ "1 love humor reword like say group therapy inste... \n",
+ "2 suck hear hate day like \n",
+ "3 umm yeah probably pretty good note self eeeeee... \n",
+ "4 panic little maybe read orbitron gym like dowh... "
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Clean and lemmatize the texts, then store each document's tokens as one space-joined string\n",
+ "tokens = tokenize(dataset[\"text\"].tolist(), batch_size=1024, n_jobs=2, show_progress=True)\n",
+ "dataset[\"tokens\"] = tokens.apply(\" \".join)\n",
+ "dataset.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Data exploration"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Sentiment distribution"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ "