diff --git "a/experiment.ipynb" "b/experiment.ipynb" new file mode 100644--- /dev/null +++ "b/experiment.ipynb" @@ -0,0 +1,507 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Sentiment Analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Imports, constants and setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from __future__ import annotations\n", + "\n", + "from collections import Counter\n", + "\n", + "import joblib\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "from tqdm.notebook import tqdm\n", + "from wordcloud import WordCloud\n", + "\n", + "from app.constants import CACHE_DIR\n", + "from app.data import load_data, tokenize" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Register tqdm with pandas so that Series.progress_apply (used by the word-cloud cells) exists\n", + "tqdm.pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Passed as random_state to every WordCloud built in this notebook\n", + "SEED = 42\n", + "# joblib memoization store rooted at CACHE_DIR; no cell visible in this notebook references it yet\n", + "CACHE = joblib.Memory(CACHE_DIR, verbose=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data loading" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textsentiment
0Cooking microwave pizzas, yummy1
1I love the humor, I just reworded it. Like sa...1
2That sucks to hear. I hate days like that0
3Umm yeah. That`s probably a pretty good note ...1
4That would panic me a little! Maybe you can ...0
\n", + "
" + ], + "text/plain": [ + " text sentiment\n", + "0 Cooking microwave pizzas, yummy 1\n", + "1 I love the humor, I just reworded it. Like sa... 1\n", + "2 That sucks to hear. I hate days like that 0\n", + "3 Umm yeah. That`s probably a pretty good note ... 1\n", + "4 That would panic me a little! Maybe you can ... 0" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load data and convert to pandas DataFrame\n", + "# NOTE(review): \"test\" is forwarded verbatim to load_data; presumably it names the dataset/split to read - confirm in app.data\n", + "text_data, label_data = load_data(\"test\")\n", + "# One row per document: the raw text and its label (the saved preview shows labels 0 and 1)\n", + "dataset = pd.DataFrame({\"text\": text_data, \"sentiment\": label_data})\n", + "dataset.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Cleaning: 100%|██████████| 19583/19583 [00:06<00:00, 3243.25doc/s]\n", + "Lemmatization: 100%|██████████| 19583/19583 [01:08<00:00, 284.25doc/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textsentimenttokens
0Cooking microwave pizzas, yummy1cook microwave pizza yummy
1I love the humor, I just reworded it. Like sa...1love humor reword like say group therapy inste...
2That sucks to hear. I hate days like that0suck hear hate day like
3Umm yeah. That`s probably a pretty good note ...1umm yeah probably pretty good note self eeeeee...
4That would panic me a little! Maybe you can ...0panic little maybe read orbitron gym like dowh...
\n", + "
" + ], + "text/plain": [ + " text sentiment \\\n", + "0 Cooking microwave pizzas, yummy 1 \n", + "1 I love the humor, I just reworded it. Like sa... 1 \n", + "2 That sucks to hear. I hate days like that 0 \n", + "3 Umm yeah. That`s probably a pretty good note ... 1 \n", + "4 That would panic me a little! Maybe you can ... 0 \n", + "\n", + " tokens \n", + "0 cook microwave pizza yummy \n", + "1 love humor reword like say group therapy inste... \n", + "2 suck hear hate day like \n", + "3 umm yeah probably pretty good note self eeeeee... \n", + "4 panic little maybe read orbitron gym like dowh... " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Tokenize text data\n", + "# The saved stderr shows two passes (\"Cleaning\", then \"Lemmatization\"); the second took about a minute for 19583 docs\n", + "tokens = tokenize(dataset[\"text\"].tolist(), batch_size=1024, n_jobs=2, show_progress=True)\n", + "# Join each document's tokens into one space-separated string and store it as a new \"tokens\" column\n", + "# NOTE(review): `.apply` implies tokenize returns a pandas Series aligned with dataset's index - confirm in app.data\n", + "dataset[\"tokens\"] = tokens.apply(\" \".join)\n", + "dataset.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data exploration" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sentiment distribution" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAi4AAAFzCAYAAAAZsoJrAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAAAtUUlEQVR4nO3de1RVdd7H8Q8XuXg5B68ghUKaF8o0tVGsbEwSi1paTGWSWpGWgaVmJjPeairU8pKXR3J68jKjj1ZTjqmRhKml5IXGvOStRtNSIFM4YQnI2c8fDXt5kkwRO/z0/VrrrOXZv+/+ne8+q50ff2effXwsy7IEAABgAF9vNwAAAHCuCC4AAMAYBBcAAGAMggsAADAGwQUAABiD4AIAAIxBcAEAAMYguAAAAGP4e7uBS4Xb7dbhw4dVp04d+fj4eLsdAACMYVmWfvjhB4WHh8vX9+xrKgSXKnL48GFFRER4uw0AAIx16NAhXXnllWetIbhUkTp16kj6+U13OBxe7gYAAHO4XC5FRETYf5eeDcGlipR/PORwOAguAABUwrlcasHFuQAAwBgEFwAAYAyCCwAAMAbBBQAAGIPgAgAAjEFwAQAAxiC4AAAAYxBcAACAMQguAADAGAQXAABgDIILAAAwBr9VhPMWOWqFt1tAFTswId7bLQDAOWHFBQAAGIPgAgAAjEFwAQAAxiC4AAAAYxBcAACAMQguAADAGAQXAABgDIILAAAwBsEFAAAYw6vBZd26dbrrrrsUHh4uHx8fLV261GPcsiyNHTtWjRs3VnBwsGJjY7Vv3z6PmmPHjikxMVEOh0MhISFKSkpSUVGRR822bdt08803KygoSBEREZo0adIZvbz11ltq1aqVgoKC1KZNG61cubLKjxcAAFwYrwaXEydOqG3btpo1a1aF45MmTdL06dOVnp6ujRs3qlatWoqLi9PJkyftmsTERO3cuVOZmZlavny51q1bp0GDBtnjLpdLPXr0UNOmTZWTk6OXX35Z48eP15w5c+yaDRs26IEHHlBSUpL+/e9/q3fv3urdu7d27Nhx8Q4eAACcNx/LsixvNyFJPj4+evfdd9W7d29JP6+2hIeH6+mnn9aIESMkSYWFhQoNDdW8efPUp08f7dq1S9HR0dq8ebM6duwoScrIyNAdd9yhb775RuHh4Zo9e7b+8pe/KDc3VwEBAZKkUaNGaenSpdq9e7ck6f7779eJEye0fPlyu5/OnTurXbt2Sk9PP6f+XS6XnE6nCgsL5XA4quptqZb4raJLD79VBMCbzufv0Gp7jcv+/fuVm5ur2NhYe5vT6VSnTp2UnZ0tScrOzlZISIgdWiQpNjZWvr6+2rhxo13TtWtXO7RIUlxcnPbs2aPjx4/bNae/TnlN+etUpLi4WC6Xy+MBAAAurmobXHJzcyVJoaGhHttDQ0PtsdzcXDVq1Mhj3N/fX/Xq1fOoqWiO01/j12rKxyuSlpYmp9NpPyIiIs73EAEAwHmqtsGluktNTVVhYaH9OHTokLdbAgDgkldtg0tYWJgkKS8vz2N7Xl6ePRYWFqb8/HyP8VOnTunYsWMeNRXNcfpr/FpN+XhFAgMD5XA4PB4AAODiqrbBJSoqSmFhYcrKyrK3uVwubdy4UTExMZKkmJgYFRQUKCcnx65ZvXq13G63OnXqZNesW7dOpaWldk1mZqZatmypunXr2jWnv055TfnrAACA6sGrwaWoqEhbt27V1q1bJf18Qe7WrVt18OBB+fj4aOjQoXrhhRe0bNkybd++Xf3791d4eLj9zaPWrVurZ8+eGjhwoDZt2qT169crJSVFffr0UXh4uCSpb9++CggIUFJSknbu3KklS5bo1Vdf1fDhw+0+nnrqKWVkZGjy5MnavXu3xo8fry1btiglJeX3fksAAMBZ+Hvzxbds2aJu3brZz8vDxIABAzRv3jyNHDlSJ06c0KBBg1RQUKCbbrpJGRkZCgoKsvdZuHChUlJS1L17d/n6+iohIUH
Tp0+3x51Op1atWqXk5GR16NBBDRo00NixYz3u9dKlSxctWrRIo0eP1p///GddffXVWrp0qa699trf4V0AAADnqtrcx8V03McFJuM+LgC86ZK4jwsAAMAvEVwAAIAxCC4AAMAYBBcAAGAMggsAADAGwQUAABiD4AIAAIxBcAEAAMYguAAAAGMQXAAAgDEILgAAwBgEFwAAYAyCCwAAMAbBBQAAGIPgAgAAjEFwAQAAxiC4AAAAYxBcAACAMfy93QAAoGpFjlrh7RZQhQ5MiPd2C9UKKy4AAMAYBBcAAGAMggsAADAGwQUAABiD4AIAAIxBcAEAAMYguAAAAGMQXAAAgDEILgAAwBgEFwAAYAyCCwAAMAbBBQAAGIPgAgAAjEFwAQAAxiC4AAAAYxBcAACAMQguAADAGAQXAABgDIILAAAwBsEFAAAYg+ACAACMQXABAADGILgAAABjEFwAAIAxCC4AAMAYBBcAAGCMah1cysrKNGbMGEVFRSk4OFjNmjXTX//6V1mWZddYlqWxY8eqcePGCg4OVmxsrPbt2+cxz7Fjx5SYmCiHw6GQkBAlJSWpqKjIo2bbtm26+eabFRQUpIiICE2aNOl3OUYAAHDuqnVwmThxombPnq2ZM2dq165dmjhxoiZNmqQZM2bYNZMmTdL06dOVnp6ujRs3qlatWoqLi9PJkyftmsTERO3cuVOZmZlavny51q1bp0GDBtnjLpdLPXr0UNOmTZWTk6OXX35Z48eP15w5c37X4wUAAGfn7+0GzmbDhg3q1auX4uPjJUmRkZH6v//7P23atEnSz6st06ZN0+jRo9WrVy9J0oIFCxQaGqqlS5eqT58+2rVrlzIyMrR582Z17NhRkjRjxgzdcccdeuWVVxQeHq6FCxeqpKREb7zxhgICAnTNNddo69atmjJlikfAAQAA3lWtV1y6dOmirKws7d27V5L0+eef65NPPtHtt98uSdq/f79yc3MVGxtr7+N0OtWpUydlZ2dLkrKzsxUSEmKHFkmKjY2Vr6+vNm7caNd07dpVAQEBdk1cXJz27Nmj48ePV9hbcXGxXC6XxwMAAFxc1XrFZdSoUXK5XGrVqpX8/PxUVlamF198UYmJiZKk3NxcSVJoaKjHfqGhofZYbm6uGjVq5DHu7++vevXqedRERUWdMUf5WN26dc/oLS0tTc8991wVHCUAADhX1XrF5c0339TChQu1aNEiffbZZ5o/f75eeeUVzZ8/39utKTU1VYWFhfbj0KFD3m4JAIBLXrVecXnmmWc0atQo9enTR5LUpk0bff3110pLS9OAAQMUFhYmScrLy1Pjxo3t/fLy8tSuXTtJUlhYmPLz8z3mPXXqlI4dO2bvHxYWpry8PI+a8uflNb8UGBiowMDACz9IAABwzqr1isuPP/4oX1/PFv38/OR2uyVJUVFRCgsLU1ZWlj3ucrm0ceNGxcTESJJiYmJUUFCgnJwcu2b16tVyu93q1KmTXbNu3TqVlpbaNZmZmWrZsmWFHxMBAADvqNbB5a677tKLL76oFStW6MCBA3r33Xc1ZcoU3X333ZIkHx8fDR06VC+88IKWLVum7du3q3///goPD1fv3r0lSa1bt1bPnj01cOBAbdq0SevXr1dKSor69Omj8PBwSVLfvn0VEBCgpKQk7dy5U0uWLNGrr76q4cOHe+vQAQBABar1R0UzZszQmDFj9MQTTyg/P1/h4eF67LHHNHbsWLtm5MiROnHihAYNGqSCggLddNNNysjIUFBQkF2zcOFCpaSkqHv37vL19VVCQoKmT59ujzudTq1atUrJycnq0KGDGjRooLFjx/JVaAAAqhkf6/Tb0KLSXC6XnE6nCgsL5XA4vN3ORRU5aoW3W0AVOzAh3tstoApxjl5aLofz83z+Dq3WHxUBAACcjuACAACMQXABAADGILgAAABjEFwAAIAxCC4AAMAYBBcAAGAMggsAADAGwQUAABiD4AIAAIxBcAEAAMYguAAAAGM
QXAAAgDEILgAAwBgEFwAAYAyCCwAAMAbBBQAAGIPgAgAAjEFwAQAAxiC4AAAAYxBcAACAMQguAADAGAQXAABgDIILAAAwBsEFAAAYg+ACAACMQXABAADGILgAAABjEFwAAIAxCC4AAMAYBBcAAGAMggsAADAGwQUAABiD4AIAAIxBcAEAAMYguAAAAGMQXAAAgDEILgAAwBgEFwAAYAyCCwAAMAbBBQAAGIPgAgAAjEFwAQAAxiC4AAAAYxBcAACAMSoVXK666ip9//33Z2wvKCjQVVdddcFNne7bb7/Vgw8+qPr16ys4OFht2rTRli1b7HHLsjR27Fg1btxYwcHBio2N1b59+zzmOHbsmBITE+VwOBQSEqKkpCQVFRV51Gzbtk0333yzgoKCFBERoUmTJlXpcQAAgAtXqeBy4MABlZWVnbG9uLhY33777QU3Ve748eO68cYbVaNGDb3//vv64osvNHnyZNWtW9eumTRpkqZPn6709HRt3LhRtWrVUlxcnE6ePGnXJCYmaufOncrMzNTy5cu1bt06DRo0yB53uVzq0aOHmjZtqpycHL388ssaP3685syZU2XHAgAALpz/+RQvW7bM/vMHH3wgp9NpPy8rK1NWVpYiIyOrrLmJEycqIiJCc+fOtbdFRUXZf7YsS9OmTdPo0aPVq1cvSdKCBQsUGhqqpUuXqk+fPtq1a5cyMjK0efNmdezYUZI0Y8YM3XHHHXrllVcUHh6uhQsXqqSkRG+88YYCAgJ0zTXXaOvWrZoyZYpHwAEAAN51XsGld+/ekiQfHx8NGDDAY6xGjRqKjIzU5MmTq6y5ZcuWKS4uTvfee6/Wrl2rK664Qk888YQGDhwoSdq/f79yc3MVGxtr7+N0OtWpUydlZ2erT58+ys7OVkhIiB1aJCk2Nla+vr7auHGj7r77bmVnZ6tr164KCAiwa+Li4jRx4kQdP37cY4WnXHFxsYqLi+3nLperyo4bAABU7Lw+KnK73XK73WrSpIny8/Pt5263W8XFxdqzZ4/uvPPOKmvuP//5j2bPnq2rr75aH3zwgQYPHqwnn3xS8+fPlyTl5uZKkkJDQz32Cw0Ntcdyc3PVqFEjj3F/f3/Vq1fPo6aiOU5/jV9KS0uT0+m0HxERERd4tAAA4LdU6hqX/fv3q0GDBlXdyxncbrfat2+vl156Sddff70GDRqkgQMHKj09/aK/9m9JTU1VYWGh/Th06JC3WwIA4JJ3Xh8VnS4rK0tZWVn2ysvp3njjjQtuTJIaN26s6Ohoj22tW7fWP//5T0lSWFiYJCkvL0+NGze2a/Ly8tSuXTu7Jj8/32OOU6dO6dixY/b+YWFhysvL86gpf15e80uBgYEKDAys5JEBAIDKqNSKy3PPPacePXooKytLR48e1fHjxz0eVeXGG2/Unj17PLbt3btXTZs2lfTzhbphYWHKysqyx10ulzZu3KiYmBhJUkxMjAoKCpSTk2PXrF69Wm63W506dbJr1q1bp9LSUrsmMzNTLVu2rPD6FgAA4B2VWnFJT0/XvHnz1K9fv6rux8OwYcPUpUsXvfTSS7rvvvu0adMmzZkzx/6aso+Pj4YOHaoXXnhBV199taKiojRmzBiFh4fbFxK3bt1aPXv2tD9iKi0tVUpKivr06aPw8HBJUt++ffXcc88pKSlJzz77rHbs2KFXX31VU6dOvajHBwAAzk+lgktJSYm6dOlS1b2c4YYbbtC7776r1NRUPf/884qKitK0adOUmJho14wcOVInTpzQoEGDVFBQoJtuukkZGRkKCgqyaxYuXKiUlBR1795dvr6+SkhI0PTp0+1xp9OpVatWKTk5WR06dFCDBg00duxYvgoNAEA142NZlnW+Oz377LOqXbu2xowZczF6MpLL5ZLT6VRhYaEcDoe327moIket8HYLqGIHJsR7uwVUIc7RS8vlcH6ez9+hlVpxOXnypObMmaMPP/xQ1113nWrUqOExPmXKlMpMCwAAcFaVCi7
btm2zv7WzY8cOjzEfH58LbgoAAKAilQouH330UVX3AQAA8Jsq9XVoAAAAb6jUiku3bt3O+pHQ6tWrK90QAADAr6lUcCm/vqVcaWmptm7dqh07dpzx44sAAABVpVLB5dduzDZ+/HgVFRVdUEMAAAC/pkqvcXnwwQer7HeKAAAAfqlKg0t2drbHHWsBAACqUqU+Krrnnns8nluWpSNHjmjLli3cTRcAAFw0lQouTqfT47mvr69atmyp559/Xj169KiSxgAAAH6pUsFl7ty5Vd0HAADAb6pUcCmXk5OjXbt2SZKuueYaXX/99VXSFAAAQEUqFVzy8/PVp08frVmzRiEhIZKkgoICdevWTYsXL1bDhg2rskcAAABJlfxW0ZAhQ/TDDz9o586dOnbsmI4dO6YdO3bI5XLpySefrOoeAQAAJFVyxSUjI0MffvihWrdubW+Ljo7WrFmzuDgXAABcNJVacXG73apRo8YZ22vUqCG3233BTQEAAFSkUsHl1ltv1VNPPaXDhw/b27799lsNGzZM3bt3r7LmAAAATlep4DJz5ky5XC5FRkaqWbNmatasmaKiouRyuTRjxoyq7hEAAEBSJa9xiYiI0GeffaYPP/xQu3fvliS1bt1asbGxVdocAADA6c5rxWX16tWKjo6Wy+WSj4+PbrvtNg0ZMkRDhgzRDTfcoGuuuUYff/zxxeoVAABc5s4ruEybNk0DBw6Uw+E4Y8zpdOqxxx7TlClTqqw5AACA051XcPn888/Vs2fPXx3v0aOHcnJyLrgpAACAipxXcMnLy6vwa9Dl/P399d13311wUwAAABU5r+ByxRVXaMeOHb86vm3bNjVu3PiCmwIAAKjIeQWXO+64Q2PGjNHJkyfPGPvpp580btw43XnnnVXWHAAAwOnO6+vQo0eP1jvvvKMWLVooJSVFLVu2lCTt3r1bs2bNUllZmf7yl79clEYBAADOK7iEhoZqw4YNGjx4sFJTU2VZliTJx8dHcXFxmjVrlkJDQy9KowAAAOd9A7qmTZtq5cqVOn78uL788ktZlqWrr75adevWvRj9AQAA2Cp151xJqlu3rm644Yaq7AUAAOCsKvVbRQAAAN5AcAEAAMYguAAAAGMQXAAAgDEILgAAwBgEFwAAYAyCCwAAMAbBBQAAGIPgAgAAjEFwAQAAxiC4AAAAYxBcAACAMQguAADAGAQXAABgDIILAAAwhlHBZcKECfLx8dHQoUPtbSdPnlRycrLq16+v2rVrKyEhQXl5eR77HTx4UPHx8apZs6YaNWqkZ555RqdOnfKoWbNmjdq3b6/AwEA1b95c8+bN+x2OCAAAnA9jgsvmzZv12muv6brrrvPYPmzYML333nt66623tHbtWh0+fFj33HOPPV5WVqb4+HiVlJRow4YNmj9/vubNm6exY8faNfv371d8fLy6deumrVu3aujQoXr00Uf1wQcf/G7HBwAAfpsRwaWoqEiJiYn629/+prp169rbCwsL9b//+7+aMmWKbr31VnXo0EFz587Vhg0b9Omnn0qSVq1apS+++EL/+Mc/1K5dO91+++3661//qlmzZqmkpESSlJ6erqioKE2ePFmtW7dWSkqK/vSnP2nq1KleOV4AAFAxI4JLcnKy4uPjFRsb67E9JydHpaWlHttbtWqlJk2aKDs7W5KUnZ2tNm3aKDQ01K6Ji4uTy+XSzp077Zpfzh0XF2fPUZHi4mK5XC6PBwAAuLj8vd3Ab1m8eLE+++wzbd68+Yyx3NxcBQQEKCQkxGN7aGiocnNz7ZrTQ0v5ePnY2WpcLpd++uknBQcHn/HaaWlpeu655yp9XAAA4PxV6xWXQ4cO6amnntLChQsVFBTk7XY8pKamqrCw0H4cOnTI2y0BAHDJq9bBJScnR/n5+Wrfvr38/f3l7++vtWvXavr06fL391doaKhKSkpUUFDgsV9eXp7CwsIkSWFhYWd8y6j8+W/VOByOCldbJCkwMFAOh8PjAQAALq5qHVy
6d++u7du3a+vWrfajY8eOSkxMtP9co0YNZWVl2fvs2bNHBw8eVExMjCQpJiZG27dvV35+vl2TmZkph8Oh6Ohou+b0OcpryucAAADVQ7W+xqVOnTq69tprPbbVqlVL9evXt7cnJSVp+PDhqlevnhwOh4YMGaKYmBh17txZktSjRw9FR0erX79+mjRpknJzczV69GglJycrMDBQkvT4449r5syZGjlypB555BGtXr1ab775plasWPH7HjAAADirah1czsXUqVPl6+urhIQEFRcXKy4uTv/zP/9jj/v5+Wn58uUaPHiwYmJiVKtWLQ0YMEDPP/+8XRMVFaUVK1Zo2LBhevXVV3XllVfq9ddfV1xcnDcOCQAA/Aofy7IsbzdxKXC5XHI6nSosLLzkr3eJHMVK1KXmwIR4b7eAKsQ5emm5HM7P8/k7tFpf4wIAAHA6ggsAADAGwQUAABiD4AIAAIxBcAEAAMYguAAAAGMQXAAAgDEILgAAwBgEFwAAYAyCCwAAMAbBBQAAGIPgAgAAjEFwAQAAxiC4AAAAYxBcAACAMQguAADAGAQXAABgDIILAAAwBsEFAAAYg+ACAACMQXABAADGILgAAABjEFwAAIAxCC4AAMAYBBcAAGAMggsAADAGwQUAABiD4AIAAIxBcAEAAMYguAAAAGMQXAAAgDEILgAAwBgEFwAAYAyCCwAAMAbBBQAAGIPgAgAAjEFwAQAAxiC4AAAAYxBcAACAMQguAADAGAQXAABgDIILAAAwBsEFAAAYg+ACAACMQXABAADGqNbBJS0tTTfccIPq1KmjRo0aqXfv3tqzZ49HzcmTJ5WcnKz69eurdu3aSkhIUF5enkfNwYMHFR8fr5o1a6pRo0Z65plndOrUKY+aNWvWqH379goMDFTz5s01b968i314AADgPFXr4LJ27VolJyfr008/VWZmpkpLS9WjRw+dOHHCrhk2bJjee+89vfXWW1q7dq0OHz6se+65xx4vKytTfHy8SkpKtGHDBs2fP1/z5s3T2LFj7Zr9+/crPj5e3bp109atWzV06FA9+uij+uCDD37X4wUAAGfnY1mW5e0mztV3332nRo0aae3ateratasKCwvVsGFDLVq0SH/6058kSbt371br1q2VnZ2tzp076/3339edd96pw4cPKzQ0VJKUnp6uZ599Vt99950CAgL07LPPasWKFdqxY4f9Wn369FFBQYEyMjLOqTeXyyWn06nCwkI5HI6qP/hqJHLUCm+3gCp2YEK8t1tAFeIcvbRcDufn+fwdWq1XXH6psLBQklSvXj1JUk5OjkpLSxUbG2vXtGrVSk2aNFF2drYkKTs7W23atLFDiyTFxcXJ5XJp586dds3pc5TXlM9RkeLiYrlcLo8HAAC4uIwJLm63W0OHDtWNN96oa6+9VpKUm5urgIAAhYSEeNSGhoYqNzfXrjk9tJSPl4+drcblcumnn36qsJ+0tDQ5nU77ERERccHHCAAAzs6Y4JKcnKwdO3Zo8eLF3m5FkpSamqrCwkL7cejQIW+3BADAJc/f2w2ci5SUFC1fvlzr1q3TlVdeaW8PCwtTSUmJCgoKPFZd8vLyFBYWZtds2rTJY77ybx2dXvPLbyLl5eXJ4XAoODi4wp4CAwMVGBh4wccGAADOXbVecbEsSykpKXr33Xe1evVqRUVFeYx36NBBNWrUUFZWlr1tz549OnjwoGJiYiRJMTEx2r59u/Lz8+2azMxMORwORUdH2zWnz1FeUz4HAACoHqr1iktycrIWLVqkf/3rX6pTp459TYrT6VRwcLCcTqeSkpI0fPhw1atXTw6HQ0OGDFFMTIw6d+4sSerRo4eio6PVr18/TZo0Sbm5uRo9erSSk5PtFZPHH39cM2fO1MiRI/XII49o9erVevPNN7ViBVfmAwBQnVTrFZfZs2ersLBQf/zjH9W4cWP7sWTJErtm6tSpuvPOO5WQkKCuXbsqLCxM77zzjj3u5+en5cuXy8/PTzE
xMXrwwQfVv39/Pf/883ZNVFSUVqxYoczMTLVt21aTJ0/W66+/rri4uN/1eAEAwNkZdR+X6oz7uMBkl8N9Ii4nnKOXlsvh/Lxk7+MCAAAubwQXAABgDIILAAAwBsEFAAAYg+ACAACMQXABAADGILgAAABjEFwAAIAxCC4AAMAYBBcAAGAMggsAADAGwQUAABiD4AIAAIxBcAEAAMYguAAAAGMQXAAAgDEILgAAwBgEFwAAYAyCCwAAMAbBBQAAGIPgAgAAjEFwAQAAxiC4AAAAYxBcAACAMQguAADAGAQXAABgDIILAAAwBsEFAAAYg+ACAACMQXABAADGILgAAABjEFwAAIAxCC4AAMAYBBcAAGAMggsAADAGwQUAABiD4AIAAIxBcAEAAMYguAAAAGMQXAAAgDEILgAAwBgEFwAAYAyCCwAAMAbBBQAAGIPgAgAAjEFw+YVZs2YpMjJSQUFB6tSpkzZt2uTtlgAAwH8RXE6zZMkSDR8+XOPGjdNnn32mtm3bKi4uTvn5+d5uDQAAiODiYcqUKRo4cKAefvhhRUdHKz09XTVr1tQbb7zh7dYAAIAkf283UF2UlJQoJydHqamp9jZfX1/FxsYqOzv7jPri4mIVFxfbzwsLCyVJLpfr4jfrZe7iH73dAqrY5fDf7eWEc/TScjmcn+XHaFnWb9YSXP7r6NGjKisrU2hoqMf20NBQ7d69+4z6tLQ0Pffcc2dsj4iIuGg9AheLc5q3OwDway6n8/OHH36Q0+k8aw3BpZJSU1M1fPhw+7nb7daxY8dUv359+fj4eLEzVAWXy6WIiAgdOnRIDofD2+0A+AXO0UuLZVn64YcfFB4e/pu1BJf/atCggfz8/JSXl+exPS8vT2FhYWfUBwYGKjAw0GNbSEjIxWwRXuBwOPifIlCNcY5eOn5rpaUcF+f+V0BAgDp06KCsrCx7m9vtVlZWlmJiYrzYGQAAKMeKy2mGDx+uAQMGqGPHjvrDH/6gadOm6cSJE3r44Ye93RoAABDBxcP999+v7777TmPHjlVubq7atWunjIyMMy7YxaUvMDBQ48aNO+PjQADVA+fo5cvHOpfvHgEAAFQDXOMCAACMQXABAADGILgAAABjEFyAKhIZGalp06Z5uw3gkrZmzRr5+PiooKDgrHWcj5cugguM8NBDD8nHx0cTJkzw2L506dLf/U7F8+bNq/Bmg5s3b9agQYN+116A6qr8nPXx8VFAQICaN2+u559/XqdOnbqgebt06aIjR47YNyvjfLz8EFxgjKCgIE2cOFHHjx/3disVatiwoWrWrOntNoBqo2fPnjpy5Ij27dunp59+WuPHj9fLL798QXMGBAQoLCzsN//Bwvl46SK4wBixsbEKCwtTWlrar9Z88sknuvnmmxUcHKyIiAg9+eSTOnHihD1+5MgRxcfHKzg4WFFRUVq0aNEZS8pTpkxRmzZtVKtWLUVEROiJJ55QUVGRpJ+XqR9++GEVFhba/5ocP368JM+l6b59++r+++/36K20tFQNGjTQggULJP18Z+a0tDRFRUUpODhYbdu21dtvv10F7xRQPQQGBiosLExNmzbV4MGDFRsbq2XLlun48ePq37+/6tatq5o1a+r222/Xvn377P2+/vpr3XXXXapbt65q1aqla665RitXrpTk+VER5+PlieACY/j5+emll17SjBkz9M0335wx/tVXX6lnz55KSEjQtm3btGTJEn3yySdKSUmxa/r376/Dhw9rzZo1+uc//6k5c+YoPz/fYx5fX19Nnz5dO3fu1Pz587V69WqNHDlS0s/L1NOmTZPD4dCRI0d05MgRjRgx4oxeEhMT9d5779mBR5I++OAD/fjjj7r77rsl/fwL4wsWLFB6erp27typYcOG6cEHH9TatWur5P0Cqpvg4GCVlJTooYce0pYtW7Rs2TJlZ2fLsizdcccdKi0tlSQlJyeruLhY69at0/bt2zVx4kTVrl37jPk4Hy9TFmCAAQMGWL1
69bIsy7I6d+5sPfLII5ZlWda7775rlf9nnJSUZA0aNMhjv48//tjy9fW1fvrpJ2vXrl2WJGvz5s32+L59+yxJ1tSpU3/1td966y2rfv369vO5c+daTqfzjLqmTZva85SWlloNGjSwFixYYI8/8MAD1v33329ZlmWdPHnSqlmzprVhwwaPOZKSkqwHHnjg7G8GYIDTz1m3221lZmZagYGBVu/evS1J1vr16+3ao0ePWsHBwdabb75pWZZltWnTxho/fnyF83700UeWJOv48eOWZXE+Xo645T+MM3HiRN16661n/Mvq888/17Zt27Rw4UJ7m2VZcrvd2r9/v/bu3St/f3+1b9/eHm/evLnq1q3rMc+HH36otLQ07d69Wy6XS6dOndLJkyf1448/nvNn5v7+/rrvvvu0cOFC9evXTydOnNC//vUvLV68WJL05Zdf6scff9Rtt93msV9JSYmuv/7683o/gOpq+fLlql27tkpLS+V2u9W3b1/dc889Wr58uTp16mTX1a9fXy1bttSuXbskSU8++aQGDx6sVatWKTY2VgkJCbruuusq3Qfn46WF4ALjdO3aVXFxcUpNTdVDDz1kby8qKtJjjz2mJ5988ox9mjRpor179/7m3AcOHNCdd96pwYMH68UXX1S9evX0ySefKCkpSSUlJed1sV9iYqJuueUW5efnKzMzU8HBwerZs6fdqyStWLFCV1xxhcd+/PYKLhXdunXT7NmzFRAQoPDwcPn7+2vZsmW/ud+jjz6quLg4rVixQqtWrVJaWpomT56sIUOGVLoXzsdLB8EFRpowYYLatWunli1b2tvat2+vL774Qs2bN69wn5YtW+rUqVP697//rQ4dOkj6+V9ap39LKScnR263W5MnT5av78+XgL355pse8wQEBKisrOw3e+zSpYsiIiK0ZMkSvf/++7r33ntVo0YNSVJ0dLQCAwN18OBB3XLLLed38IAhatWqdcb52Lp1a506dUobN25Uly5dJEnff/+99uzZo+joaLsuIiJCjz/+uB5//HGlpqbqb3/7W4XBhfPx8kNwgZHatGmjxMRETZ8+3d727LPPqnPnzkpJSdGjjz6qWrVq6YsvvlBmZqZmzpypVq1aKTY2VoMGDdLs2bNVo0YNPf300woODra/Wtm8eXOVlpZqxowZuuuuu7R+/Xqlp6d7vHZkZKSKioqUlZWltm3bqmbNmr+6EtO3b1+lp6dr7969+uijj+ztderU0YgRIzRs2DC53W7ddNNNKiws1Pr16+VwODRgwICL8K4B3nf11VerV69eGjhwoF577TXVqVNHo0aN0hVXXKFevXpJkoYOHarbb79dLVq00PHjx/XRRx+pdevWFc7H+XgZ8vZFNsC5OP1Cv3L79++3AgICrNP/M960aZN12223WbVr17Zq1aplXXfdddaLL75ojx8+fNi6/fbbrcDAQKtp06bWokWLrEaNGlnp6el2zZQpU6zGjRtbwcHBVlxcnLVgwQKPiwEty7Ief/xxq379+pYka9y4cZZleV4MWO6LL76wJFlNmza13G63x5jb7bamTZtmtWzZ0qpRo4bVsGFDKy4uzlq7du2FvVlANVDROVvu2LFjVr9+/Syn02mfZ3v37rXHU1JSrGbNmlmBgYFWw4YNrX79+llHjx61LOvMi3Mti/PxcuNjWZblxdwEeNU333yjiIgIffjhh+revbu32wEA/AaCCy4rq1evVlFRkdq0aaMjR45o5MiR+vbbb7V37177824AQPXFNS64rJSWlurPf/6z/vOf/6hOnTrq0qWLFi5cSGgBAEOw4gIAAIzBLf8BAIAxCC4AAMAYBBcAAGAMggsAADAGwQXAJWvNmjXy8fFRQUGBt1sBUEUILgAuuu+++06DBw9WkyZNFBgYqLCwMMXFxWn9+vVV9hp//OMfNXToUI9tXbp00ZEjR+R0OqvsdSrroYceUu/evb3dBmA87uMC4KJLSEhQSUmJ5s+fr6uuukp5eXn
KysrS999/f1FfNyAgQGFhYRf1NQD8zrz5ewMALn3Hjx+3JFlr1qw5a01SUpLVoEEDq06dOla3bt2srVu32uPjxo2z2rZtay1YsMBq2rSp5XA4rPvvv99yuVyWZf38uziSPB779+8/43dt5s6dazmdTuu9996zWrRoYQUHB1sJCQnWiRMnrHnz5llNmza1QkJCrCFDhlinTp2yX//kyZPW008/bYWHh1s1a9a0/vCHP1gfffSRPV4+b0ZGhtWqVSurVq1aVlxcnHX48GG7/1/2d/r+AM4dHxUBuKhq166t2rVra+nSpSouLq6w5t5771V+fr7ef/995eTkqH379urevbuOHTtm13z11VdaunSpli9fruXLl2vt2rWaMGGCJOnVV19VTEyMBg4cqCNHjujIkSOKiIio8LV+/PFHTZ8+XYsXL1ZGRobWrFmju+++WytXrtTKlSv197//Xa+99prefvtte5+UlBRlZ2dr8eLF2rZtm+6991717NlT+/bt85j3lVde0d///netW7dOBw8e1IgRIyRJI0aM0H333aeePXva/XXp0uWC31vgsuTt5ATg0vf2229bdevWtYKCgqwuXbpYqamp1ueff25ZlmV9/PHHlsPhsE6ePOmxT7NmzazXXnvNsqyfVyxq1qxpr7BYlmU988wzVqdOneznt9xyi/XUU095zFHRiosk68svv7RrHnvsMatmzZrWDz/8YG+Li4uzHnvsMcuyLOvrr7+2/Pz8rG+//dZj7u7du1upqam/Ou+sWbOs0NBQ+/nZfi0ZwLnjGhcAF11CQoLi4+P18ccf69NPP9X777+vSZMm6fXXX9eJEydUVFSk+vXre+zz008/6auvvrKfR0ZGqk6dOvbzxo0bKz8//7x7qVmzppo1a2Y/Dw0NVWRkpGrXru2xrXzu7du3q6ysTC1atPCYp7i42KPnX85b2f4AnB3BBcDvIigoSLfddptuu+02jRkzRo8++qjGjRunJ554Qo0bN9aaNWvO2CckJMT+8y9/CNPHx0dut/u8+6honrPNXVRUJD8/P+Xk5MjPz8+j7vSwU9EcFj8FB1Q5ggsAr4iOjtbSpUvVvn175ebmyt/fX5GRkZWeLyAgQGVlZVXX4H9df/31KisrU35+vm6++eZKz3Ox+gMuN1ycC+Ci+v7773XrrbfqH//4h7Zt26b9+/frrbfe0qRJk9SrVy/FxsYqJiZGvXv31qpVq3TgwAFt2LBBf/nLX7Rly5Zzfp3IyEht3LhRBw4c0NGjRyu1GlORFi1aKDExUf3799c777yj/fv3a9OmTUpLS9OKFSvOq79t27Zpz549Onr0qEpLS6ukP+ByQ3ABcFHVrl1bnTp10tSpU9W1a1dde+21GjNmjAYOHKiZM2fKx8dHK1euVNeuXfXwww+rRYsW6tOnj77++muFhoae8+uMGDFCfn5+io6OVsOGDXXw4MEqO4a5c+eqf//+evrpp9WyZUv17t1bmzdvVpMmTc55joEDB6ply5bq2LGjGjZsWKU33wMuJz4WH8ICAABDsOICAACMQXABAADGILgAAABjEFwAAIAxCC4AAMAYBBcAAGAMggsAADAGwQUAABiD4AIAAIxBcAEAAMYguAAAAGMQXAAAgDH+H3RDbpqsrEWNAAAAAElFTkSuQmCC", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# value_counts() orders rows by descending frequency, not by label, so sort by label\n", + "# and derive the tick names from the index; otherwise the bars can end up mislabelled.\n", + "sentiment_counts = dataset[\"sentiment\"].value_counts().sort_index()\n", + "sentiment_names = {0: \"Negative\", 1: \"Positive\"}\n", + "\n", + "_, ax = plt.subplots(figsize=(6, 4))\n", + "\n", + "sentiment_counts.plot(kind=\"bar\", ax=ax)\n", + "# Fall back to the raw label if an unexpected class value shows up\n", + "ax.set_xticklabels([sentiment_names.get(label, str(label)) for label in sentiment_counts.index], rotation=0)\n", + "ax.set_xlabel(\"Sentiment\")\n", + "ax.set_ylabel(\"Count\")\n", + "ax.set_title(\"Sentiment distribution\")\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Word cloud (before tokenization)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a885e681eaf14751b11088566e643a3e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/19583 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Count how often each lower-cased, whitespace-separated word occurs in the raw text\n", + "# (Counter.update is applied once per document; the Series of None it returns is discarded)\n", + "word_freq = Counter()\n", + "dataset[\"text\"].str.lower().str.split().progress_apply(word_freq.update)\n", + "\n", + "# Now get the most common words\n", + "common_words = word_freq.most_common(100)\n", + "\n", + "# Create a word cloud of the most common words\n", + "wrd_cloud = WordCloud(width=800, height=400, random_state=SEED).generate_from_frequencies(dict(common_words))\n", + "\n", + "# Display the word cloud\n", + "plt.figure(figsize=(20, 20))\n", + "plt.imshow(wrd_cloud, interpolation=\"bilinear\")\n", + "plt.axis(\"off\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Word cloud (after tokenization)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "04f9a50519654e7188f59c62645572ff", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/19583 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Gather all the unique tokens 
in the dataset\n", + "token_freq = Counter()\n", + "tokens_per_doc = dataset[\"tokens\"].str.split()\n", + "tokens_per_doc.progress_apply(token_freq.update)\n", + "\n", + "# Keep the 100 most frequent tokens as (token, count) pairs\n", + "common_tokens = token_freq.most_common(100)\n", + "\n", + "# Build the word cloud from those frequencies, seeded via SEED\n", + "cloud_maker = WordCloud(width=800, height=400, random_state=SEED)\n", + "tkn_cloud = cloud_maker.generate_from_frequencies(dict(common_tokens))\n", + "\n", + "# Render the cloud on a large canvas with the axes hidden\n", + "plt.figure(figsize=(20, 20))\n", + "plt.imshow(tkn_cloud, interpolation=\"bilinear\")\n", + "plt.axis(\"off\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Token association" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "283ee0b586574489bf14a8ef0105ef78", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/9105 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# One panel per sentiment class; the list position doubles as the label value (0, then 1)\n", + "_, ax = plt.subplots(2, 1, figsize=(20, 20))\n", + "\n", + "for i, (panel, sentiment) in enumerate(zip(ax, [\"Negative\", \"Positive\"])):\n", + "    # Token frequencies restricted to the documents carrying this label\n", + "    class_tokens = dataset.loc[dataset[\"sentiment\"] == i, \"tokens\"]\n", + "    freq = Counter()\n", + "    class_tokens.str.split().progress_apply(freq.update)\n", + "    most_common = freq.most_common(100)\n", + "\n", + "    cloud = WordCloud(width=800, height=400, random_state=SEED).generate_from_frequencies(dict(most_common))\n", + "    panel.imshow(cloud, interpolation=\"bilinear\")\n", + "    panel.axis(\"off\")\n", + "    panel.set_title(sentiment)\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Vectorization" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Classification" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluation" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": 
"python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}