Completely change the structure of the project
Files changed:
- .vscode/settings.json +1 -0
- README.md +4 -0
- app/__main__.py +6 -0
- app/cli.py +144 -0
- app/constants.py +27 -11
- app/gui.py +39 -73
- app/model.py +273 -113
- app/utils.py +0 -164
- deprecated/__init__.py +0 -0
- deprecated/main.py +0 -44
- deprecated/train.py +0 -152
- justfile +4 -6
- notebook.ipynb +152 -0
- poetry.lock +114 -1
- pyproject.toml +2 -1
.vscode/settings.json
CHANGED
@@ -23,5 +23,6 @@
     "**/__pycache__": true,
     "**/.ruff_cache": true,
     "**/.venv": true,
+    "**/.cache": true,
   }
 }
README.md
CHANGED
@@ -7,6 +7,10 @@ Sentiment Analysis
 3. Run `just install` to install the dependencies
 4. Run `just run --help` to see the available commands
 
+### Datasets
+- [Sentiment140](https://www.kaggle.com/datasets/kazanova/sentiment140)
+- [IMDb](https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews)
+- [Amazon Reviews](https://www.kaggle.com/datasets/bittlingmayer/amazonreviews)
 
 ### TODO
 - [ ] CLI using `click` (commands: predict, train, evaluate) with settings set via flags or environment variables
app/__main__.py
ADDED
@@ -0,0 +1,6 @@
+from __future__ import annotations
+
+from app.cli import cli_wrapper as cli
+
+if __name__ == "__main__":
+    cli()
app/cli.py
ADDED
@@ -0,0 +1,144 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Literal
+
+import click
+
+__all__ = ["cli_wrapper"]
+
+ERROR_STR = click.style("ERROR", fg="red")
+DONE_STR = click.style("DONE", fg="green")
+POSITIVE_STR = click.style("POSITIVE", fg="green")
+NEUTRAL_STR = click.style("NEUTRAL", fg="yellow")
+NEGATIVE_STR = click.style("NEGATIVE", fg="red")
+
+
+@click.group()
+def cli() -> None: ...
+
+
+@cli.command()
+@click.option(
+    "--model",
+    "model_path",
+    required=True,
+    help="Path to the trained model",
+    type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True, path_type=Path),
+)
+@click.option(
+    "--share/--no-share",
+    default=False,
+    help="Whether to create a shareable link",
+)
+def gui(model_path: Path, share: bool) -> None:
+    """Launch the Gradio GUI"""
+    from app.gui import launch_gui
+
+    launch_gui(model_path, share)
+
+
+@cli.command()
+@click.option(
+    "--model",
+    "model_path",
+    required=True,
+    help="Path to the trained model",
+    type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True, path_type=Path),
+)
+@click.argument("text", nargs=-1)
+def predict(model_path: Path, text: list[str]) -> None:
+    """Perform sentiment analysis on the provided text.
+
+    Note: Piped input takes precedence over the text argument
+    """
+    import sys
+
+    import joblib
+
+    text = " ".join(text).strip()
+    if not sys.stdin.isatty():
+        piped_text = sys.stdin.read().strip()
+        text = piped_text or text
+
+    if not text:
+        click.echo(f"{ERROR_STR}: No text provided")
+        return
+
+    click.echo("Loading model... ", nl=False)
+    model = joblib.load(model_path)
+    click.echo(DONE_STR)
+
+    click.echo("Performing sentiment analysis... ", nl=False)
+    prediction = model.predict([text])[0]
+    if prediction == 0:
+        sentiment = NEGATIVE_STR
+    elif prediction == 1:
+        sentiment = POSITIVE_STR
+    else:
+        sentiment = NEUTRAL_STR
+    click.echo(sentiment)
+
+
+@cli.command()
+@click.option(
+    "--dataset",
+    required=True,
+    help="Dataset to train the model on",
+    type=click.Choice(["sentiment140", "amazonreviews", "imdb50k"]),
+)
+@click.option(
+    "--max-features",
+    default=20000,
+    help="Maximum number of features",
+    show_default=True,
+    type=click.IntRange(1, None),
+)
+@click.option(
+    "--seed",
+    default=42,
+    help="Random seed (-1 for random seed)",
+    show_default=True,
+    type=click.IntRange(-1, None),
+)
+def train(
+    dataset: Literal["sentiment140", "amazonreviews", "imdb50k"],
+    max_features: int,
+    seed: int,
+) -> None:
+    """Train the model on the provided dataset"""
+    import joblib
+
+    from app.constants import MODELS_DIR
+    from app.model import create_model, load_data, train_model
+
+    model_path = MODELS_DIR / f"{dataset}_tfidf_ft-{max_features}.pkl"
+    if model_path.exists():
+        click.confirm(f"Model file '{model_path}' already exists. Overwrite?", abort=True)
+
+    click.echo("Preprocessing dataset... ", nl=False)
+    text_data, label_data = load_data(dataset)
+    click.echo(DONE_STR)
+
+    click.echo("Creating model... ", nl=False)
+    model = create_model(max_features, seed=None if seed == -1 else seed)
+    click.echo(DONE_STR)
+
+    click.echo("Training model... ", nl=False)
+    accuracy = train_model(model, text_data, label_data)
+    joblib.dump(model, model_path)
+    click.echo(DONE_STR)
+
+    click.echo("Model accuracy: ")
+    click.secho(f"{accuracy:.2%}", fg="blue")
+
+    # TODO: Add hyperparameter options
+    # TODO: Random/grid search for finding best classifier and hyperparameters
+
+
+def cli_wrapper() -> None:
+    cli(max_content_width=120)
+
+
+if __name__ == "__main__":
+    cli_wrapper()
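For reference, a minimal sketch of driving the new CLI from Python via click's test runner; the dataset download and the resulting model path are assumptions, not part of this commit:

from click.testing import CliRunner

from app.cli import cli

runner = CliRunner()

# Assumes data/imdb50k.csv has already been downloaded (see the README "Datasets" section)
result = runner.invoke(cli, ["train", "--dataset", "imdb50k", "--max-features", "20000"])
print(result.output)

# Hypothetical model path, following the naming scheme used by the train command
result = runner.invoke(cli, ["predict", "--model", "models/imdb50k_tfidf_ft-20000.pkl", "I loved this movie!"])
print(result.output)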
app/constants.py
CHANGED
@@ -1,16 +1,32 @@
-from pathlib import Path
-
-MODELS_DIR: Path = Path("models")
-CACHE_DIR: Path = Path("cache")
-CHECKPOINT_PATH: Path = CACHE_DIR / "pipeline.pkl"
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+CACHE_DIR = Path(os.getenv("CACHE_DIR", ".cache"))
+DATA_DIR = Path(os.getenv("DATA_DIR", "data"))
+MODELS_DIR = Path(os.getenv("MODELS_DIR", "models"))
+
+SENTIMENT140_PATH = DATA_DIR / "sentiment140.csv"
+SENTIMENT140_URL = "https://www.kaggle.com/datasets/kazanova/sentiment140"
+
+AMAZONREVIEWS_PATH = (DATA_DIR / "amazonreviews.test.txt.bz2", DATA_DIR / "amazonreviews.train.txt.bz2")
+AMAZONREVIEWS_URL = "https://www.kaggle.com/datasets/bittlingmayer/amazonreviews"
+
+IMDB50K_PATH = DATA_DIR / "imdb50k.csv"
+IMDB50K_URL = "https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews"
+
+URL_REGEX = r"(https:\/\/www\.|http:\/\/www\.|https:\/\/|http:\/\/)?[a-zA-Z]{2,}(\.[a-zA-Z]{2,})(\.[a-zA-Z]{2,})?\/[a-zA-Z0-9]{2,}|((https:\/\/www\.|http:\/\/www\.|https:\/\/|http:\/\/)?[a-zA-Z]{2,}(\.[a-zA-Z]{2,})(\.[a-zA-Z]{2,})?)|(https:\/\/www\.|http:\/\/www\.|https:\/\/|http:\/\/)?[a-zA-Z0-9]{2,}\.[a-zA-Z0-9]{2,}\.[a-zA-Z0-9]{2,}(\.[a-zA-Z0-9]{2,})?"  # https://www.freecodecamp.org/news/how-to-write-a-regular-expression-for-a-url/
+EMOTICON_MAP = {
+    "SMILE": [":)", ":-)", ": )", ":D", ":-D", ": D", ";)", ";-)", "; )", ":>", ":->", ": >", ":]", ":-]", ": ]"],
+    "LOVE": ["<3", ":*", ":-*", ": *"],
+    "WINK": [";)", ";-)", "; )", ";>", ";->", "; >"],
+    "FROWN": [":(", ":-(", ": (", ":[", ":-[", ": ["],
+    "CRY": [":'(", ": (", ":' (", ":'[", ":' ["],
+    "SURPRISE": [":O", ":-O", ": O", ":0", ":-0", ": 0", ":o", ":-o", ": o"],
+    "ANGRY": [">:(", ">:-(", "> :(", ">:["],
+}
+
+CACHE_DIR.mkdir(exist_ok=True, parents=True)
+DATA_DIR.mkdir(exist_ok=True, parents=True)
+MODELS_DIR.mkdir(exist_ok=True, parents=True)
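A small sketch of how the environment-variable overrides in app/constants.py behave; the override values below are hypothetical, and they must be set before the module is first imported, since the paths are resolved (and the directories created) at import time:

import os

os.environ["DATA_DIR"] = "/tmp/sentiment-data"      # hypothetical override
os.environ["MODELS_DIR"] = "/tmp/sentiment-models"  # hypothetical override

from app.constants import DATA_DIR, IMDB50K_PATH, MODELS_DIR

print(DATA_DIR)      # /tmp/sentiment-data
print(MODELS_DIR)    # /tmp/sentiment-models
print(IMDB50K_PATH)  # /tmp/sentiment-data/imdb50k.csv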
app/gui.py
CHANGED
@@ -1,92 +1,58 @@
-from __future__ import annotations
-
-import gradio as gr
-
-TOKENIZER_EXT = ".tokenizer.pkl"
-MODEL_EXT = ".model.pkl"
-POSITIVE_LABEL = "Positive 😊"
-NEGATIVE_LABEL = "Negative 😤"
-REFRESH_SYMBOL = "🔄"
-
-
-def load_style() -> str:
-    if not CSS_PATH.is_file():
-        return ""
-
-    with Path.open(CSS_PATH) as f:
-        return f.read()
-
-
-def train_wrapper() -> None:
-    msg = "Training is not supported in the GUI."
-    raise NotImplementedError(msg)
-
-
-def evaluate_wrapper() -> None:
-    msg = "Evaluation is not supported in the GUI."
-    raise NotImplementedError(msg)
-
-
-with gr.Blocks(css=load_style()) as demo:
-    gr.Markdown("## Sentiment Analysis")
-
-    with gr.Row(elem_classes="justify-between"):
-        clear_btn = gr.ClearButton([textbox, output], value="Clear 🧹")
-        analyze_btn = gr.Button(
-            "Analyze 🔍",
-            variant="primary",
-            interactive=False,
-        )
-
-    model_selector = gr.Dropdown(
-        choices=[mdl.stem[: -len(".model")] for mdl in MODELS_DIR.glob(f"*{MODEL_EXT}")],
-        label="Model",
-        key="model-selector",
-    )
-
-    # Event handlers
-    textbox.input(
-        fn=lambda text: gr.update(interactive=bool(text.strip())),
-        inputs=[textbox],
-        outputs=[analyze_btn],
-    )
-    analyze_btn.click(
-        fn=predict_wrapper,
-        inputs=[textbox, tokenizer_selector, model_selector],
-        outputs=[output],
-    )
-
-demo.launch()
+from __future__ import annotations
+
+import os
+from functools import lru_cache
+from typing import TYPE_CHECKING
+
+import gradio as gr
+import joblib
+
+if TYPE_CHECKING:
+    from sklearn.pipeline import Pipeline
+
+__all__ = ["launch_gui"]
+
+
+POSITIVE_LABEL = "Positive 😊"
+NEUTRAL_LABEL = "Neutral 😐"
+NEGATIVE_LABEL = "Negative 😤"
+
+
+@lru_cache(maxsize=1)
+def load_model() -> Pipeline:
+    """Load the trained model and cache it."""
+    model_path = os.environ.get("MODEL_PATH", None)
+    if model_path is None:
+        msg = "MODEL_PATH environment variable not set"
+        raise ValueError(msg)
+    return joblib.load(model_path)
+
+
+def sentiment_analysis(text: str) -> str:
+    """Perform sentiment analysis on the provided text."""
+    model = load_model()
+    prediction = model.predict([text])[0]
+
+    if prediction == 0:
+        return NEGATIVE_LABEL
+    if prediction == 1:
+        return POSITIVE_LABEL
+    return NEUTRAL_LABEL
+
+
+demo = gr.Interface(
+    fn=sentiment_analysis,
+    inputs="text",
+    outputs="label",
+    title="Sentiment Analysis",
+)
+
+
+def launch_gui(model_path: str, share: bool) -> None:
+    """Launch the Gradio GUI."""
+    os.environ["MODEL_PATH"] = model_path
+    demo.launch(share=share)
+
+
+if __name__ == "__main__":
+    demo.launch()
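A minimal sketch of launching the refactored GUI directly from Python; the model path is hypothetical and must point to a pipeline produced by the train command:

from app.gui import launch_gui

# launch_gui stores the path in MODEL_PATH so the cached load_model() can pick it up,
# then blocks while Gradio serves the interface.
launch_gui("models/imdb50k_tfidf_ft-20000.pkl", share=False)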
app/model.py
CHANGED
@@ -1,144 +1,304 @@
-from typing import TYPE_CHECKING, Sequence
-
-def export_to_file(pipeline: Pipeline, path: Path) -> None:
-    joblib.dump(pipeline, path)
-
-def train_tokenizer(x: list[str], y: list[int], cache: joblib.Memory) -> Pipeline:
-    # TODO: In the future, allow for different tokenizers
-    pipeline = Pipeline(
-        [
-            (
-                "vectorize",
-                CountVectorizer(stop_words="english", ngram_range=(1, 2), max_features=MAX_TOKENIZER_FEATURES),
-            ),
-            ("tfidf", TfidfTransformer()),
-        ],
-        memory=cache,
-    )
-
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore")
-    return pipeline
-
-def train_model(x: list[str], y: list[int], cache: joblib.Memory, rs: RandomState) -> Pipeline:
-    # TODO: In the future, allow for different classifiers
-    pipeline = Pipeline(
-        [
-            ("clf", LogisticRegression(max_iter=CLF_MAX_ITER, random_state=rs)),
-        ],
-        memory=cache,
-    )
-
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore")  # Ignore joblib warnings
-        pipeline.fit(x, y)
+from __future__ import annotations
+
+import bz2
+import re
+import warnings
+from typing import Literal
+
+import pandas as pd
+from joblib import Memory
+from nltk.stem import WordNetLemmatizer
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+from sklearn.pipeline import Pipeline
+
+from app.constants import (
+    AMAZONREVIEWS_PATH,
+    AMAZONREVIEWS_URL,
+    CACHE_DIR,
+    EMOTICON_MAP,
+    IMDB50K_PATH,
+    IMDB50K_URL,
+    SENTIMENT140_PATH,
+    SENTIMENT140_URL,
+    URL_REGEX,
+)
+
+__all__ = ["load_data", "create_model", "train_model"]
+
+
+class TextCleaner(BaseEstimator, TransformerMixin):
+    def __init__(
+        self,
+        *,
+        replace_url: bool = True,
+        replace_hashtag: bool = True,
+        replace_emoticon: bool = True,
+        replace_emoji: bool = True,
+        lowercase: bool = True,
+        character_threshold: int = 2,
+        remove_special_characters: bool = True,
+        remove_extra_spaces: bool = True,
+    ):
+        self.replace_url = replace_url
+        self.replace_hashtag = replace_hashtag
+        self.replace_emoticon = replace_emoticon
+        self.replace_emoji = replace_emoji
+        self.lowercase = lowercase
+        self.character_threshold = character_threshold
+        self.remove_special_characters = remove_special_characters
+        self.remove_extra_spaces = remove_extra_spaces
+
+    def fit(self, _data: list[str], _labels: list[int] | None = None) -> TextCleaner:
+        return self
+
+    def transform(self, data: list[str], _labels: list[int] | None = None) -> list[str]:
+        # Replace URLs, hashtags, emoticons, and emojis
+        data = [re.sub(URL_REGEX, "URL", text) for text in data] if self.replace_url else data
+        data = [re.sub(r"#\w+", "HASHTAG", text) for text in data] if self.replace_hashtag else data
+
+        # Replace emoticons
+        if self.replace_emoticon:
+            for word, emoticons in EMOTICON_MAP.items():
+                for emoticon in emoticons:
+                    data = [text.replace(emoticon, f"EMOTE_{word}") for text in data]
+
+        # Basic text cleaning
+        data = [text.lower() for text in data] if self.lowercase else data  # Lowercase
+        threshold_pattern = re.compile(rf"\b\w{{1,{self.character_threshold}}}\b")
+        data = (
+            [re.sub(threshold_pattern, "", text) for text in data] if self.character_threshold > 0 else data
+        )  # Remove short words
+        data = (
+            [re.sub(r"[^a-zA-Z0-9\s]", "", text) for text in data] if self.remove_special_characters else data
+        )  # Remove special characters
+        data = [re.sub(r"\s+", " ", text) for text in data] if self.remove_extra_spaces else data  # Remove extra spaces
+
+        # Remove leading and trailing whitespace
+        return [text.strip() for text in data]
+
+
+class TextLemmatizer(BaseEstimator, TransformerMixin):
+    def __init__(self):
+        self.lemmatizer = WordNetLemmatizer()
+
+    def fit(self, _data: list[str], _labels: list[int] | None = None) -> TextLemmatizer:
+        return self
+
+    def transform(self, data: list[str], _labels: list[int] | None = None) -> list[str]:
+        return [self.lemmatizer.lemmatize(text) for text in data]
+
+
+def load_sentiment140(include_neutral: bool = False) -> tuple[list[str], list[int]]:
+    """Load the sentiment140 dataset and make it suitable for use.
+
+    Args:
+        include_neutral: Whether to include neutral sentiment
+
+    Returns:
+        Text and label data
+
+    Raises:
+        FileNotFoundError: If the dataset is not found
+    """
+    # Check if the dataset exists
+    if not SENTIMENT140_PATH.exists():
+        msg = (
+            f"Sentiment140 dataset not found at: '{SENTIMENT140_PATH}'\n"
+            "Please download the dataset from:\n"
+            f"{SENTIMENT140_URL}"
+        )
+        raise FileNotFoundError(msg)
+
+    # Load the dataset
+    data = pd.read_csv(
+        SENTIMENT140_PATH,
+        encoding="ISO-8859-1",
+        names=[
+            "target",  # 0 = negative, 2 = neutral, 4 = positive
+            "id",  # The id of the tweet
+            "date",  # The date of the tweet
+            "flag",  # The query, NO_QUERY if not present
+            "user",  # The user that tweeted
+            "text",  # The text of the tweet
+        ],
+    )
+
+    # Ignore rows with neutral sentiment
+    if not include_neutral:
+        data = data[data["target"] != 2]
+
+    # Map sentiment values
+    data["sentiment"] = data["target"].map(
+        {
+            0: 0,  # Negative
+            4: 1,  # Positive
+            2: 2,  # Neutral
+        },
+    )
+
+    # Return as lists
+    return data["text"].tolist(), data["sentiment"].tolist()
+
+
+def load_amazonreviews(merge: bool = True) -> tuple[list[str], list[int]]:
+    """Load the amazonreviews dataset and make it suitable for use.
+
+    Args:
+        merge: Whether to merge the test and train datasets (otherwise ignore test)
+
+    Returns:
+        Text and label data
+
+    Raises:
+        FileNotFoundError: If the dataset is not found
+    """
+    # Check if the dataset exists
+    test_exists = AMAZONREVIEWS_PATH[0].exists() or not merge
+    train_exists = AMAZONREVIEWS_PATH[1].exists()
+    if not (test_exists and train_exists):
+        msg = (
+            f"Amazonreviews dataset not found at: '{AMAZONREVIEWS_PATH[0]}' and '{AMAZONREVIEWS_PATH[1]}'\n"
+            "Please download the dataset from:\n"
+            f"{AMAZONREVIEWS_URL}"
+        )
+        raise FileNotFoundError(msg)
+
+    # Load the datasets
+    with bz2.BZ2File(AMAZONREVIEWS_PATH[1]) as train_file:
+        train_data = [line.decode("utf-8") for line in train_file]
+
+    test_data = []
+    if merge:
+        with bz2.BZ2File(AMAZONREVIEWS_PATH[0]) as test_file:
+            test_data = [line.decode("utf-8") for line in test_file]
+
+    # Merge the datasets
+    data = train_data + test_data
+
+    # Split the data into labels and text
+    labels, texts = zip(*(line.split(" ", 1) for line in data))
+
+    # Map sentiment values
+    sentiments = [int(label.split("__label__")[1]) - 1 for label in labels]
+
+    # Return as lists
+    return texts, sentiments
+
+
+def load_imdb50k() -> tuple[list[str], list[int]]:
+    """Load the imdb50k dataset and make it suitable for use.
+
+    Returns:
+        Text and label data
+
+    Raises:
+        FileNotFoundError: If the dataset is not found
+    """
+    # Check if the dataset exists
+    if not IMDB50K_PATH.exists():
+        msg = (
+            f"IMDB50K dataset not found at: '{IMDB50K_PATH}'\n"
+            "Please download the dataset from:\n"
+            f"{IMDB50K_URL}"
+        )  # fmt: off
+        raise FileNotFoundError(msg)
+
+    # Load the dataset
+    data = pd.read_csv(IMDB50K_PATH)
+
+    # Map sentiment values
+    data["sentiment"] = data["sentiment"].map(
+        {
+            "positive": 1,
+            "negative": 0,
+        },
+    )
+
+    # Return as lists
+    return data["review"].tolist(), data["sentiment"].tolist()
+
+
+def load_data(dataset: Literal["sentiment140", "amazonreviews", "imdb50k"]) -> tuple[list[str], list[int]]:
+    """Load and preprocess the specified dataset.
+
+    Args:
+        dataset: Dataset to load
+
+    Returns:
+        Text and label data
+
+    Raises:
+        ValueError: If the dataset is not recognized
+    """
+    match dataset:
+        case "sentiment140":
+            return load_sentiment140(include_neutral=False)
+        case "amazonreviews":
+            return load_amazonreviews(merge=True)
+        case "imdb50k":
+            return load_imdb50k()
+        case _:
+            msg = f"Unknown dataset: {dataset}"
+            raise ValueError(msg)
+
+
+def create_model(
+    max_features: int,
+    seed: int | None = None,
+) -> Pipeline:
+    """Create a sentiment analysis model.
+
+    Args:
+        max_features: Maximum number of features
+        seed: Random seed (None for random seed)
+
+    Returns:
+        Untrained model
+    """
+    return Pipeline(
+        [
+            # Text preprocessing
+            ("clean", TextCleaner()),
+            ("lemma", TextLemmatizer()),
+            # Preprocess (NOTE: Can be replaced with TfidfVectorizer, but left for clarity)
+            ("vectorize", CountVectorizer(stop_words="english", ngram_range=(1, 2), max_features=max_features)),
+            ("tfidf", TfidfTransformer()),
+            # Classifier
+            ("clf", LogisticRegression(max_iter=1000, random_state=seed)),
+        ],
+        memory=Memory(CACHE_DIR, verbose=0),
+    )
+
+
+def train_model(
+    model: Pipeline,
+    text_data: list[str],
+    label_data: list[int],
+    seed: int = 42,
+) -> float:
+    """Train the sentiment analysis model.
+
+    Args:
+        model: Untrained model
+        text_data: Text data
+        label_data: Label data
+        seed: Random seed (None for random seed)
+
+    Returns:
+        Accuracy score
+    """
+    text_train, text_test, label_train, label_test = train_test_split(
+        text_data,
+        label_data,
+        test_size=0.2,
+        random_state=seed,
+    )
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        model.fit(text_train, label_train)
+
+    return model.score(text_test, label_test)
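A short end-to-end sketch of the new app.model API on a tiny in-memory dataset; the toy sentences are made up, and the NLTK WordNet data used by TextLemmatizer is assumed to be downloaded first:

import nltk

from app.model import create_model, train_model

nltk.download("wordnet")  # required by TextLemmatizer (WordNetLemmatizer)

texts = [
    "I loved this movie, absolutely fantastic",
    "What a wonderful, heartwarming film",
    "Great acting and a brilliant story",
    "One of the best things I have ever watched",
    "Charming, funny and beautifully shot",
    "I hated this movie, a complete waste of time",
    "Terrible plot and awful acting",
    "Boring, predictable and far too long",
    "One of the worst films I have ever seen",
    "Dreadful pacing and a miserable script",
]
labels = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]

model = create_model(max_features=100, seed=42)
accuracy = train_model(model, texts, labels)  # hold-out accuracy on a 20% split
print(f"accuracy: {accuracy:.2%}")
print(model.predict(["What a great film"]))  # e.g. [1]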
app/utils.py
DELETED
@@ -1,164 +0,0 @@
-"""Utility functions"""
-
-from __future__ import annotations
-
-import itertools
-import re
-import warnings
-from collections import deque
-from enum import Enum
-from functools import lru_cache
-from threading import Event, Lock
-from typing import Any
-
-from joblib import Memory
-from numpy.random import RandomState
-
-from constants import CACHE_DIR, DEFAULT_SEED
-
-__all__ = ["colorize", "wrap_queued_call", "get_random_state", "get_cache_memory"]
-
-
-ANSI_RESET = 0
-
-
-class Color(Enum):
-    """ANSI color codes."""
-
-    BLACK = 30
-    RED = 31
-    GREEN = 32
-    YELLOW = 33
-    BLUE = 34
-    MAGENTA = 35
-    CYAN = 36
-    WHITE = 37
-
-
-class Style(Enum):
-    """ANSI style codes."""
-
-    BOLD = 1
-    DIM = 2
-    ITALIC = 3
-    UNDERLINE = 4
-    BLINK = 5
-    INVERTED = 7
-    HIDDEN = 8
-
-
-# https://gist.github.com/vitaliyp/6d54dd76ca2c3cdfc1149d33007dc34a
-class FIFOLock:
-    def __init__(self):
-        self._lock = Lock()
-        self._inner_lock = Lock()
-        self._pending_threads = deque()
-
-    def acquire(self, blocking: bool = True) -> bool:
-        with self._inner_lock:
-            lock_acquired = self._lock.acquire(False)
-            if lock_acquired:
-                return True
-            if not blocking:
-                return False
-
-            release_event = Event()
-            self._pending_threads.append(release_event)
-
-        release_event.wait()
-        return self._lock.acquire()
-
-    def release(self) -> None:
-        with self._inner_lock:
-            if self._pending_threads:
-                release_event = self._pending_threads.popleft()
-                release_event.set()
-
-            self._lock.release()
-
-    __enter__ = acquire
-
-    def __exit__(self, _t, _v, _tb):  # noqa: ANN001
-        self.release()
-
-
-@lru_cache(maxsize=1)
-def get_queue_lock() -> FIFOLock:
-    return FIFOLock()
-
-
-@lru_cache(maxsize=1)
-def get_random_state(seed: int = DEFAULT_SEED) -> RandomState:
-    return RandomState(seed)
-
-
-@lru_cache(maxsize=1)
-def get_cache_memory() -> Memory:
-    return Memory(CACHE_DIR, verbose=0)
-
-
-def to_ansi(code: int) -> str:
-    """Convert an integer to an ANSI escape code."""
-    return f"\033[{code}m"
-
-
-@lru_cache(maxsize=None)
-def get_ansi_color(color: Color, bright: bool = False, background: bool = False) -> str:
-    """Get ANSI color code for the specified color, brightness and background."""
-    code = color.value
-    if bright:
-        code += 60
-    if background:
-        code += 10
-    return to_ansi(code)
-
-
-def replace_color_tag(color: Color, text: str) -> None:
-    """Replace both dark and light color tags for background and foreground."""
-    for bright, bg in itertools.product([False, True], repeat=2):
-        tag = f"{'BG_' if bg else ''}{'BRIGHT_' if bright else ''}{color.name}"
-        text = text.replace(f"[{tag}]", get_ansi_color(color, bright=bright, background=bg))
-        text = text.replace(f"[/{tag}]", to_ansi(ANSI_RESET))
-
-    return text
-
-
-@lru_cache(maxsize=256)
-def colorize(text: str, strip: bool = True) -> str:
-    """Format text with ANSI color codes using tags [COLOR], [BG_COLOR] and [STYLE].
-    Reset color/style with [/TAG].
-    Escape with double brackets [[]]. Strip leading and trailing whitespace if strip=True.
-    """
-
-    # replace foreground and background color tags
-    for color in Color:
-        text = replace_color_tag(color, text)
-
-    # replace style tags
-    for style in Style:
-        text = text.replace(f"[{style.name}]", to_ansi(style.value)).replace(f"[/{style.name}]", to_ansi(ANSI_RESET))
-
-    # if there are any tags left, remove them and throw a warning
-    pat1 = re.compile(r"((?<!\[)\[)([^\[\]]*)(\](?!\]))")
-    for match in pat1.finditer(text):
-        color = match.group(1)
-        text = text.replace(match.group(0), "")
-        warnings.warn(f"Invalid color tag: {color!r}", UserWarning, stacklevel=2)
-
-    # escape double brackets
-    pat2 = re.compile(r"\[\[[^\[\]\v]+\]\]")
-    text = pat2.sub("", text)
-
-    # reset color/style at the end
-    text += to_ansi(ANSI_RESET)
-
-    return text.strip() if strip else text
-
-
-# https://github.com/AUTOMATIC1111/stable-diffusion-webui/modules/call_queue.py
-def wrap_queued_call(func: callable) -> callable:
-    def f(*args, **kwargs) -> Any:  # noqa: ANN003, ANN002
-        with get_queue_lock():
-            return func(*args, **kwargs)
-
-    return f
deprecated/__init__.py
DELETED
File without changes
deprecated/main.py
DELETED
@@ -1,44 +0,0 @@
-from __future__ import annotations
-
-from pathlib import Path
-
-import click
-import joblib
-
-from app.utils import colorize
-
-
-@click.group()
-def cli() -> None: ...
-
-
-@cli.command("predict")
-@click.option(
-    "-m",
-    "--model",
-    "model_path",
-    default="models/model.pkl",
-    help="Path to the model file.",
-    type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True, path_type=Path),
-)
-@click.argument("text", nargs=-1)
-def predict(model_path: Path, text: list[str]) -> None:
-    input_text = " ".join(text).strip()
-    if not input_text:
-        click.echo("[RED]Error[/RED]: Input text is empty.")
-        return
-
-    # Load the model
-    click.echo("Loading model... ", nl=False)
-    model = joblib.load(model_path)
-    click.echo(colorize("[GREEN]DONE"))
-
-    # Run the model
-    click.echo("Performing sentiment analysis... ", nl=False)
-    prediction = model.predict([input_text])
-    sentiment = "[GREEN]POSITIVE" if prediction[0] == 1 else "[RED]NEGATIVE"
-    click.echo(colorize(sentiment))
-
-
-if __name__ == "__main__":
-    cli()
deprecated/train.py
DELETED
@@ -1,152 +0,0 @@
-from __future__ import annotations
-
-import warnings
-from pathlib import Path
-from typing import TYPE_CHECKING
-
-import click
-import joblib
-import pandas as pd
-from numpy.random import RandomState
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.linear_model import LogisticRegression
-from sklearn.metrics import accuracy_score, classification_report
-from sklearn.model_selection import train_test_split
-from sklearn.pipeline import Pipeline
-
-if TYPE_CHECKING:
-    from sklearn.base import BaseEstimator
-
-SEED = 42
-DATASET_PATH = Path("data/training.1600000.processed.noemoticon.csv")
-STOPWORDS_PATH = Path("data/stopwords-en.txt")
-CHECKPOINT_PATH = Path("cache/pipeline.pkl")
-MODELS_DIR = Path("models")
-CACHE_DIR = Path("cache")
-MAX_FEATURES = 10000  # 500000
-
-# Make sure paths exist
-MODELS_DIR.mkdir(parents=True, exist_ok=True)
-CACHE_DIR.mkdir(parents=True, exist_ok=True)
-
-# Memory cache for sklearn pipelines
-mem = joblib.Memory(CACHE_DIR, verbose=0)
-
-# TODO: use xgboost
-
-
-def get_random_state(seed: int = SEED) -> RandomState:
-    return RandomState(seed)
-
-
-def load_data() -> tuple[list[str], list[int]]:
-    """The model takes in a list of strings and a list of integers where 1 is positive sentiment and 0 is negative sentiment."""
-    data = pd.read_csv(
-        DATASET_PATH,
-        encoding="ISO-8859-1",
-        names=[
-            "target",  # 0 = negative, 2 = neutral, 4 = positive
-            "id",  # The id of the tweet
-            "date",  # The date of the tweet
-            "flag",  # The query, NO_QUERY if not present
-            "user",  # The user that tweeted
-            "text",  # The text of the tweet
-        ],
-    )
-
-    # Ignore rows with neutral sentiment
-    data = data[data["target"] != 2]
-
-    # Create new column called "sentiment" with 1 for positive and 0 for negative
-    data["sentiment"] = data["target"] == 4
-
-    # Drop the columns we don't need
-    # data = data.drop(columns=["target", "id", "date", "flag", "user"])  # NOTE: No need, since we return the columns we need
-
-    # Return as lists
-    return list(data["text"]), list(data["sentiment"])
-
-
-def create_pipeline(clf: BaseEstimator) -> Pipeline:
-    return Pipeline(
-        [
-            # Preprocess
-            # ("vectorize", CountVectorizer(stop_words="english", ngram_range=(1, 2), max_features=MAX_FEATURES)),
-            # ("tfidf", TfidfTransformer()),
-            ("vectorize", TfidfVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)),
-            # Classifier
-            ("clf", clf),
-        ],
-        memory=mem,
-    )
-
-
-def evaluate_pipeline(pipeline: Pipeline, x: list[str], y: list[int]) -> float:
-    y_pred = pipeline.predict(x)
-    report = classification_report(y, y_pred)
-    click.echo(report)
-
-    # TODO: Confusion matrix
-
-    return accuracy_score(y, y_pred)
-
-
-def export_pipeline(pipeline: Pipeline, name: str) -> None:
-    model_path = MODELS_DIR / f"{name}.pkl"
-    joblib.dump(pipeline, model_path)
-    click.echo(f"Model exported to {model_path!r}")
-
-
-@click.command()
-@click.option("--retrain", is_flag=True, help="Train the model even if a checkpoint exists.")
-@click.option("--evaluate", is_flag=True, help="Evaluate the model.")
-@click.option("--flush-cache", is_flag=True, help="Clear sklearn cache.")
-@click.option("--seed", type=int, default=SEED, help="Random seed.")
-def train(retrain: bool, evaluate: bool, flush_cache: bool, seed: int) -> None:
-    rng = get_random_state(seed)
-
-    # Clear sklearn cache
-    if flush_cache:
-        click.echo("Clearing cache... ", nl=False)
-        mem.clear(warn=False)
-        click.echo("DONE")
-
-    # Load and split data
-    click.echo("Loading data... ", nl=False)
-    x, y = load_data()
-    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=rng)
-    click.echo("DONE")
-
-    # Train model
-    if retrain or not CHECKPOINT_PATH.exists():
-        click.echo("Training model... ", nl=False)
-        clf = LogisticRegression(max_iter=1000, random_state=rng)
-        model = create_pipeline(clf)
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")  # Ignore joblib warnings
-            model.fit(x_train, y_train)
-        joblib.dump(model, CHECKPOINT_PATH)
-        click.echo("DONE")
-    else:
-        click.echo("Loading model... ", nl=False)
-        model = joblib.load(CHECKPOINT_PATH)
-        click.echo("DONE")
-
-    # Evaluate model
-    if evaluate:
-        evaluate_pipeline(model, x_test, y_test)
-
-    # Quick test
-    test_text = ["I love this movie", "I hate this movie"]
-    click.echo("Quick test:")
-    for text in test_text:
-        click.echo(f"\t{'positive' if model.predict([text])[0] else 'negative'}: {text}")
-
-    # Export model
-    click.echo("Exporting model... ", nl=False)
-    export_pipeline(model, "logistic_regression")
-    click.echo("DONE")
-
-
-if __name__ == "__main__":
-    train()
justfile
CHANGED
@@ -1,7 +1,7 @@
 #!/usr/bin/env just --justfile
 
 @default:
+    just --list
 
 @lint:
     poetry run pre-commit run --all-files
@@ -16,8 +16,6 @@
 @requirements:
     poetry export -f requirements.txt --output requirements.txt --without dev
 
-@gui:
-    poetry run gradio app/gui.py
+[no-exit-message]
+@app *ARGS:
+    poetry run python -m app {{ARGS}}
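For illustration, the new `@app` recipe simply forwards its arguments to the package entry point, so `just app --help` is equivalent to the (hypothetical) invocation sketched below:

import subprocess

# Same command the recipe expands to; assumes the poetry environment is installed.
subprocess.run(["poetry", "run", "python", "-m", "app", "--help"], check=True)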
notebook.ipynb
ADDED
@@ -0,0 +1,152 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Sentiment Analysis"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from __future__ import annotations\n",
+    "\n",
+    "import re\n",
+    "from functools import cache\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import pandas as pd\n",
+    "import seaborn as sns"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load the data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data: pd.DataFrame = None  # TODO: load dataset\n",
+    "stopwords: set[str] = None  # TODO: load stopwords"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Explore the data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Plot the distribution\n",
+    "_, ax = plt.subplots(figsize=(6, 4))\n",
+    "data[\"sentiment\"].value_counts().plot(kind=\"bar\", ax=ax)\n",
+    "ax.set_xticklabels([\"Negative\", \"Positive\"], rotation=0)\n",
+    "ax.set_xlabel(\"Sentiment\")\n",
+    "ax.grid(False)\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@cache\n",
+    "def extract_words(text: str) -> list[str]:\n",
+    "    return re.findall(r\"(\\b[^\\s]+\\b)\", text.lower())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Extract words and count them\n",
+    "words = data[\"text\"].apply(extract_words).explode()\n",
+    "word_counts = words.value_counts().reset_index()\n",
+    "word_counts.columns = [\"word\", \"count\"]\n",
+    "word_counts.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Plot the most common words\n",
+    "_, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))\n",
+    "\n",
+    "sns.barplot(data=word_counts.head(10), x=\"count\", y=\"word\", ax=ax1)\n",
+    "ax1.set_title(\"Most common words\")\n",
+    "ax1.grid(False)\n",
+    "ax1.tick_params(axis=\"x\", rotation=45)\n",
+    "\n",
+    "ax2.set_title(\"Most common words (excluding stopwords)\")\n",
+    "sns.barplot(\n",
+    "    data=word_counts[~word_counts[\"word\"].isin(stopwords)].head(10),\n",
+    "    x=\"count\",\n",
+    "    y=\"word\",\n",
+    "    ax=ax2,\n",
+    ")\n",
+    "ax2.grid(False)\n",
+    "ax2.tick_params(axis=\"x\", rotation=45)\n",
+    "ax2.set_ylabel(\"\")\n",
+    "\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Find best classifier"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Find best hyperparameters"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
poetry.lock
CHANGED
@@ -1479,6 +1479,31 @@ files = [
     {file = "nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe"},
 ]
 
+[[package]]
+name = "nltk"
+version = "3.8.1"
+description = "Natural Language Toolkit"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "nltk-3.8.1-py3-none-any.whl", hash = "sha256:fd5c9109f976fa86bcadba8f91e47f5e9293bd034474752e92a520f81c93dda5"},
+    {file = "nltk-3.8.1.zip", hash = "sha256:1834da3d0682cba4f2cede2f9aad6b0fafb6461ba451db0efb6f9c39798d64d3"},
+]
+
+[package.dependencies]
+click = "*"
+joblib = "*"
+regex = ">=2021.8.3"
+tqdm = "*"
+
+[package.extras]
+all = ["matplotlib", "numpy", "pyparsing", "python-crfsuite", "requests", "scikit-learn", "scipy", "twython"]
+corenlp = ["requests"]
+machine-learning = ["numpy", "python-crfsuite", "scikit-learn", "scipy"]
+plot = ["matplotlib"]
+tgrep = ["pyparsing"]
+twitter = ["twython"]
+
 [[package]]
 name = "nodeenv"
 version = "1.8.0"
@@ -2298,6 +2323,94 @@
 attrs = ">=22.2.0"
 rpds-py = ">=0.7.0"
 
+[[package]]
+name = "regex"
+version = "2024.5.15"
+description = "Alternative regular expression module, to replace re."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "regex-2024.5.15-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a81e3cfbae20378d75185171587cbf756015ccb14840702944f014e0d93ea09f"},
+    {file = "regex-2024.5.15-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7b59138b219ffa8979013be7bc85bb60c6f7b7575df3d56dc1e403a438c7a3f6"},
+    {file = "regex-2024.5.15-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a0bd000c6e266927cb7a1bc39d55be95c4b4f65c5be53e659537537e019232b1"},
+    {file = "regex-2024.5.15-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5eaa7ddaf517aa095fa8da0b5015c44d03da83f5bd49c87961e3c997daed0de7"},
+    {file = "regex-2024.5.15-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ba68168daedb2c0bab7fd7e00ced5ba90aebf91024dea3c88ad5063c2a562cca"},
+    {file = "regex-2024.5.15-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6e8d717bca3a6e2064fc3a08df5cbe366369f4b052dcd21b7416e6d71620dca1"},
+    {file = "regex-2024.5.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1337b7dbef9b2f71121cdbf1e97e40de33ff114801263b275aafd75303bd62b5"},
+    {file = "regex-2024.5.15-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f9ebd0a36102fcad2f03696e8af4ae682793a5d30b46c647eaf280d6cfb32796"},
+    {file = "regex-2024.5.15-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:9efa1a32ad3a3ea112224897cdaeb6aa00381627f567179c0314f7b65d354c62"},
+    {file = "regex-2024.5.15-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:1595f2d10dff3d805e054ebdc41c124753631b6a471b976963c7b28543cf13b0"},
+    {file = "regex-2024.5.15-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:b802512f3e1f480f41ab5f2cfc0e2f761f08a1f41092d6718868082fc0d27143"},
+    {file = "regex-2024.5.15-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:a0981022dccabca811e8171f913de05720590c915b033b7e601f35ce4ea7019f"},
+    {file = "regex-2024.5.15-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:19068a6a79cf99a19ccefa44610491e9ca02c2be3305c7760d3831d38a467a6f"},
+    {file = "regex-2024.5.15-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:1b5269484f6126eee5e687785e83c6b60aad7663dafe842b34691157e5083e53"},
+    {file = "regex-2024.5.15-cp310-cp310-win32.whl", hash = "sha256:ada150c5adfa8fbcbf321c30c751dc67d2f12f15bd183ffe4ec7cde351d945b3"},
+    {file = "regex-2024.5.15-cp310-cp310-win_amd64.whl", hash = "sha256:ac394ff680fc46b97487941f5e6ae49a9f30ea41c6c6804832063f14b2a5a145"},
+    {file = "regex-2024.5.15-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f5b1dff3ad008dccf18e652283f5e5339d70bf8ba7c98bf848ac33db10f7bc7a"},
+    {file = "regex-2024.5.15-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c6a2b494a76983df8e3d3feea9b9ffdd558b247e60b92f877f93a1ff43d26656"},
+    {file = "regex-2024.5.15-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a32b96f15c8ab2e7d27655969a23895eb799de3665fa94349f3b2fbfd547236f"},
+    {file = "regex-2024.5.15-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:10002e86e6068d9e1c91eae8295ef690f02f913c57db120b58fdd35a6bb1af35"},
+    {file = "regex-2024.5.15-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ec54d5afa89c19c6dd8541a133be51ee1017a38b412b1321ccb8d6ddbeb4cf7d"},
+    {file = "regex-2024.5.15-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:10e4ce0dca9ae7a66e6089bb29355d4432caed736acae36fef0fdd7879f0b0cb"},
+    {file = "regex-2024.5.15-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e507ff1e74373c4d3038195fdd2af30d297b4f0950eeda6f515ae3d84a1770f"},
+    {file = "regex-2024.5.15-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d1f059a4d795e646e1c37665b9d06062c62d0e8cc3c511fe01315973a6542e40"},
+    {file = "regex-2024.5.15-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0721931ad5fe0dda45d07f9820b90b2148ccdd8e45bb9e9b42a146cb4f695649"},
+    {file = "regex-2024.5.15-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:833616ddc75ad595dee848ad984d067f2f31be645d603e4d158bba656bbf516c"},
+    {file = "regex-2024.5.15-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:287eb7f54fc81546346207c533ad3c2c51a8d61075127d7f6d79aaf96cdee890"},
+    {file = "regex-2024.5.15-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:19dfb1c504781a136a80ecd1fff9f16dddf5bb43cec6871778c8a907a085bb3d"},
+    {file = "regex-2024.5.15-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:119af6e56dce35e8dfb5222573b50c89e5508d94d55713c75126b753f834de68"},
+    {file = "regex-2024.5.15-cp311-cp311-win32.whl", hash = "sha256:1c1c174d6ec38d6c8a7504087358ce9213d4332f6293a94fbf5249992ba54efa"},
+    {file = "regex-2024.5.15-cp311-cp311-win_amd64.whl", hash = "sha256:9e717956dcfd656f5055cc70996ee2cc82ac5149517fc8e1b60261b907740201"},
+    {file = "regex-2024.5.15-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:632b01153e5248c134007209b5c6348a544ce96c46005d8456de1d552455b014"},
+    {file = "regex-2024.5.15-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e64198f6b856d48192bf921421fdd8ad8eb35e179086e99e99f711957ffedd6e"},
+    {file = "regex-2024.5.15-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:68811ab14087b2f6e0fc0c2bae9ad689ea3584cad6917fc57be6a48bbd012c49"},
+    {file = "regex-2024.5.15-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f8ec0c2fea1e886a19c3bee0cd19d862b3aa75dcdfb42ebe8ed30708df64687a"},
+    {file = "regex-2024.5.15-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d0c0c0003c10f54a591d220997dd27d953cd9ccc1a7294b40a4be5312be8797b"},
+    {file = "regex-2024.5.15-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2431b9e263af1953c55abbd3e2efca67ca80a3de8a0437cb58e2421f8184717a"},
+    {file = "regex-2024.5.15-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a605586358893b483976cffc1723fb0f83e526e8f14c6e6614e75919d9862cf"},
+    {file = "regex-2024.5.15-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:391d7f7f1e409d192dba8bcd42d3e4cf9e598f3979cdaed6ab11288da88cb9f2"},
+    {file = "regex-2024.5.15-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:9ff11639a8d98969c863d4617595eb5425fd12f7c5ef6621a4b74b71ed8726d5"},
+    {file = "regex-2024.5.15-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:4eee78a04e6c67e8391edd4dad3279828dd66ac4b79570ec998e2155d2e59fd5"},
+    {file = "regex-2024.5.15-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:8fe45aa3f4aa57faabbc9cb46a93363edd6197cbc43523daea044e9ff2fea83e"},
+    {file = "regex-2024.5.15-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:d0a3d8d6acf0c78a1fff0e210d224b821081330b8524e3e2bc5a68ef6ab5803d"},
+    {file = "regex-2024.5.15-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c486b4106066d502495b3025a0a7251bf37ea9540433940a23419461ab9f2a80"},
+    {file = "regex-2024.5.15-cp312-cp312-win32.whl", hash = "sha256:c49e15eac7c149f3670b3e27f1f28a2c1ddeccd3a2812cba953e01be2ab9b5fe"},
+    {file = "regex-2024.5.15-cp312-cp312-win_amd64.whl", hash = "sha256:673b5a6da4557b975c6c90198588181029c60793835ce02f497ea817ff647cb2"},
+    {file = "regex-2024.5.15-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:87e2a9c29e672fc65523fb47a90d429b70ef72b901b4e4b1bd42387caf0d6835"},
+    {file = "regex-2024.5.15-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c3bea0ba8b73b71b37ac833a7f3fd53825924165da6a924aec78c13032f20850"},
+    {file = "regex-2024.5.15-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:bfc4f82cabe54f1e7f206fd3d30fda143f84a63fe7d64a81558d6e5f2e5aaba9"},
+    {file = "regex-2024.5.15-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5bb9425fe881d578aeca0b2b4b3d314ec88738706f66f219c194d67179337cb"},
+    {file = "regex-2024.5.15-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:64c65783e96e563103d641760664125e91bd85d8e49566ee560ded4da0d3e704"},
+    {file = "regex-2024.5.15-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cf2430df4148b08fb4324b848672514b1385ae3807651f3567871f130a728cc3"},
+    {file = "regex-2024.5.15-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5397de3219a8b08ae9540c48f602996aa6b0b65d5a61683e233af8605c42b0f2"},
+    {file = "regex-2024.5.15-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:455705d34b4154a80ead722f4f185b04c4237e8e8e33f265cd0798d0e44825fa"},
+    {file = "regex-2024.5.15-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:b2b6f1b3bb6f640c1a92be3bbfbcb18657b125b99ecf141fb3310b5282c7d4ed"},
+    {file = "regex-2024.5.15-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:3ad070b823ca5890cab606c940522d05d3d22395d432f4aaaf9d5b1653e47ced"},
+    {file = "regex-2024.5.15-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:5b5467acbfc153847d5adb21e21e29847bcb5870e65c94c9206d20eb4e99a384"},
+    {file = "regex-2024.5.15-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:e6662686aeb633ad65be2a42b4cb00178b3fbf7b91878f9446075c404ada552f"},
+    {file = "regex-2024.5.15-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:2b4c884767504c0e2401babe8b5b7aea9148680d2e157fa28f01529d1f7fcf67"},
+    {file = "regex-2024.5.15-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:3cd7874d57f13bf70078f1ff02b8b0aa48d5b9ed25fc48547516c6aba36f5741"},
+    {file = "regex-2024.5.15-cp38-cp38-win32.whl", hash = "sha256:e4682f5ba31f475d58884045c1a97a860a007d44938c4c0895f41d64481edbc9"},
+    {file = "regex-2024.5.15-cp38-cp38-win_amd64.whl", hash = "sha256:d99ceffa25ac45d150e30bd9ed14ec6039f2aad0ffa6bb87a5936f5782fc1569"},
+ 
+ [[package]]
+ name = "requests"
+ version = "2.31.0"
+ 
+@@ -3174,4 +3287,4 @@
+ [metadata]
+ lock-version = "2.0"
+ python-versions = "^3.12"
+-content-hash = "
|
2395 |
+
{file = "regex-2024.5.15-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:13cdaf31bed30a1e1c2453ef6015aa0983e1366fad2667657dbcac7b02f67133"},
|
2396 |
+
{file = "regex-2024.5.15-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:cac27dcaa821ca271855a32188aa61d12decb6fe45ffe3e722401fe61e323cd1"},
|
2397 |
+
{file = "regex-2024.5.15-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7dbe2467273b875ea2de38ded4eba86cbcbc9a1a6d0aa11dcf7bd2e67859c435"},
|
2398 |
+
{file = "regex-2024.5.15-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:64f18a9a3513a99c4bef0e3efd4c4a5b11228b48aa80743be822b71e132ae4f5"},
|
2399 |
+
{file = "regex-2024.5.15-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d347a741ea871c2e278fde6c48f85136c96b8659b632fb57a7d1ce1872547600"},
|
2400 |
+
{file = "regex-2024.5.15-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1878b8301ed011704aea4c806a3cadbd76f84dece1ec09cc9e4dc934cfa5d4da"},
|
2401 |
+
{file = "regex-2024.5.15-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4babf07ad476aaf7830d77000874d7611704a7fcf68c9c2ad151f5d94ae4bfc4"},
|
2402 |
+
{file = "regex-2024.5.15-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:35cb514e137cb3488bce23352af3e12fb0dbedd1ee6e60da053c69fb1b29cc6c"},
|
2403 |
+
{file = "regex-2024.5.15-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:cdd09d47c0b2efee9378679f8510ee6955d329424c659ab3c5e3a6edea696294"},
|
2404 |
+
{file = "regex-2024.5.15-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:72d7a99cd6b8f958e85fc6ca5b37c4303294954eac1376535b03c2a43eb72629"},
|
2405 |
+
{file = "regex-2024.5.15-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:a094801d379ab20c2135529948cb84d417a2169b9bdceda2a36f5f10977ebc16"},
|
2406 |
+
{file = "regex-2024.5.15-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:c0c18345010870e58238790a6779a1219b4d97bd2e77e1140e8ee5d14df071aa"},
|
2407 |
+
{file = "regex-2024.5.15-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:16093f563098448ff6b1fa68170e4acbef94e6b6a4e25e10eae8598bb1694b5d"},
|
2408 |
+
{file = "regex-2024.5.15-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:e38a7d4e8f633a33b4c7350fbd8bad3b70bf81439ac67ac38916c4a86b465456"},
|
2409 |
+
{file = "regex-2024.5.15-cp39-cp39-win32.whl", hash = "sha256:71a455a3c584a88f654b64feccc1e25876066c4f5ef26cd6dd711308aa538694"},
|
2410 |
+
{file = "regex-2024.5.15-cp39-cp39-win_amd64.whl", hash = "sha256:cab12877a9bdafde5500206d1020a584355a97884dfd388af3699e9137bf7388"},
|
2411 |
+
{file = "regex-2024.5.15.tar.gz", hash = "sha256:d3ee02d9e5f482cc8309134a91eeaacbdd2261ba111b0fef3748eeb4913e6a2c"},
|
2412 |
+
]
|
2413 |
+
|
2414 |
[[package]]
|
2415 |
name = "requests"
|
2416 |
version = "2.31.0"
|
|
|
3287 |
[metadata]
|
3288 |
lock-version = "2.0"
|
3289 |
python-versions = "^3.12"
|
3290 |
+
content-hash = "988f4561272067771efc60acdb2687f0586be48c1bf401452696c51e8f69b534"
|
pyproject.toml
CHANGED
@@ -1,13 +1,14 @@
 [tool.poetry]
 name = "sentiment-analysis"
 package-mode = false
-packages = [{ include = "app" }]
 
 [tool.poetry.dependencies]
 python = "^3.12"
 click = "^8.1.7"
 scikit-learn = "^1.4.2"
 gradio = "^4.31.0"
+colorama = "^0.4.6"
+nltk = "^3.8.1"
 
 [tool.poetry.group.train.dependencies]
 pandas = "^2.2.2"
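
For context on the two runtime dependencies added above, here is a minimal usage sketch, assuming colorama is used for colored terminal output and nltk for text preprocessing. The actual preprocessing in app/model.py is not shown in this diff, so the corpus names (punkt, stopwords) and the steps below are illustrative assumptions, not the project's code.

# Hypothetical sketch of how the new dependencies could be used; not taken from app/model.py.
import nltk
from colorama import Fore, Style, init as colorama_init
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# NLTK ships its corpora separately from the pip package; fetch them once before first use.
nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)

# colorama makes ANSI color codes work on Windows terminals as well.
colorama_init(autoreset=True)


def preprocess(text: str) -> list[str]:
    """Lowercase, tokenize, and drop non-alphabetic tokens and English stopwords."""
    stop_words = set(stopwords.words("english"))
    tokens = word_tokenize(text.lower())
    return [token for token in tokens if token.isalpha() and token not in stop_words]


print(Fore.GREEN + "tokens:" + Style.RESET_ALL, preprocess("This movie was surprisingly good!"))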