Spaces:

Tymec
/

sentiment-analysis

Running

App Files Files

Tymec committed on Jun 2, 2024

Commit

afaacd1

1 Parent(s): 18cc46a

Chunked serialization

Browse files

Files changed (3) hide show

Makefile +0 -20
app/cli.py +37 -21
app/utils.py +44 -0

Makefile DELETED Viewed

@@ -1,20 +0,0 @@
-#!/usr/bin/make -f
-default: install
-install:
-  @poetry install --only main
-  @poetry run spacy download en_core_web_sm
-install-dev:
-  @poetry self add poetry-plugin-export
-  @poetry install
-requirements:
-  @poetry export -f requirements.txt --output requirements.txt --without dev
-  @poetry export -f requirements.txt --output requirements-dev.txt
-lint:
-  @poetry run pre-commit run --all-files
-.PHONY: install install-dev requirements gradio lint run

app/cli.py CHANGED Viewed

@@ -136,28 +136,34 @@ def evaluate(
     from app.constants import CACHE_DIR
     from app.data import load_data, tokenize
     from app.model import evaluate_model
     cached_data_path = CACHE_DIR / f"{dataset}_tokenized.pkl"
     use_cached_data = False
     if cached_data_path.exists():
         use_cached_data = click.confirm(f"Found existing tokenized data for '{dataset}'. Use it?", default=True)
     if use_cached_data:
         click.echo("Loading cached data... ", nl=False)
-        token_data, label_data = joblib.load(cached_data_path)
         click.echo(DONE_STR)
     else:
-        click.echo("Loading dataset... ", nl=False)
-        text_data, label_data = load_data(dataset)
-        click.echo(DONE_STR)
         click.echo("Tokenizing data... ", nl=False)
         token_data = tokenize(text_data, batch_size=batch_size, n_jobs=processes, show_progress=True)
-        joblib.dump((token_data, label_data), cached_data_path, compress=3)
         click.echo(DONE_STR)
-        del text_data
-        gc.collect()
     click.echo("Loading model... ", nl=False)
     model = joblib.load(model_path)
@@ -221,9 +227,9 @@ def evaluate(
     help="Overwrite the model file if it already exists",
 )
 @click.option(
-    "--skip-cache",
     is_flag=True,
-    help="Ignore cached tokenized data",
 )
 @click.option(
     "--verbose",
@@ -238,7 +244,7 @@ def train(
     processes: int,
     seed: int,
     overwrite: bool,
-    skip_cache: bool,
     verbose: bool,
 ) -> None:
     """Train the model on the provided dataset"""
@@ -249,6 +255,7 @@ def train(
     from app.constants import CACHE_DIR, MODELS_DIR
     from app.data import load_data, tokenize
     from app.model import train_model
     model_path = MODELS_DIR / f"{dataset}_tfidf_ft-{max_features}.pkl"
     if model_path.exists() and not overwrite:
@@ -256,25 +263,34 @@ def train(
     cached_data_path = CACHE_DIR / f"{dataset}_tokenized.pkl"
     use_cached_data = False
-    if cached_data_path.exists() and not skip_cache:
-        use_cached_data = click.confirm(f"Found existing tokenized data for '{dataset}'. Use it?", default=True)
     if use_cached_data:
         click.echo("Loading cached data... ", nl=False)
-        token_data, label_data = joblib.load(cached_data_path)
         click.echo(DONE_STR)
     else:
-        click.echo("Loading dataset... ", nl=False)
-        text_data, label_data = load_data(dataset)
-        click.echo(DONE_STR)
         click.echo("Tokenizing data... ", nl=False)
         token_data = tokenize(text_data, batch_size=batch_size, n_jobs=processes, show_progress=True)
-        joblib.dump((token_data, label_data), cached_data_path, compress=3)
         click.echo(DONE_STR)
-        del text_data
-        gc.collect()
     click.echo("Training model... ")
     model, accuracy = train_model(

     from app.constants import CACHE_DIR
     from app.data import load_data, tokenize
     from app.model import evaluate_model
+    from app.utils import deserialize, serialize
     cached_data_path = CACHE_DIR / f"{dataset}_tokenized.pkl"
     use_cached_data = False
     if cached_data_path.exists():
         use_cached_data = click.confirm(f"Found existing tokenized data for '{dataset}'. Use it?", default=True)
+    click.echo("Loading dataset... ", nl=False)
+    text_data, label_data = load_data(dataset)
+    click.echo(DONE_STR)
     if use_cached_data:
         click.echo("Loading cached data... ", nl=False)
+        # token_data = joblib.load(cached_data_path)
+        token_data = deserialize(cached_data_path)
         click.echo(DONE_STR)
     else:
         click.echo("Tokenizing data... ", nl=False)
         token_data = tokenize(text_data, batch_size=batch_size, n_jobs=processes, show_progress=True)
         click.echo(DONE_STR)
+        click.echo("Caching tokenized data... ", nl=False)
+        # joblib.dump(token_data, cached_data_path, compress=3)
+        serialize(token_data, cached_data_path)
+        click.echo(DONE_STR)
+    del text_data
+    gc.collect()
     click.echo("Loading model... ", nl=False)
     model = joblib.load(model_path)
     help="Overwrite the model file if it already exists",
 )
 @click.option(
+    "--force-cache",
     is_flag=True,
+    help="Always use the cached tokenized data (if available)",
 )
 @click.option(
     "--verbose",
     processes: int,
     seed: int,
     overwrite: bool,
+    force_cache: bool,
     verbose: bool,
 ) -> None:
     """Train the model on the provided dataset"""
     from app.constants import CACHE_DIR, MODELS_DIR
     from app.data import load_data, tokenize
     from app.model import train_model
+    from app.utils import deserialize, serialize
     model_path = MODELS_DIR / f"{dataset}_tfidf_ft-{max_features}.pkl"
     if model_path.exists() and not overwrite:
     cached_data_path = CACHE_DIR / f"{dataset}_tokenized.pkl"
     use_cached_data = False
+    if cached_data_path.exists():
+        use_cached_data = force_cache or click.confirm(
+            f"Found existing tokenized data for '{dataset}'. Use it?",
+            default=True,
+        )
+    click.echo("Loading dataset... ", nl=False)
+    text_data, label_data = load_data(dataset)
+    click.echo(DONE_STR)
     if use_cached_data:
         click.echo("Loading cached data... ", nl=False)
+        # token_data = joblib.load(cached_data_path)
+        token_data = deserialize(cached_data_path)
         click.echo(DONE_STR)
     else:
         click.echo("Tokenizing data... ", nl=False)
         token_data = tokenize(text_data, batch_size=batch_size, n_jobs=processes, show_progress=True)
         click.echo(DONE_STR)
+        click.echo("Caching tokenized data... ", nl=False)
+        # joblib.dump(token_data, cached_data_path, compress=3)
+        serialize(token_data, cached_data_path)
+        click.echo(DONE_STR)
+    del text_data
+    gc.collect()
     click.echo("Training model... ")
     model, accuracy = train_model(

app/utils.py ADDED Viewed

	@@ -0,0 +1,44 @@

+from __future__ import annotations
+from typing import TYPE_CHECKING
+import joblib
+from tqdm import tqdm
+if TYPE_CHECKING:
+    from pathlib import Path
+__all__ = ["serialize", "deserialize"]
+def serialize(data: list[list[str]], path: Path, max_size: int = 400) -> None:
+    """Serialize data to a file
+    Args:
+        data: The data to serialize
+        path: The path to save the serialized data
+        max_size: The maximum size a chunk can be (in elements)
+    """
+    # first file is path, next chunks have ".1", ".2", etc. appended
+    for i, chunk in enumerate(tqdm([data[i : i + max_size] for i in range(0, len(data), max_size)])):
+        fd = path.with_suffix(f".{i}.pkl" if i else ".pkl")
+        with fd.open("wb") as f:
+            joblib.dump(chunk, f, compress=3)
+def deserialize(path: Path) -> list[list[str]]:
+    """Deserialize data from a file
+    Args:
+        path: The path to the serialized data
+    Returns:
+        The deserialized data
+    """
+    data = []
+    i = 0
+    while (fd := path.with_suffix(f".{i}.pkl" if i else ".pkl")).exists():
+        with fd.open("rb") as f:
+            data.extend(joblib.load(f))
+        i += 1
+    return data