Cache label data along with tokenized text data
- app/cli.py  +26 -20
- app/utils.py  +2 -3
app/cli.py  (CHANGED)
@@ -146,32 +146,35 @@ def evaluate(
     from app.model import evaluate_model
     from app.utils import deserialize, serialize

-    …
+    token_cache_path = TOKENIZER_CACHE_PATH / f"{dataset}_tokenized.pkl"
+    label_cache_path = TOKENIZER_CACHE_PATH / f"{dataset}_labels.pkl"
     use_cached_data = False

-    if …
+    if token_cache_path.exists():
         use_cached_data = force_cache or click.confirm(
             f"Found existing tokenized data for '{dataset}'. Use it?",
             default=True,
         )

-    click.echo("Loading dataset... ", nl=False)
-    text_data, label_data = load_data(dataset)
-    click.echo(DONE_STR)
-
     if use_cached_data:
         click.echo("Loading cached data... ", nl=False)
-        token_data = pd.Series(deserialize(…
+        token_data = pd.Series(deserialize(token_cache_path))
+        label_data = joblib.load(label_cache_path)
         click.echo(DONE_STR)
     else:
+        click.echo("Loading dataset... ", nl=False)
+        text_data, label_data = load_data(dataset)
+        click.echo(DONE_STR)
+
         click.echo("Tokenizing data... ")
         token_data = tokenize(text_data, batch_size=token_batch_size, n_jobs=token_jobs, show_progress=True)

         click.echo("Caching tokenized data... ")
-        serialize(token_data, …
+        serialize(token_data, token_cache_path, show_progress=True)
+        joblib.dump(label_data, label_cache_path, compress=3)

-        …
-        …
+        del text_data
+        gc.collect()

     click.echo("Size of vocabulary: ", nl=False)
     vocab = token_data.explode().value_counts()

@@ -281,32 +284,35 @@ def train(
     if model_path.exists() and not overwrite:
         click.confirm(f"Model file '{model_path}' already exists. Overwrite?", abort=True)

-    …
+    token_cache_path = TOKENIZER_CACHE_PATH / f"{dataset}_tokenized.pkl"
+    label_cache_path = TOKENIZER_CACHE_PATH / f"{dataset}_labels.pkl"
     use_cached_data = False

-    if …
+    if token_cache_path.exists():
         use_cached_data = force_cache or click.confirm(
             f"Found existing tokenized data for '{dataset}'. Use it?",
             default=True,
         )

-    click.echo("Loading dataset... ", nl=False)
-    text_data, label_data = load_data(dataset)
-    click.echo(DONE_STR)
-
     if use_cached_data:
         click.echo("Loading cached data... ", nl=False)
-        token_data = pd.Series(deserialize(…
+        token_data = pd.Series(deserialize(token_cache_path))
+        label_data = joblib.load(label_cache_path)
         click.echo(DONE_STR)
     else:
+        click.echo("Loading dataset... ", nl=False)
+        text_data, label_data = load_data(dataset)
+        click.echo(DONE_STR)
+
         click.echo("Tokenizing data... ")
         token_data = tokenize(text_data, batch_size=token_batch_size, n_jobs=token_jobs, show_progress=True)

         click.echo("Caching tokenized data... ")
-        serialize(token_data, …
+        serialize(token_data, token_cache_path, show_progress=True)
+        joblib.dump(label_data, label_cache_path, compress=3)

-        …
-        …
+        del text_data
+        gc.collect()

     click.echo("Size of vocabulary: ", nl=False)
     vocab = token_data.explode().value_counts()
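For reference, a condensed, self-contained sketch of the flow both commands now share: check for per-dataset token and label caches, reuse them when present, otherwise load, tokenize, and write both caches. This is an illustration only, not the repo's code: load_data and tokenize here are hypothetical stand-ins for the repo's helpers, cache_dir stands in for TOKENIZER_CACHE_PATH, plain joblib calls replace the chunked serialize/deserialize from app/utils.py, and the click.confirm prompt is reduced to a boolean flag.

import gc
from pathlib import Path

import joblib
import pandas as pd


def load_data(dataset: str) -> tuple[pd.Series, pd.Series]:
    # Hypothetical stand-in for the repo's load_data(): returns (texts, labels).
    texts = pd.Series(["a great film", "a dull film", "great plot, great cast"])
    labels = pd.Series([1, 0, 1])
    return texts, labels


def tokenize(texts: pd.Series) -> pd.Series:
    # Hypothetical stand-in for the repo's tokenize(): naive whitespace split.
    return texts.str.lower().str.split()


def load_tokens_and_labels(dataset: str, cache_dir: Path, use_cache: bool = True) -> tuple[pd.Series, pd.Series]:
    token_cache = cache_dir / f"{dataset}_tokenized.pkl"
    label_cache = cache_dir / f"{dataset}_labels.pkl"

    if use_cache and token_cache.exists() and label_cache.exists():
        # Cache hit: reuse the tokenized text and the labels saved with it,
        # so the raw dataset never has to be loaded or re-tokenized.
        return pd.Series(joblib.load(token_cache)), joblib.load(label_cache)

    # Cache miss: load the raw dataset, tokenize, and cache both artifacts.
    text_data, label_data = load_data(dataset)
    token_data = tokenize(text_data)

    cache_dir.mkdir(parents=True, exist_ok=True)
    joblib.dump(list(token_data), token_cache, compress=3)
    joblib.dump(label_data, label_cache, compress=3)

    # The raw text is no longer needed once tokens are cached; free it early.
    del text_data
    gc.collect()
    return token_data, label_data


tokens, labels = load_tokens_and_labels("demo", Path("cache"))
print(len(tokens), len(labels))

Caching the labels next to the tokens is what lets the else-branch own load_data entirely: on a cache hit nothing touches the raw dataset at all.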
app/utils.py  (CHANGED)
@@ -11,7 +11,7 @@ if TYPE_CHECKING:
 __all__ = ["serialize", "deserialize"]


-def serialize(data: Sequence[str], path: Path, max_size: int = 100000, show_progress: bool = False) -> None:
+def serialize(data: Sequence[str | int], path: Path, max_size: int = 100000, show_progress: bool = False) -> None:
     """Serialize data to a file

     Args:
@@ -20,7 +20,6 @@ def serialize(data: Sequence[str], path: Path, max_size: int = 100000, show_prog
         max_size: The maximum size a chunk can be (in elements)
         show_progress: Whether to show a progress bar
     """
-    # first file is path, next chunks have ".1", ".2", etc. appended
     for i, chunk in enumerate(
         tqdm(
             [data[i : i + max_size] for i in range(0, len(data), max_size)],
@@ -33,7 +32,7 @@ def serialize(data: Sequence[str], path: Path, max_size: int = 100000, show_prog
         joblib.dump(chunk, f, compress=3)


-def deserialize(path: Path) -> Sequence[str]:
+def deserialize(path: Path) -> Sequence[str | int]:
     """Deserialize data from a file

     Args: