Spaces:

Tymec
/

sentiment-analysis

Sleeping

App Files Files

Tymec committed on May 31, 2024

Commit

18cc46a

1 Parent(s): 8471e78

Update options, force GC, tweak parameters and add flags

Browse files

Files changed (2) hide show

app/cli.py +26 -7
app/model.py +7 -5

app/cli.py CHANGED Viewed

@@ -111,8 +111,8 @@ def predict(model_path: Path, text: list[str]) -> None:
 )
 @click.option(
     "--processes",
-    default=8,
-    help="Number of parallel jobs during tokenization",
     show_default=True,
 )
 @click.option(
@@ -129,6 +129,8 @@ def evaluate(
     verbose: bool,
 ) -> None:
     """Evaluate the model on the the specified dataset"""
     import joblib
     from app.constants import CACHE_DIR
@@ -155,13 +157,21 @@ def evaluate(
         click.echo(DONE_STR)
         del text_data
     click.echo("Loading model... ", nl=False)
     model = joblib.load(model_path)
     click.echo(DONE_STR)
     click.echo("Evaluating model... ", nl=False)
-    acc_mean, acc_std = evaluate_model(model, token_data, label_data, folds=cv, verbose=verbose)
     click.secho(f"{acc_mean:.2%} ± {acc_std:.2%}", fg="blue")
@@ -206,10 +216,15 @@ def evaluate(
     type=click.IntRange(-1, None),
 )
 @click.option(
-    "--force",
     is_flag=True,
     help="Overwrite the model file if it already exists",
 )
 @click.option(
     "--verbose",
     is_flag=True,
@@ -222,10 +237,13 @@ def train(
     batch_size: int,
     processes: int,
     seed: int,
-    force: bool,
     verbose: bool,
 ) -> None:
     """Train the model on the provided dataset"""
     import joblib
     from app.constants import CACHE_DIR, MODELS_DIR
@@ -233,12 +251,12 @@ def train(
     from app.model import train_model
     model_path = MODELS_DIR / f"{dataset}_tfidf_ft-{max_features}.pkl"
-    if model_path.exists() and not force:
         click.confirm(f"Model file '{model_path}' already exists. Overwrite?", abort=True)
     cached_data_path = CACHE_DIR / f"{dataset}_tokenized.pkl"
     use_cached_data = False
-    if cached_data_path.exists():
         use_cached_data = click.confirm(f"Found existing tokenized data for '{dataset}'. Use it?", default=True)
     if use_cached_data:
@@ -256,6 +274,7 @@ def train(
         click.echo(DONE_STR)
         del text_data
     click.echo("Training model... ")
     model, accuracy = train_model(

 )
 @click.option(
     "--processes",
+    default=4,
+    help="Number of parallel jobs to run",
     show_default=True,
 )
 @click.option(
     verbose: bool,
 ) -> None:
     """Evaluate the model on the the specified dataset"""
+    import gc
     import joblib
     from app.constants import CACHE_DIR
         click.echo(DONE_STR)
         del text_data
+        gc.collect()
     click.echo("Loading model... ", nl=False)
     model = joblib.load(model_path)
     click.echo(DONE_STR)
     click.echo("Evaluating model... ", nl=False)
+    acc_mean, acc_std = evaluate_model(
+        model,
+        token_data,
+        label_data,
+        folds=cv,
+        n_jobs=processes,
+        verbose=verbose,
+    )
     click.secho(f"{acc_mean:.2%} ± {acc_std:.2%}", fg="blue")
     type=click.IntRange(-1, None),
 )
 @click.option(
+    "--overwrite",
     is_flag=True,
     help="Overwrite the model file if it already exists",
 )
+@click.option(
+    "--skip-cache",
+    is_flag=True,
+    help="Ignore cached tokenized data",
+)
 @click.option(
     "--verbose",
     is_flag=True,
     batch_size: int,
     processes: int,
     seed: int,
+    overwrite: bool,
+    skip_cache: bool,
     verbose: bool,
 ) -> None:
     """Train the model on the provided dataset"""
+    import gc
     import joblib
     from app.constants import CACHE_DIR, MODELS_DIR
     from app.model import train_model
     model_path = MODELS_DIR / f"{dataset}_tfidf_ft-{max_features}.pkl"
+    if model_path.exists() and not overwrite:
         click.confirm(f"Model file '{model_path}' already exists. Overwrite?", abort=True)
     cached_data_path = CACHE_DIR / f"{dataset}_tokenized.pkl"
     use_cached_data = False
+    if cached_data_path.exists() and not skip_cache:
         use_cached_data = click.confirm(f"Found existing tokenized data for '{dataset}'. Use it?", default=True)
     if use_cached_data:
         click.echo(DONE_STR)
         del text_data
+        gc.collect()
     click.echo("Training model... ")
     model, accuracy = train_model(

app/model.py CHANGED Viewed

@@ -99,14 +99,14 @@ def train_model(
         cv=folds,
         random_state=seed,
         n_jobs=n_jobs,
-        verbose=verbose,
         scoring="accuracy",
         n_iter=10,
     )
-    # os.environ["PYTHONWARNINGS"] = "ignore"
     search.fit(text_train, label_train)
-    # del os.environ["PYTHONWARNINGS"]
     best_model = search.best_estimator_
     return best_model, best_model.score(text_test, label_test)
@@ -117,6 +117,7 @@ def evaluate_model(
     token_data: list[str],
     label_data: list[int],
     folds: int = 5,
     verbose: bool = False,
 ) -> tuple[float, float]:
     """Evaluate the model using cross-validation.
@@ -126,6 +127,7 @@ def evaluate_model(
         token_data: Tokenized text data
         label_data: Label data
         folds: Number of cross-validation folds
         verbose: Whether to output additional information
     Returns:
@@ -138,8 +140,8 @@ def evaluate_model(
         label_data,
         cv=folds,
         scoring="accuracy",
-        n_jobs=-1,
-        verbose=verbose,
     )
     del os.environ["PYTHONWARNINGS"]
     return scores.mean(), scores.std()

         cv=folds,
         random_state=seed,
         n_jobs=n_jobs,
+        verbose=2 if verbose else 0,
         scoring="accuracy",
         n_iter=10,
     )
+    os.environ["PYTHONWARNINGS"] = "ignore"
     search.fit(text_train, label_train)
+    del os.environ["PYTHONWARNINGS"]
     best_model = search.best_estimator_
     return best_model, best_model.score(text_test, label_test)
     token_data: list[str],
     label_data: list[int],
     folds: int = 5,
+    n_jobs: int = 4,
     verbose: bool = False,
 ) -> tuple[float, float]:
     """Evaluate the model using cross-validation.
         token_data: Tokenized text data
         label_data: Label data
         folds: Number of cross-validation folds
+        n_jobs: Number of parallel jobs
         verbose: Whether to output additional information
     Returns:
         label_data,
         cv=folds,
         scoring="accuracy",
+        n_jobs=n_jobs,
+        verbose=2 if verbose else 0,
     )
     del os.environ["PYTHONWARNINGS"]
     return scores.mean(), scores.std()