Ignore amazonreviews test
- app/constants.py +1 -1
- app/data.py +9 -18
app/constants.py
CHANGED
@@ -10,7 +10,7 @@ MODELS_DIR = Path(os.getenv("MODELS_DIR", "models"))
 SENTIMENT140_PATH = DATA_DIR / "sentiment140.csv"
 SENTIMENT140_URL = "https://www.kaggle.com/datasets/kazanova/sentiment140"
 
-AMAZONREVIEWS_PATH =
+AMAZONREVIEWS_PATH = DATA_DIR / "amazonreviews.train.txt.bz2"
 AMAZONREVIEWS_URL = "https://www.kaggle.com/datasets/bittlingmayer/amazonreviews"
 
 IMDB50K_PATH = DATA_DIR / "imdb50k.csv"
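The constant now points at a single bz2 archive for the train split only. A minimal sanity-check sketch (not part of this commit) for confirming the downloaded file matches what the loader in app/data.py expects; the literal "data" directory is an assumption, by analogy with the MODELS_DIR default visible in the hunk header:

import bz2
from pathlib import Path

# Assumed local layout: DATA_DIR resolves to "data/" unless overridden.
AMAZONREVIEWS_PATH = Path("data") / "amazonreviews.train.txt.bz2"

# Each line of the fastText-format archive starts with "__label__1" or
# "__label__2", which load_amazonreviews below maps to 0 and 1.
with bz2.open(AMAZONREVIEWS_PATH, mode="rt", encoding="utf-8") as f:
    first_line = f.readline()

print(first_line[:80])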
app/data.py
CHANGED
@@ -82,6 +82,7 @@ def tokenize(
             nlp.pipe(text_data, batch_size=batch_size, n_process=n_jobs),
             total=len(text_data),
             disable=not show_progress,
+            unit="doc",
         )
     ]
 
@@ -138,12 +139,9 @@ def load_sentiment140(include_neutral: bool = False) -> tuple[list[str], list[int]]:
     return data["text"].tolist(), data["sentiment"].tolist()
 
 
-def load_amazonreviews(merge: bool = True) -> tuple[list[str], list[int]]:
+def load_amazonreviews() -> tuple[list[str], list[int]]:
     """Load the amazonreviews dataset and make it suitable for use.
 
-    Args:
-        merge: Whether to merge the test and train datasets (otherwise ignore test)
-
     Returns:
         Text and label data
 
@@ -151,27 +149,20 @@ def load_amazonreviews(merge: bool = True) -> tuple[list[str], list[int]]:
         FileNotFoundError: If the dataset is not found
     """
     # Check if the dataset exists
-    test_exists = AMAZONREVIEWS_PATH[0].exists()
-    train_exists = AMAZONREVIEWS_PATH[1].exists()
-    if not (test_exists and train_exists):
+    if not AMAZONREVIEWS_PATH.exists():
         msg = (
-            f"Amazonreviews dataset not found at: '{AMAZONREVIEWS_PATH
+            f"Amazonreviews dataset not found at: '{AMAZONREVIEWS_PATH}'\n"
             "Please download the dataset from:\n"
             f"{AMAZONREVIEWS_URL}"
         )
         raise FileNotFoundError(msg)
 
-    # Load the dataset
-    dataset = []
-    with bz2.BZ2File(AMAZONREVIEWS_PATH[1]) as train_file:
-        dataset.extend([line.decode("utf-8") for line in train_file])
-
-    if merge:
-        with bz2.BZ2File(AMAZONREVIEWS_PATH[0]) as test_file:
-            dataset.extend([line.decode("utf-8") for line in test_file])
+    # Load the dataset
+    with bz2.BZ2File(AMAZONREVIEWS_PATH) as f:
+        dataset = [line.decode("utf-8") for line in f]
 
     # Split the data into labels and text
-    labels, texts = zip(*(line.split(" ", 1) for line in dataset))
+    labels, texts = zip(*(line.split(" ", 1) for line in dataset))
 
     # Map sentiment values
     sentiments = [int(label.split("__label__")[1]) - 1 for label in labels]
@@ -270,7 +261,7 @@ def load_data(dataset: Literal["sentiment140", "amazonreviews", "imdb50k", "test
         case "sentiment140":
            return load_sentiment140(include_neutral=False)
         case "amazonreviews":
-            return load_amazonreviews(
+            return load_amazonreviews()
         case "imdb50k":
            return load_imdb50k()
         case "test":
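The only change to tokenize is the added unit="doc" argument, which affects nothing but the tqdm progress-bar label. A self-contained sketch of the same pattern; the model name, batch size, and sample texts are stand-ins, not values from the repository:

import spacy
from tqdm import tqdm

nlp = spacy.load("en_core_web_sm")  # stand-in pipeline, not necessarily the app's
text_data = ["This product is great.", "Terrible, do not buy."]

docs = [
    doc
    for doc in tqdm(
        nlp.pipe(text_data, batch_size=32, n_process=1),
        total=len(text_data),
        disable=False,
        unit="doc",  # the rate renders as "doc/s" instead of the default "it/s"
    )
]
print(len(docs))  # 2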
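The heart of the rewritten load_amazonreviews is two lines: split each raw line on the first space, then strip the "__label__" prefix and shift the digit down to a 0/1 sentiment. The same logic on two invented sample lines (the review texts are made up for illustration):

# fastText format: "__label__N <text>"; splitting on the first space only
# keeps spaces inside the review text intact.
dataset = [
    "__label__2 Great battery life, would buy again.\n",
    "__label__1 Stopped working after two days.\n",
]

labels, texts = zip(*(line.split(" ", 1) for line in dataset))

# "__label__2" -> "2" -> 2 - 1 = 1; "__label__1" -> "1" -> 1 - 1 = 0
sentiments = [int(label.split("__label__")[1]) - 1 for label in labels]

print(sentiments)        # [1, 0]
print(texts[0].strip())  # Great battery life, would buy again.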
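With the merge option gone, the dispatch in load_data is a plain call. A hypothetical usage sketch; the app.data import path is taken from the file header above, everything else is illustrative:

# Hypothetical caller of the updated API.
from app.data import load_data

texts, labels = load_data("amazonreviews")  # train split only; test is ignored
print(len(texts), len(labels))
print(labels[:5])  # values are 0 or 1 after the "__label__" shift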