Tymec committed on
Commit
d09d1f6
1 Parent(s): afaacd1

Ignore amazonreviews test

Browse files
Files changed (2) hide show
  1. app/constants.py +1 -1
  2. app/data.py +9 -18
app/constants.py CHANGED
@@ -10,7 +10,7 @@ MODELS_DIR = Path(os.getenv("MODELS_DIR", "models"))
10
  SENTIMENT140_PATH = DATA_DIR / "sentiment140.csv"
11
  SENTIMENT140_URL = "https://www.kaggle.com/datasets/kazanova/sentiment140"
12
 
13
- AMAZONREVIEWS_PATH = (DATA_DIR / "amazonreviews.test.txt.bz2", DATA_DIR / "amazonreviews.train.txt.bz2")
14
  AMAZONREVIEWS_URL = "https://www.kaggle.com/datasets/bittlingmayer/amazonreviews"
15
 
16
  IMDB50K_PATH = DATA_DIR / "imdb50k.csv"
 
10
  SENTIMENT140_PATH = DATA_DIR / "sentiment140.csv"
11
  SENTIMENT140_URL = "https://www.kaggle.com/datasets/kazanova/sentiment140"
12
 
13
+ AMAZONREVIEWS_PATH = DATA_DIR / "amazonreviews.train.txt.bz2"
14
  AMAZONREVIEWS_URL = "https://www.kaggle.com/datasets/bittlingmayer/amazonreviews"
15
 
16
  IMDB50K_PATH = DATA_DIR / "imdb50k.csv"
app/data.py CHANGED
@@ -82,6 +82,7 @@ def tokenize(
82
  nlp.pipe(text_data, batch_size=batch_size, n_process=n_jobs),
83
  total=len(text_data),
84
  disable=not show_progress,
 
85
  )
86
  ]
87
 
@@ -138,12 +139,9 @@ def load_sentiment140(include_neutral: bool = False) -> tuple[list[str], list[in
138
  return data["text"].tolist(), data["sentiment"].tolist()
139
 
140
 
141
- def load_amazonreviews(merge: bool = True) -> tuple[list[str], list[int]]:
142
  """Load the amazonreviews dataset and make it suitable for use.
143
 
144
- Args:
145
- merge: Whether to merge the test and train datasets (otherwise ignore test)
146
-
147
  Returns:
148
  Text and label data
149
 
@@ -151,27 +149,20 @@ def load_amazonreviews(merge: bool = True) -> tuple[list[str], list[int]]:
151
  FileNotFoundError: If the dataset is not found
152
  """
153
  # Check if the dataset exists
154
- test_exists = AMAZONREVIEWS_PATH[0].exists() or not merge
155
- train_exists = AMAZONREVIEWS_PATH[1].exists()
156
- if not (test_exists and train_exists):
157
  msg = (
158
- f"Amazonreviews dataset not found at: '{AMAZONREVIEWS_PATH[0]}' and '{AMAZONREVIEWS_PATH[1]}'\n"
159
  "Please download the dataset from:\n"
160
  f"{AMAZONREVIEWS_URL}"
161
  )
162
  raise FileNotFoundError(msg)
163
 
164
- # Load the datasets
165
- dataset = []
166
- with bz2.BZ2File(AMAZONREVIEWS_PATH[1]) as train_file:
167
- dataset.extend([line.decode("utf-8") for line in train_file])
168
-
169
- if merge:
170
- with bz2.BZ2File(AMAZONREVIEWS_PATH[0]) as test_file:
171
- dataset.extend([line.decode("utf-8") for line in test_file])
172
 
173
  # Split the data into labels and text
174
- labels, texts = zip(*(line.split(" ", 1) for line in dataset)) # NOTE: Occasionally OOM
175
 
176
  # Map sentiment values
177
  sentiments = [int(label.split("__label__")[1]) - 1 for label in labels]
@@ -270,7 +261,7 @@ def load_data(dataset: Literal["sentiment140", "amazonreviews", "imdb50k", "test
270
  case "sentiment140":
271
  return load_sentiment140(include_neutral=False)
272
  case "amazonreviews":
273
- return load_amazonreviews(merge=True)
274
  case "imdb50k":
275
  return load_imdb50k()
276
  case "test":
 
82
  nlp.pipe(text_data, batch_size=batch_size, n_process=n_jobs),
83
  total=len(text_data),
84
  disable=not show_progress,
85
+ unit="doc",
86
  )
87
  ]
88
 
 
139
  return data["text"].tolist(), data["sentiment"].tolist()
140
 
141
 
142
+ def load_amazonreviews() -> tuple[list[str], list[int]]:
143
  """Load the amazonreviews dataset and make it suitable for use.
144
 
 
 
 
145
  Returns:
146
  Text and label data
147
 
 
149
  FileNotFoundError: If the dataset is not found
150
  """
151
  # Check if the dataset exists
152
+ if not AMAZONREVIEWS_PATH.exists():
 
 
153
  msg = (
154
+ f"Amazonreviews dataset not found at: '{AMAZONREVIEWS_PATH}'\n"
155
  "Please download the dataset from:\n"
156
  f"{AMAZONREVIEWS_URL}"
157
  )
158
  raise FileNotFoundError(msg)
159
 
160
+ # Load the dataset
161
+ with bz2.BZ2File(AMAZONREVIEWS_PATH) as f:
162
+ dataset = [line.decode("utf-8") for line in f]
 
 
 
 
 
163
 
164
  # Split the data into labels and text
165
+ labels, texts = zip(*(line.split(" ", 1) for line in dataset))
166
 
167
  # Map sentiment values
168
  sentiments = [int(label.split("__label__")[1]) - 1 for label in labels]
 
261
  case "sentiment140":
262
  return load_sentiment140(include_neutral=False)
263
  case "amazonreviews":
264
+ return load_amazonreviews()
265
  case "imdb50k":
266
  return load_imdb50k()
267
  case "test":