Ignore amazonreviews test
- app/constants.py +1 -1
- app/data.py +9 -18
app/constants.py
CHANGED
@@ -10,7 +10,7 @@ MODELS_DIR = Path(os.getenv("MODELS_DIR", "models"))
 SENTIMENT140_PATH = DATA_DIR / "sentiment140.csv"
 SENTIMENT140_URL = "https://www.kaggle.com/datasets/kazanova/sentiment140"
 
-AMAZONREVIEWS_PATH =
+AMAZONREVIEWS_PATH = DATA_DIR / "amazonreviews.train.txt.bz2"
 AMAZONREVIEWS_URL = "https://www.kaggle.com/datasets/bittlingmayer/amazonreviews"
 
 IMDB50K_PATH = DATA_DIR / "imdb50k.csv"
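The constant now points at a single bz2 archive for the train split only. A minimal sanity-check sketch (not part of this commit) for confirming the downloaded file matches what the loader in app/data.py expects; the literal "data" directory is an assumption, by analogy with the MODELS_DIR default visible in the hunk header:

import bz2
from pathlib import Path

# Assumed local layout: DATA_DIR resolves to "data/" unless overridden.
AMAZONREVIEWS_PATH = Path("data") / "amazonreviews.train.txt.bz2"

# Each line of the fastText-format archive starts with "__label__1" or
# "__label__2", which load_amazonreviews below maps to 0 and 1.
with bz2.open(AMAZONREVIEWS_PATH, mode="rt", encoding="utf-8") as f:
    first_line = f.readline()

print(first_line[:80])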
app/data.py
CHANGED
@@ -82,6 +82,7 @@ def tokenize(
             nlp.pipe(text_data, batch_size=batch_size, n_process=n_jobs),
             total=len(text_data),
             disable=not show_progress,
+            unit="doc",
         )
     ]
 
@@ -138,12 +139,9 @@ def load_sentiment140(include_neutral: bool = False) -> tuple[list[str], list[int]]:
     return data["text"].tolist(), data["sentiment"].tolist()
 
 
-def load_amazonreviews(merge: bool = True) -> tuple[list[str], list[int]]:
+def load_amazonreviews() -> tuple[list[str], list[int]]:
     """Load the amazonreviews dataset and make it suitable for use.
 
-    Args:
-        merge: Whether to merge the test and train datasets (otherwise ignore test)
-
     Returns:
         Text and label data
 
@@ -151,27 +149,20 @@ def load_amazonreviews(merge: bool = True) -> tuple[list[str], list[int]]:
         FileNotFoundError: If the dataset is not found
     """
     # Check if the dataset exists
-    test_exists = AMAZONREVIEWS_PATH[0].exists()
-    train_exists = AMAZONREVIEWS_PATH[1].exists()
-    if not (test_exists and train_exists):
+    if not AMAZONREVIEWS_PATH.exists():
         msg = (
-            f"Amazonreviews dataset not found at: '{AMAZONREVIEWS_PATH
+            f"Amazonreviews dataset not found at: '{AMAZONREVIEWS_PATH}'\n"
             "Please download the dataset from:\n"
             f"{AMAZONREVIEWS_URL}"
         )
         raise FileNotFoundError(msg)
 
-    # Load the dataset
-    dataset = []
-    with bz2.BZ2File(AMAZONREVIEWS_PATH[1]) as train_file:
-        dataset.extend([line.decode("utf-8") for line in train_file])
-
-    if merge:
-        with bz2.BZ2File(AMAZONREVIEWS_PATH[0]) as test_file:
-            dataset.extend([line.decode("utf-8") for line in test_file])
+    # Load the dataset
+    with bz2.BZ2File(AMAZONREVIEWS_PATH) as f:
+        dataset = [line.decode("utf-8") for line in f]
 
     # Split the data into labels and text
-    labels, texts = zip(*(line.split(" ", 1) for line in dataset))
+    labels, texts = zip(*(line.split(" ", 1) for line in dataset))
 
     # Map sentiment values
     sentiments = [int(label.split("__label__")[1]) - 1 for label in labels]
@@ -270,7 +261,7 @@ def load_data(dataset: Literal["sentiment140", "amazonreviews", "imdb50k", "test
         case "sentiment140":
            return load_sentiment140(include_neutral=False)
         case "amazonreviews":
-            return load_amazonreviews(
+            return load_amazonreviews()
         case "imdb50k":
            return load_imdb50k()
         case "test":
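The only change to tokenize is the added unit="doc" argument, which affects nothing but the tqdm progress-bar label. A self-contained sketch of the same pattern; the model name, batch size, and sample texts are stand-ins, not values from the repository:

import spacy
from tqdm import tqdm

nlp = spacy.load("en_core_web_sm")  # stand-in pipeline, not necessarily the app's
text_data = ["This product is great.", "Terrible, do not buy."]

docs = [
    doc
    for doc in tqdm(
        nlp.pipe(text_data, batch_size=32, n_process=1),
        total=len(text_data),
        disable=False,
        unit="doc",  # the rate renders as "doc/s" instead of the default "it/s"
    )
]
print(len(docs))  # 2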
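The heart of the rewritten load_amazonreviews is two lines: split each raw line on the first space, then strip the "__label__" prefix and shift the digit down to a 0/1 sentiment. The same logic on two invented sample lines (the review texts are made up for illustration):

# fastText format: "__label__N <text>"; splitting on the first space only
# keeps spaces inside the review text intact.
dataset = [
    "__label__2 Great battery life, would buy again.\n",
    "__label__1 Stopped working after two days.\n",
]

labels, texts = zip(*(line.split(" ", 1) for line in dataset))

# "__label__2" -> "2" -> 2 - 1 = 1; "__label__1" -> "1" -> 1 - 1 = 0
sentiments = [int(label.split("__label__")[1]) - 1 for label in labels]

print(sentiments)        # [1, 0]
print(texts[0].strip())  # Great battery life, would buy again.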
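With the merge option gone, the dispatch in load_data is a plain call. A hypothetical usage sketch; the app.data import path is taken from the file header above, everything else is illustrative:

# Hypothetical caller of the updated API.
from app.data import load_data

texts, labels = load_data("amazonreviews")  # train split only; test is ignored
print(len(texts), len(labels))
print(labels[:5])  # values are 0 or 1 after the "__label__" shift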