Commit: Add min-df option

Changed files:
- app/cli.py   (+9 -0)
- app/model.py (+7 -1)

app/cli.py (CHANGED)
@@ -215,6 +215,13 @@ def evaluate(
     show_default=True,
     type=click.IntRange(1, None),
 )
+@click.option(
+    "--min-df",
+    default=0.1,
+    help="Minimum document frequency for the vectorizer (ignored for hashing)",
+    show_default=True,
+    type=click.FloatRange(0, 1),
+)
 @click.option(
     "--cv",
     default=5,
@@ -261,6 +268,7 @@ def train(
     dataset: Literal["sentiment140", "amazonreviews", "imdb50k"],
     vectorizer: Literal["tfidf", "count", "hashing"],
     max_features: int,
+    min_df: float,
     cv: int,
     token_batch_size: int,
     token_jobs: int,
@@ -324,6 +332,7 @@ def train(
         label_data,
         vectorizer=vectorizer,
         max_features=max_features,
+        min_df=min_df,
         folds=cv,
         n_jobs=train_jobs,
         seed=seed,
app/model.py (CHANGED)

@@ -36,6 +36,7 @@ def _identity(x: list[str]) -> list[str]:
 def _get_vectorizer(
     name: Literal["tfidf", "count", "hashing"],
     n_features: int,
+    min_df: float = 0.1,
     ngram: tuple[int, int] = (1, 2),
 ) -> TransformerMixin:
     """Get the appropriate vectorizer.
@@ -43,6 +44,7 @@ def _get_vectorizer(
     Args:
         name: Type of vectorizer
         n_features: Maximum number of features
+        min_df: Minimum document frequency (ignored for hashing)
         ngram: N-gram range [min_n, max_n]

     Returns:
@@ -64,11 +66,13 @@ def _get_vectorizer(
         case "tfidf":
             return TfidfVectorizer(
                 max_features=n_features,
+                min_df=min_df,
                 **shared_params,
             )
         case "count":
             return CountVectorizer(
                 max_features=n_features,
+                min_df=min_df,
                 **shared_params,
             )
         case "hashing":
@@ -92,6 +96,7 @@ def train_model(
     label_data: list[int],
     vectorizer: Literal["tfidf", "count", "hashing"],
     max_features: int,
+    min_df: float = 0.1,
     folds: int = 5,
     n_jobs: int = 4,
     seed: int = 42,
@@ -103,6 +108,7 @@ def train_model(
         label_data: Label data
         vectorizer: Which vectorizer to use
         max_features: Maximum number of features
+        min_df: Minimum document frequency (ignored for hashing)
         folds: Number of cross-validation folds
         n_jobs: Number of parallel jobs
         seed: Random seed (None for random seed)
@@ -122,7 +128,7 @@ def train_model(
         random_state=rs,
     )

-    vectorizer = _get_vectorizer(vectorizer, max_features)
+    vectorizer = _get_vectorizer(vectorizer, max_features, min_df)
     classifiers = [
         (LogisticRegression(max_iter=1000, random_state=rs), {"C": np.logspace(-4, 4, 20)}),
         # (LinearSVC(max_iter=10000, random_state=rs), {"C": np.logspace(-4, 4, 20)}),