Tymec committed on
Commit
8b10b79
1 Parent(s): 7f29122

Add min-df option

Browse files
Files changed (2) hide show
  1. app/cli.py +9 -0
  2. app/model.py +7 -1
app/cli.py CHANGED
@@ -215,6 +215,13 @@ def evaluate(
215
  show_default=True,
216
  type=click.IntRange(1, None),
217
  )
 
 
 
 
 
 
 
218
  @click.option(
219
  "--cv",
220
  default=5,
@@ -261,6 +268,7 @@ def train(
261
  dataset: Literal["sentiment140", "amazonreviews", "imdb50k"],
262
  vectorizer: Literal["tfidf", "count", "hashing"],
263
  max_features: int,
 
264
  cv: int,
265
  token_batch_size: int,
266
  token_jobs: int,
@@ -324,6 +332,7 @@ def train(
324
  label_data,
325
  vectorizer=vectorizer,
326
  max_features=max_features,
 
327
  folds=cv,
328
  n_jobs=train_jobs,
329
  seed=seed,
 
215
  show_default=True,
216
  type=click.IntRange(1, None),
217
  )
218
+ @click.option(
219
+ "--min-df",
220
+ default=0.1,
221
+ help="Minimum document frequency for the vectorizer (ignored for hashing)",
222
+ show_default=True,
223
+ type=click.FloatRange(0, 1),
224
+ )
225
  @click.option(
226
  "--cv",
227
  default=5,
 
268
  dataset: Literal["sentiment140", "amazonreviews", "imdb50k"],
269
  vectorizer: Literal["tfidf", "count", "hashing"],
270
  max_features: int,
271
+ min_df: float,
272
  cv: int,
273
  token_batch_size: int,
274
  token_jobs: int,
 
332
  label_data,
333
  vectorizer=vectorizer,
334
  max_features=max_features,
335
+ min_df=min_df,
336
  folds=cv,
337
  n_jobs=train_jobs,
338
  seed=seed,
app/model.py CHANGED
@@ -36,6 +36,7 @@ def _identity(x: list[str]) -> list[str]:
36
  def _get_vectorizer(
37
  name: Literal["tfidf", "count", "hashing"],
38
  n_features: int,
 
39
  ngram: tuple[int, int] = (1, 2),
40
  ) -> TransformerMixin:
41
  """Get the appropriate vectorizer.
@@ -43,6 +44,7 @@ def _get_vectorizer(
43
  Args:
44
  name: Type of vectorizer
45
  n_features: Maximum number of features
 
46
  ngram: N-gram range [min_n, max_n]
47
 
48
  Returns:
@@ -64,11 +66,13 @@ def _get_vectorizer(
64
  case "tfidf":
65
  return TfidfVectorizer(
66
  max_features=n_features,
 
67
  **shared_params,
68
  )
69
  case "count":
70
  return CountVectorizer(
71
  max_features=n_features,
 
72
  **shared_params,
73
  )
74
  case "hashing":
@@ -92,6 +96,7 @@ def train_model(
92
  label_data: list[int],
93
  vectorizer: Literal["tfidf", "count", "hashing"],
94
  max_features: int,
 
95
  folds: int = 5,
96
  n_jobs: int = 4,
97
  seed: int = 42,
@@ -103,6 +108,7 @@ def train_model(
103
  label_data: Label data
104
  vectorizer: Which vectorizer to use
105
  max_features: Maximum number of features
 
106
  folds: Number of cross-validation folds
107
  n_jobs: Number of parallel jobs
108
  seed: Random seed (None for random seed)
@@ -122,7 +128,7 @@ def train_model(
122
  random_state=rs,
123
  )
124
 
125
- vectorizer = _get_vectorizer(vectorizer, max_features)
126
  classifiers = [
127
  (LogisticRegression(max_iter=1000, random_state=rs), {"C": np.logspace(-4, 4, 20)}),
128
  # (LinearSVC(max_iter=10000, random_state=rs), {"C": np.logspace(-4, 4, 20)}),
 
36
  def _get_vectorizer(
37
  name: Literal["tfidf", "count", "hashing"],
38
  n_features: int,
39
+ min_df: float = 0.1,
40
  ngram: tuple[int, int] = (1, 2),
41
  ) -> TransformerMixin:
42
  """Get the appropriate vectorizer.
 
44
  Args:
45
  name: Type of vectorizer
46
  n_features: Maximum number of features
47
+ min_df: Minimum document frequency (ignored for hashing)
48
  ngram: N-gram range [min_n, max_n]
49
 
50
  Returns:
 
66
  case "tfidf":
67
  return TfidfVectorizer(
68
  max_features=n_features,
69
+ min_df=min_df,
70
  **shared_params,
71
  )
72
  case "count":
73
  return CountVectorizer(
74
  max_features=n_features,
75
+ min_df=min_df,
76
  **shared_params,
77
  )
78
  case "hashing":
 
96
  label_data: list[int],
97
  vectorizer: Literal["tfidf", "count", "hashing"],
98
  max_features: int,
99
+ min_df: float = 0.1,
100
  folds: int = 5,
101
  n_jobs: int = 4,
102
  seed: int = 42,
 
108
  label_data: Label data
109
  vectorizer: Which vectorizer to use
110
  max_features: Maximum number of features
111
+ min_df: Minimum document frequency (ignored for hashing)
112
  folds: Number of cross-validation folds
113
  n_jobs: Number of parallel jobs
114
  seed: Random seed (None for random seed)
 
128
  random_state=rs,
129
  )
130
 
131
+ vectorizer = _get_vectorizer(vectorizer, max_features, min_df)
132
  classifiers = [
133
  (LogisticRegression(max_iter=1000, random_state=rs), {"C": np.logspace(-4, 4, 20)}),
134
  # (LinearSVC(max_iter=10000, random_state=rs), {"C": np.logspace(-4, 4, 20)}),