Spaces:
Running
Running
Improved models
Browse files
- README.md +2 -2
- app/model.py +0 -6
- models/imdb50k_tfidf_ft20000.pkl +2 -2
- models/sentiment140_tfidf_ft20000.pkl +2 -2
README.md
CHANGED
@@ -211,8 +211,8 @@ graph LR
|
|
211 |
The following pre-trained models are available for use:
|
212 |
| Dataset | Vectorizer | Classifier | Features | Accuracy on test | Accuracy on self | Model |
|
213 |
| --- | --- | --- | --- | --- | --- | --- |
|
214 |
-
| `imdb50k` | `tfidf` | `LinearRegression` | 20 000 |
|
215 |
-
| `sentiment140` | `tfidf` | `LinearRegression` | 20 000 |
|
216 |
| `amazonreviews` | `tfidf` | `LinearRegression` | 20 000 | ❌ | ❌ | [Here](models/amazonreviews_tfidf_ft1048576.pkl) |
|
217 |
|
218 |
|
|
|
211 |
The following pre-trained models are available for use:
|
212 |
| Dataset | Vectorizer | Classifier | Features | Accuracy on test | Accuracy on self | Model |
|
213 |
| --- | --- | --- | --- | --- | --- | --- |
|
214 |
+
| `imdb50k` | `tfidf` | `LinearRegression` | 20 000 | 83.24% ± 0.99% | 89.24% ± 0.13% | [Here](models/imdb50k_tfidf_ft20000.pkl) |
|
215 |
+
| `sentiment140` | `tfidf` | `LinearRegression` | 20 000 | 83.24% ± 0.99% | 77.32% ± 0.28% | [Here](models/sentiment140_tfidf_ft20000.pkl) |
|
216 |
| `amazonreviews` | `tfidf` | `LinearRegression` | 20 000 | ❌ | ❌ | [Here](models/amazonreviews_tfidf_ft1048576.pkl) |
|
217 |
|
218 |
|
app/model.py
CHANGED
@@ -36,7 +36,6 @@ def _identity(x: list[str]) -> list[str]:
|
|
36 |
def _get_vectorizer(
|
37 |
name: Literal["tfidf", "count", "hashing"],
|
38 |
n_features: int,
|
39 |
-
df: tuple[float, float] = (1.0, 1.0),
|
40 |
ngram: tuple[int, int] = (1, 2),
|
41 |
) -> TransformerMixin:
|
42 |
"""Get the appropriate vectorizer.
|
@@ -44,7 +43,6 @@ def _get_vectorizer(
|
|
44 |
Args:
|
45 |
name: Type of vectorizer
|
46 |
n_features: Maximum number of features
|
47 |
-
df: Document frequency range [min_df, max_df] (ignored for HashingVectorizer)
|
48 |
ngram: N-gram range [min_n, max_n]
|
49 |
|
50 |
Returns:
|
@@ -66,15 +64,11 @@ def _get_vectorizer(
|
|
66 |
case "tfidf":
|
67 |
return TfidfVectorizer(
|
68 |
max_features=n_features,
|
69 |
-
min_df=df[0],
|
70 |
-
max_df=df[1],
|
71 |
**shared_params,
|
72 |
)
|
73 |
case "count":
|
74 |
return CountVectorizer(
|
75 |
max_features=n_features,
|
76 |
-
min_df=df[0],
|
77 |
-
max_df=df[1],
|
78 |
**shared_params,
|
79 |
)
|
80 |
case "hashing":
|
|
|
36 |
def _get_vectorizer(
|
37 |
name: Literal["tfidf", "count", "hashing"],
|
38 |
n_features: int,
|
|
|
39 |
ngram: tuple[int, int] = (1, 2),
|
40 |
) -> TransformerMixin:
|
41 |
"""Get the appropriate vectorizer.
|
|
|
43 |
Args:
|
44 |
name: Type of vectorizer
|
45 |
n_features: Maximum number of features
|
|
|
46 |
ngram: N-gram range [min_n, max_n]
|
47 |
|
48 |
Returns:
|
|
|
64 |
case "tfidf":
|
65 |
return TfidfVectorizer(
|
66 |
max_features=n_features,
|
|
|
|
|
67 |
**shared_params,
|
68 |
)
|
69 |
case "count":
|
70 |
return CountVectorizer(
|
71 |
max_features=n_features,
|
|
|
|
|
72 |
**shared_params,
|
73 |
)
|
74 |
case "hashing":
|
models/imdb50k_tfidf_ft20000.pkl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0c916d380fc84a33f3cb5892cd10e4aaa29330cbbac4243860e91fe9392df897
|
3 |
+
size 398706
|
models/sentiment140_tfidf_ft20000.pkl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1308cb96bbee2befeb585c99fb3ad78b4bbef0504fcb5070d8c738289c212431
|
3 |
+
size 397501
|