Pietro Lesci committed
Commit 02c2d7e
Parent(s): a66b528

divide into individual files

Files changed (4):
  1. src/plotting.py +71 -0
  2. src/preprocessing.py +200 -0
  3. src/utils.py +9 -260
  4. src/wordifier.py +87 -0
src/plotting.py ADDED
@@ -0,0 +1,71 @@
+ import altair as alt
+ import pandas as pd
+ import streamlit as st
+ from stqdm import stqdm
+
+ stqdm.pandas()
+
+
+ def plot_labels_prop(data: pd.DataFrame, label_column: str):
+
+     unique_value_limit = 100
+
+     if data[label_column].nunique() > unique_value_limit:
+
+         st.warning(
+             f"""
+             The column you selected has more than {unique_value_limit} unique values.
+             Are you sure it's the right column? If it is, please note that
+             this will impact __Wordify__ performance.
+             """
+         )
+
+         return
+
+     source = data[label_column].value_counts().reset_index().rename(columns={"index": "Labels", label_column: "Counts"})
+     source["Props"] = source["Counts"] / source["Counts"].sum()
+     source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"
+
+     bars = (
+         alt.Chart(source)
+         .mark_bar()
+         .encode(
+             x=alt.X("Labels:O", sort="-y"),
+             y="Counts:Q",
+         )
+     )
+
+     text = bars.mark_text(align="center", baseline="middle", dy=15).encode(text="Proportions:O")
+
+     return (bars + text).properties(height=300)
+
+
+ def plot_nchars(data: pd.DataFrame, text_column: str):
+     source = data[text_column].str.len().to_frame()
+
+     plot = (
+         alt.Chart(source)
+         .mark_bar()
+         .encode(
+             alt.X(f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")),
+             alt.Y("count()", axis=alt.Axis(title="")),
+         )
+     )
+
+     return plot.properties(height=300)
+
+
+ def plot_score(data: pd.DataFrame, label_col: str, label: str):
+
+     source = data.loc[data[label_col] == label].sort_values("score", ascending=False).head(100)
+
+     plot = (
+         alt.Chart(source)
+         .mark_bar()
+         .encode(
+             y=alt.Y("word:O", sort="-x"),
+             x="score:Q",
+         )
+     )
+
+     return plot.properties(height=max(30 * source.shape[0], 50))
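
Below is a minimal usage sketch (not part of the commit) showing how the new plotting helpers might be wired into a Streamlit app; the example DataFrame and its column names are assumptions.

import pandas as pd
import streamlit as st

from src.plotting import plot_labels_prop, plot_nchars, plot_score

data = pd.DataFrame(
    {
        "text": ["great movie", "terrible plot", "loved it", "boring"],
        "label": ["pos", "neg", "pos", "neg"],
    }
)

# label distribution (bars annotated with percentages) and text-length histogram
st.altair_chart(plot_labels_prop(data, "label"), use_container_width=True)
st.altair_chart(plot_nchars(data, "text"), use_container_width=True)

# `scores` mimics the wordifier output (columns: word, score, label)
scores = pd.DataFrame({"word": ["great", "boring"], "score": [0.9, 0.8], "label": ["pos", "neg"]})
st.altair_chart(plot_score(scores, "label", "pos"), use_container_width=True)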
src/preprocessing.py ADDED
@@ -0,0 +1,200 @@
+ import re
+ import string
+ from collections import OrderedDict
+ from typing import Callable, Dict, List
+
+ import numpy as np
+ import pandas as pd
+ import spacy
+ import streamlit as st
+ from pandas.core.series import Series
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.preprocessing import LabelEncoder
+ from stqdm import stqdm
+ from textacy.preprocessing import make_pipeline, normalize, remove, replace
+
+ from .configs import Languages
+
+ stqdm.pandas()
+
+
+ def encode(text: pd.Series, labels: pd.Series):
+     """
+     Encodes text into a mathematical object amenable to the training algorithm
+     """
+     tfidf_vectorizer = TfidfVectorizer(
+         input="content",  # default: file already in memory
+         encoding="utf-8",  # default
+         decode_error="strict",  # default
+         strip_accents=None,  # do nothing
+         lowercase=False,  # do nothing
+         preprocessor=None,  # do nothing - default
+         tokenizer=None,  # default
+         stop_words=None,  # do nothing
+         analyzer="word",
+         ngram_range=(1, 3),  # up to 3-grams
+         min_df=0.001,
+         max_df=0.75,
+         sublinear_tf=True,
+     )
+     label_encoder = LabelEncoder()
+
+     with st.spinner("Encoding text using TF-IDF and encoding labels"):
+         X = tfidf_vectorizer.fit_transform(text.values)
+         y = label_encoder.fit_transform(labels.values)
+
+     return {
+         "X": X,
+         "y": y,
+         "X_names": np.array(tfidf_vectorizer.get_feature_names()),
+         "y_names": label_encoder.classes_,
+     }
+
+
+ # more [here](https://github.com/fastai/fastai/blob/master/fastai/text/core.py#L42)
+ # and [here](https://textacy.readthedocs.io/en/latest/api_reference/preprocessing.html)
+ # fmt: off
+ _re_normalize_acronyms = re.compile(r"(?:[a-zA-Z]\.){2,}")
+ def normalize_acronyms(t):
+     # uppercase each matched acronym and drop its periods (e.g., "u.s.a." -> "USA");
+     # the replacement must be computed per match, hence the lambda
+     return _re_normalize_acronyms.sub(lambda m: m.group().translate(str.maketrans("", "", string.punctuation)).upper(), t)
+
+
+ _re_non_word = re.compile(r"\W")
+ def remove_non_word(t):
+     return _re_non_word.sub(" ", t)
+
+
+ _re_space = re.compile(r" {2,}")
+ def normalize_useless_spaces(t):
+     return _re_space.sub(" ", t)
+
+
+ _re_rep = re.compile(r"(\S)(\1{2,})")
+ def normalize_repeating_chars(t):
+     # collapse 3+ repetitions of a character into a single occurrence
+     def _replace_rep(m):
+         c, cc = m.groups()
+         return c
+
+     return _re_rep.sub(_replace_rep, t)
+
+
+ _re_wrep = re.compile(r"(?:\s|^)(\w+)\s+((?:\1\s+)+)\1(\s|\W|$)")
+ def normalize_repeating_words(t):
+     # collapse repeated words into a single occurrence, keeping the separators;
+     # any extra leading space is cleaned up by later steps
+     def _replace_wrep(m):
+         c, cc, e = m.groups()
+         return " " + c + e
+
+     return _re_wrep.sub(_replace_wrep, t)
+
+ # fmt: on
+
+
+ class TextPreprocessor:
+     def __init__(
+         self,
+         language: str,
+         cleaning_steps: List[str],
+         lemmatizer_when: str = "last",
+         remove_stop: bool = True,
+     ) -> None:
+
+         # prepare lemmatizer
+         self.language = language
+         self.nlp = spacy.load(
+             Languages[language].value, exclude=["parser", "ner", "pos", "tok2vec"]
+         )
+         self.lemmatizer_when = self._lemmatization_options().get(lemmatizer_when, None)
+         self.remove_stop = remove_stop
+         self._lemmatize = self._get_lemmatizer()
+
+         # prepare cleaning
+         self.cleaning_steps = [
+             self._cleaning_options()[step]
+             for step in cleaning_steps
+             if step in self._cleaning_options()
+         ]
+         self.cleaning_pipeline = (
+             make_pipeline(*self.cleaning_steps) if self.cleaning_steps else lambda x: x
+         )
+
+     def _get_lemmatizer(self) -> Callable:
+         """Return the correct spacy Doc-level lemmatizer"""
+         if self.remove_stop:
+
+             def lemmatizer(doc: spacy.tokens.doc.Doc) -> str:
+                 """Lemmatizes spacy Doc and removes stopwords"""
+                 return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-" and not t.is_stop])
+
+         else:
+
+             def lemmatizer(doc: spacy.tokens.doc.Doc) -> str:
+                 """Lemmatizes spacy Doc"""
+                 return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-"])
+
+         return lemmatizer
+
+     @staticmethod
+     def _lemmatization_options() -> Dict[str, str]:
+         return {
+             "Before preprocessing": "first",
+             "After preprocessing": "last",
+             "Never! Let's do it quick and dirty": None,
+         }
+
+     def lemmatizer(self, series: pd.Series) -> pd.Series:
+         """
+         Applies the spacy pipeline to transform strings into spacy Docs and lemmatizes them
+         """
+         res = []
+         pbar = stqdm(total=len(series))
+         for doc in self.nlp.pipe(series, batch_size=500):
+             res.append(self._lemmatize(doc))
+             pbar.update(1)
+         pbar.close()
+         return pd.Series(res)
+
+     @staticmethod
+     def _cleaning_options():
+         """Returns available cleaning steps in order"""
+         return OrderedDict(
+             [
+                 ("lower", lambda x: x.lower()),
+                 ("normalize_unicode", normalize.unicode),
+                 ("normalize_bullet_points", normalize.bullet_points),
+                 ("normalize_hyphenated_words", normalize.hyphenated_words),
+                 ("normalize_quotation_marks", normalize.quotation_marks),
+                 ("normalize_whitespace", normalize.whitespace),
+                 ("replace_urls", replace.urls),
+                 ("replace_currency_symbols", replace.currency_symbols),
+                 ("replace_emails", replace.emails),
+                 ("replace_emojis", replace.emojis),
+                 ("replace_hashtags", replace.hashtags),
+                 ("replace_numbers", replace.numbers),
+                 ("replace_phone_numbers", replace.phone_numbers),
+                 ("replace_user_handles", replace.user_handles),
+                 ("normalize_acronyms", normalize_acronyms),
+                 ("remove_accents", remove.accents),
+                 ("remove_brackets", remove.brackets),
+                 ("remove_html_tags", remove.html_tags),
+                 ("remove_punctuation", remove.punctuation),
+                 ("remove_non_words", remove_non_word),
+                 ("normalize_useless_spaces", normalize_useless_spaces),
+                 ("normalize_repeating_chars", normalize_repeating_chars),
+                 ("normalize_repeating_words", normalize_repeating_words),
+                 ("strip", lambda x: x.strip()),
+             ]
+         )
+
+     def fit_transform(self, series: pd.Series) -> Series:
+         """Applies text preprocessing"""
+
+         if self.lemmatizer_when == "first":
+             with st.spinner("Lemmatizing"):
+                 series = self.lemmatizer(series)
+
+         with st.spinner("Cleaning"):
+             series = series.progress_map(self.cleaning_pipeline)
+
+         if self.lemmatizer_when == "last":
+             with st.spinner("Lemmatizing"):
+                 series = self.lemmatizer(series)
+
+         return series
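
A minimal sketch (not part of the commit) of how the new preprocessing module might be used; it assumes the Languages enum in src/configs.py has an "English" member that maps to an installed spaCy model, and the example data is made up.

import pandas as pd

from src.preprocessing import TextPreprocessor, encode

data = pd.DataFrame(
    {
        "text": ["I LOVED this movie!!!", "Sooo boring... would not recommend"],
        "label": ["pos", "neg"],
    }
)

# clean first, then lemmatize (lemmatizer_when="last" is the default)
preprocessor = TextPreprocessor(
    language="English",  # assumption: a key of the Languages enum
    cleaning_steps=["lower", "normalize_useless_spaces", "strip"],
)
clean_text = preprocessor.fit_transform(data["text"])

# TF-IDF features and encoded labels, ready for the wordifier
encoded = encode(clean_text, data["label"])
print(encoded["X"].shape, encoded["y_names"])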
src/utils.py CHANGED
@@ -1,24 +1,12 @@
  import base64
- import re
- from collections import OrderedDict
- from typing import Callable, Dict, List
-
  import altair as alt
- import numpy as np
  import pandas as pd
- import spacy
  import streamlit as st
- from pandas.core.series import Series
  from PIL import Image
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.linear_model import LogisticRegression
- from sklearn.preprocessing import LabelEncoder
- from sklearn.utils import resample
  from stqdm import stqdm
- from textacy.preprocessing import make_pipeline, normalize, remove, replace

- from .configs import Languages, ModelConfigs, SupportedFiles
- import string
+ from .configs import SupportedFiles
+
  stqdm.pandas()


@@ -27,7 +15,7 @@ def get_logo(path):
      return Image.open(path)


- # @st.cache(suppress_st_warning=True)
+ # @st.cache(suppress_st_warning=True)
  @st.cache(allow_output_mutation=True)
  def read_file(uploaded_file) -> pd.DataFrame:

@@ -51,258 +39,19 @@ def download_button(dataframe: pd.DataFrame, name: str):
      st.write(href, unsafe_allow_html=True)


- def encode(text: pd.Series, labels: pd.Series):
-     tfidf_vectorizer = TfidfVectorizer(
-         input="content",  # default: file already in memory
-         encoding="utf-8",  # default
-         decode_error="strict",  # default
-         strip_accents=None,  # do nothing
-         lowercase=False,  # do nothing
-         preprocessor=None,  # do nothing - default
-         tokenizer=None,  # default
-         stop_words=None,  # do nothing
-         analyzer="word",
-         ngram_range=(1, 3),  # maximum 3-ngrams
-         min_df=0.001,
-         max_df=0.75,
-         sublinear_tf=True,
-     )
-     label_encoder = LabelEncoder()
-
-     with st.spinner("Encoding text using TF-IDF and Encoding labels"):
-         X = tfidf_vectorizer.fit_transform(text.values)
-         y = label_encoder.fit_transform(labels.values)
-
-     return {
-         "X": X,
-         "y": y,
-         "X_names": np.array(tfidf_vectorizer.get_feature_names()),
-         "y_names": label_encoder.classes_,
-     }
-
-
- def wordifier(X, y, X_names: List[str], y_names: List[str], configs=ModelConfigs):
-
-     n_instances, n_features = X.shape
-     n_classes = len(y_names)
-
-     # NOTE: the * 10 / 10 trick is to have "nice" round-ups
-     sample_fraction = np.ceil((n_features / n_instances) * 10) / 10
-
-     sample_size = min(
-         # this is the maximum supported
-         configs.MAX_SELECTION.value,
-         # at minimum you want MIN_SELECTION but in general you want
-         # n_instances * sample_fraction
-         max(configs.MIN_SELECTION.value, int(n_instances * sample_fraction)),
-         # however if previous one is bigger the the available instances take
-         # the number of available instances
-         n_instances,
-     )
-
-     # TODO: might want to try out something to subsample features at each iteration
-
-     # initialize coefficient matrices
-     pos_scores = np.zeros((n_classes, n_features), dtype=int)
-     neg_scores = np.zeros((n_classes, n_features), dtype=int)
-
-     with st.spinner("Wordifying!"):
-
-         for _ in stqdm(range(configs.NUM_ITERS.value)):
-
-             # run randomized regression
-             clf = LogisticRegression(
-                 penalty="l1",
-                 C=configs.PENALTIES.value[np.random.randint(len(configs.PENALTIES.value))],
-                 solver="liblinear",
-                 multi_class="auto",
-                 max_iter=500,
-                 class_weight="balanced",
-             )
-
-             # sample indices to subsample matrix
-             selection = resample(np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size)
-
-             # fit
-             try:
-                 clf.fit(X[selection], y[selection])
-             except ValueError:
-                 continue
-
-             # record coefficients
-             if n_classes == 2:
-                 pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)
-                 neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)
-                 pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)
-                 neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)
-             else:
-                 pos_scores += clf.coef_ > 0
-                 neg_scores += clf.coef_ < 0
-
-     # normalize
-     pos_scores = pos_scores / configs.NUM_ITERS.value
-     neg_scores = neg_scores / configs.NUM_ITERS.value
-
-     # get only active features
-     pos_positions = np.where(pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0)
-     neg_positions = np.where(neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0)
-
-     # prepare DataFrame
-     pos = [(X_names[i], pos_scores[c, i], y_names[c]) for c, i in zip(*pos_positions.nonzero())]
-     neg = [(X_names[i], neg_scores[c, i], y_names[c]) for c, i in zip(*neg_positions.nonzero())]
-
-     posdf = pd.DataFrame(pos, columns="word score label".split()).sort_values(["label", "score"], ascending=False)
-     negdf = pd.DataFrame(neg, columns="word score label".split()).sort_values(["label", "score"], ascending=False)
-
-     return posdf, negdf
-
-
- # more [here](https://github.com/fastai/fastai/blob/master/fastai/text/core.py#L42)
- # and [here](https://textacy.readthedocs.io/en/latest/api_reference/preprocessing.html)
- _re_normalize_acronyms = re.compile("(?:[a-zA-Z]\.){2,}")
- def normalize_acronyms(t):
-     return _re_normalize_acronyms.sub(t.translate(str.maketrans("", "", string.punctuation)).upper(), t)
-
- _re_non_word = re.compile("\W")
- def remove_non_word(t):
-     return _re_non_word.sub(" ", t)
-
- _re_space = re.compile(" {2,}")
- def normalize_useless_spaces(t):
-     return _re_space.sub(" ", t)
-
-
- _re_rep = re.compile(r"(\S)(\1{2,})")
- def normalize_repeating_chars(t):
-     def _replace_rep(m):
-         c, cc = m.groups()
-         return c
-
-     return _re_rep.sub(_replace_rep, t)
-
-
- _re_wrep = re.compile(r"(?:\s|^)(\w+)\s+((?:\1\s+)+)\1(\s|\W|$)")
- def normalize_repeating_words(t):
-     def _replace_wrep(m):
-         c, cc, e = m.groups()
-         return c
-
-     return _re_wrep.sub(_replace_wrep, t)
-
-
- class TextPreprocessor:
-     def __init__(
-         self, language: str, cleaning_steps: List[str], lemmatizer_when: str = "last", remove_stop: bool = True
-     ) -> None:
-         # prepare lemmatizer
-         self.language = language
-         self.nlp = spacy.load(Languages[language].value, exclude=["parser", "ner", "pos", "tok2vec"])
-         self.lemmatizer_when = self._lemmatization_options().get(lemmatizer_when, None)
-         self.remove_stop = remove_stop
-         self._lemmatize = self._get_lemmatizer()
-
-         # prepare cleaning
-         self.cleaning_steps = [
-             self._cleaning_options()[step] for step in cleaning_steps if step in self._cleaning_options()
-         ]
-         self.cleaning_pipeline = make_pipeline(*self.cleaning_steps) if self.cleaning_steps else lambda x: x
-
-     def _get_lemmatizer(self) -> Callable:
-         """Return the correct spacy Doc-level lemmatizer"""
-         if self.remove_stop:
-
-             def lemmatizer(doc: spacy.tokens.doc.Doc) -> str:
-                 """Lemmatizes spacy Doc and removes stopwords"""
-                 return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-" and not t.is_stop])
-
-         else:
-
-             def lemmatizer(doc: spacy.tokens.doc.Doc) -> str:
-                 """Lemmatizes spacy Doc"""
-                 return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-"])
-
-         return lemmatizer
-
-     @staticmethod
-     def _lemmatization_options() -> Dict[str, str]:
-         return {
-             "Before preprocessing": "first",
-             "After preprocessing": "last",
-             "Never! Let's do it quick and dirty": None,
-         }
-
-     def lemmatizer(self, series: pd.Series) -> pd.Series:
-         """
-         Apply spacy pipeline to transform string to spacy Doc and applies lemmatization
-         """
-         res = []
-         pbar = stqdm(total=len(series))
-         for doc in self.nlp.pipe(series, batch_size=500):
-             res.append(self._lemmatize(doc))
-             pbar.update(1)
-         pbar.close()
-         return pd.Series(res)
-
-     @staticmethod
-     def _cleaning_options():
-         """Returns available cleaning steps in order"""
-         return OrderedDict(
-             [
-                 ("lower", lambda x: x.lower()),
-                 ("normalize_unicode", normalize.unicode),
-                 ("normalize_bullet_points", normalize.bullet_points),
-                 ("normalize_hyphenated_words", normalize.hyphenated_words),
-                 ("normalize_quotation_marks", normalize.quotation_marks),
-                 ("normalize_whitespace", normalize.whitespace),
-                 ("replace_urls", replace.urls),
-                 ("replace_currency_symbols", replace.currency_symbols),
-                 ("replace_emails", replace.emails),
-                 ("replace_emojis", replace.emojis),
-                 ("replace_hashtags", replace.hashtags),
-                 ("replace_numbers", replace.numbers),
-                 ("replace_phone_numbers", replace.phone_numbers),
-                 ("replace_user_handles", replace.user_handles),
-                 ("normalize_acronyms", normalize_acronyms),
-                 ("remove_accents", remove.accents),
-                 ("remove_brackets", remove.brackets),
-                 ("remove_html_tags", remove.html_tags),
-                 ("remove_punctuation", remove.punctuation),
-                 ("remove_non_words", remove_non_word),
-                 ("normalize_useless_spaces", normalize_useless_spaces),
-                 ("normalize_repeating_chars", normalize_repeating_chars),
-                 ("normalize_repeating_words", normalize_repeating_words),
-                 ("strip", lambda x: x.strip()),
-             ]
-         )
-
-     def fit_transform(self, series: pd.Series) -> Series:
-         """Applies text preprocessing"""
-
-         if self.lemmatizer_when == "first":
-             with st.spinner("Lemmatizing"):
-                 series = self.lemmatizer(series)
-
-         with st.spinner("Cleaning"):
-             series = series.progress_map(self.cleaning_pipeline)
-
-         if self.lemmatizer_when == "last":
-             with st.spinner("Lemmatizing"):
-                 series = self.lemmatizer(series)
-
-         return series
-
-
  def plot_labels_prop(data: pd.DataFrame, label_column: str):

      unique_value_limit = 100
-
+
      if data[label_column].nunique() > unique_value_limit:

-         st.warning(f"""
-             The column you selected has more than {unique_value_limit}.
+         st.warning(
+             f"""
+             The column you selected has more than {unique_value_limit} unique values.
              Are you sure it's the right column? If it is, please note that
              this will impact __Wordify__ performance.
-             """)
+             """
+         )

          return
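After this change, src/utils.py keeps only app-level helpers. A hedged sketch of how they might be used (not part of the commit; read_file's body is not fully shown in this diff, and the logo path is an assumption):

import streamlit as st

from src.utils import download_button, get_logo, read_file

st.sidebar.image(get_logo("assets/logo.png"))  # hypothetical path

uploaded_file = st.file_uploader("Upload a file")
if uploaded_file is not None:
    df = read_file(uploaded_file)
    st.dataframe(df.head())
    download_button(df, "wordify_results")  # writes an HTML download link into the app
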
src/wordifier.py ADDED
@@ -0,0 +1,87 @@
+ from typing import List
+
+ import numpy as np
+ import pandas as pd
+ import streamlit as st
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.utils import resample
+ from stqdm import stqdm
+
+ from .configs import ModelConfigs
+
+ stqdm.pandas()
+
+
+ def wordifier(X, y, X_names: List[str], y_names: List[str], configs=ModelConfigs):
+
+     n_instances, n_features = X.shape
+     n_classes = len(y_names)
+
+     # NOTE: the * 10 / 10 trick is to have "nice" round-ups
+     sample_fraction = np.ceil((n_features / n_instances) * 10) / 10
+
+     sample_size = min(
+         # this is the maximum supported
+         configs.MAX_SELECTION.value,
+         # at minimum you want MIN_SELECTION, but in general you want
+         # n_instances * sample_fraction
+         max(configs.MIN_SELECTION.value, int(n_instances * sample_fraction)),
+         # however, if the previous one is bigger than the number of
+         # available instances, take the number of available instances
+         n_instances,
+     )
+
+     # TODO: might want to try out something to subsample features at each iteration
+
+     # initialize coefficient matrices
+     pos_scores = np.zeros((n_classes, n_features), dtype=int)
+     neg_scores = np.zeros((n_classes, n_features), dtype=int)
+
+     with st.spinner("Wordifying!"):
+
+         for _ in stqdm(range(configs.NUM_ITERS.value)):
+
+             # run randomized regression
+             clf = LogisticRegression(
+                 penalty="l1",
+                 C=configs.PENALTIES.value[np.random.randint(len(configs.PENALTIES.value))],
+                 solver="liblinear",
+                 multi_class="auto",
+                 max_iter=500,
+                 class_weight="balanced",
+             )
+
+             # sample indices to subsample matrix
+             selection = resample(np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size)
+
+             # fit
+             try:
+                 clf.fit(X[selection], y[selection])
+             except ValueError:
+                 continue
+
+             # record coefficients
+             if n_classes == 2:
+                 pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)
+                 neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)
+                 pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)
+                 neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)
+             else:
+                 pos_scores += clf.coef_ > 0
+                 neg_scores += clf.coef_ < 0
+
+     # normalize
+     pos_scores = pos_scores / configs.NUM_ITERS.value
+     neg_scores = neg_scores / configs.NUM_ITERS.value
+
+     # keep only features selected more often than the threshold
+     pos_positions = np.where(pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0)
+     neg_positions = np.where(neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0)
+
+     # prepare DataFrame
+     pos = [(X_names[i], pos_scores[c, i], y_names[c]) for c, i in zip(*pos_positions.nonzero())]
+     neg = [(X_names[i], neg_scores[c, i], y_names[c]) for c, i in zip(*neg_positions.nonzero())]
+
+     posdf = pd.DataFrame(pos, columns="word score label".split()).sort_values(["label", "score"], ascending=False)
+     negdf = pd.DataFrame(neg, columns="word score label".split()).sort_values(["label", "score"], ascending=False)
+
+     return posdf, negdf
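
For context, a hypothetical end-to-end sketch (not part of the commit) of how the four modules fit together after the split: preprocess, encode with TF-IDF, run the bootstrapped L1-penalized logistic regression, and plot the top-scoring words. Column names and the "English" language key are assumptions.

import streamlit as st

from src.plotting import plot_score
from src.preprocessing import TextPreprocessor, encode
from src.utils import read_file
from src.wordifier import wordifier

uploaded_file = st.file_uploader("Upload your data")
if uploaded_file is not None:
    df = read_file(uploaded_file)

    # clean and lemmatize the text column, then build TF-IDF features
    preprocessor = TextPreprocessor(language="English", cleaning_steps=["lower", "strip"])
    clean_text = preprocessor.fit_transform(df["text"])  # column name is an assumption
    enc = encode(clean_text, df["label"])

    # positive/negative indicative words per label, then plot one label's top words
    posdf, negdf = wordifier(enc["X"], enc["y"], enc["X_names"], enc["y_names"])
    st.altair_chart(plot_score(posdf, "label", posdf["label"].iloc[0]))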