Pietro Lesci committed on
Commit
355fd22
2 Parent(s): c718eb8 df4398a

Merge pull request #1 from pietrolesci/new_app

.streamlit/config.toml CHANGED
@@ -1,4 +1,7 @@
  [server]
  # Max size, in megabytes, for files uploaded with the file_uploader.
  # Default: 200
- maxUploadSize = 10
+ maxUploadSize = 20
+
+ [browser]
+ gatherUsageStats = false
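For context (not part of this commit), a minimal sketch of how the app could surface the new 20 MB cap at runtime, assuming Streamlit's standard config API; the caption wording and its placement in the sidebar are illustrative only:

import streamlit as st

# with the config above, server.maxUploadSize resolves to 20 (megabytes)
max_mb = st.get_option("server.maxUploadSize")
st.sidebar.markdown(f"Files up to {max_mb} MB are accepted.")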
LICENSE CHANGED
@@ -1 +1,201 @@
1
- TODO: placeholder
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2018-2020 Grid AI
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
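As the appendix above describes, the boilerplate notice is meant to sit at the top of each source file, wrapped in that file's comment syntax. A sketch of what the header could look like in a Python module of this repository (illustrative only; this commit adds just the LICENSE file itself):

# Copyright 2018-2020 Grid AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.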
app.py DELETED
@@ -1,70 +0,0 @@
1
- import streamlit as st
2
- from src.utils import get_logo
3
- from src import session_state
4
- from src.pages import (
5
- home,
6
- faq,
7
- about,
8
- )
9
- from src.configs import SupportedFiles
10
-
11
- # app configs
12
- st.set_page_config(
13
- page_title="Wordify",
14
- layout="wide",
15
- page_icon="./assets/logo.png",
16
- )
17
-
18
- # session state
19
- session = session_state.get(
20
- process=False, run_id=0, posdf=None, negdf=None, uploaded_file_id=0
21
- )
22
-
23
-
24
- # ==== SIDEBAR ==== #
25
- # LOGO
26
- client_logo = get_logo("./assets/logo.png")
27
- with st.sidebar.beta_container():
28
- st.image(client_logo)
29
-
30
- # NAVIGATION
31
- PAGES = {
32
- "Home": home,
33
- "FAQ": faq,
34
- "About": about,
35
- }
36
-
37
- st.sidebar.header("Navigation")
38
- # with st.sidebar.beta_container():
39
- selection = st.sidebar.radio("Go to", list(PAGES.keys()))
40
- page = PAGES[selection]
41
-
42
- # FILE UPLOADER
43
- st.sidebar.markdown("")
44
- st.sidebar.markdown("")
45
- st.sidebar.header("Upload file")
46
- # with st.sidebar.beta_container():
47
- uploaded_file = st.sidebar.file_uploader(
48
- "Select file", type=[i.name for i in SupportedFiles]
49
- )
50
-
51
-
52
- # FOOTER
53
- # with st.sidebar.beta_container():
54
- st.sidebar.markdown("")
55
- st.sidebar.markdown("")
56
- st.sidebar.markdown(
57
- """
58
- <span style="font-size: 0.75em">Built with &hearts; by [`Pietro Lesci`](https://pietrolesci.github.io/) and [`MilaNLP`](https://twitter.com/MilaNLProc?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor)</span>
59
- """,
60
- unsafe_allow_html=True,
61
- )
62
- st.sidebar.info("Something not working? Consider [filing an issue](https://github.com/MilaNLProc/wordify-webapp-streamlit/issues/new)")
63
-
64
-
65
- # ==== MAIN ==== #
66
- with st.beta_container():
67
- st.title("Wordify")
68
-
69
-
70
- page.write(session, uploaded_file)
main.py ADDED
@@ -0,0 +1,56 @@
1
+ import streamlit as st
2
+
3
+ from src.components import faq, footer, form, presentation
4
+ from src.utils import convert_df, get_logo, read_file
5
+
6
+ # app configs
7
+ st.set_page_config(
8
+ page_title="Wordify",
9
+ initial_sidebar_state="expanded",
10
+ layout="centered",
11
+ page_icon="./assets/logo.png",
12
+ menu_items={
13
+ "Get Help": "https://github.com/MilaNLProc/wordify-webapp-streamlit/issues/new",
14
+ "Report a Bug": "https://github.com/MilaNLProc/wordify-webapp-streamlit/issues/new",
15
+ "About": "By the __Wordify__ team.",
16
+ },
17
+ )
18
+
19
+ # logo
20
+ st.sidebar.image(get_logo("./assets/logo.png"))
21
+
22
+ # title
23
+ st.title("Wordify")
24
+
25
+ # file uploader
26
+ uploaded_fl = st.sidebar.file_uploader(
27
+ label="Choose a file",
28
+ type=["csv", "parquet", "tsv", "xlsx"],
29
+ accept_multiple_files=False,
30
+ help="""
31
+ Supported formats:
32
+ - CSV
33
+ - TSV
34
+ - PARQUET
35
+ - XLSX (do not support [Strict Open XML Spreadsheet format](https://stackoverflow.com/questions/62800822/openpyxl-cannot-read-strict-open-xml-spreadsheet-format-userwarning-file-conta))
36
+ """,
37
+ )
38
+
39
+ if not uploaded_fl:
40
+ presentation()
41
+ faq()
42
+ else:
43
+ df = read_file(uploaded_fl)
44
+ new_df = form(df)
45
+ if new_df is not None:
46
+ payload = convert_df(new_df)
47
+ st.download_button(
48
+ label="Download data as CSV",
49
+ data=payload,
50
+ file_name="wordify_results.csv",
51
+ mime="text/csv",
52
+ )
53
+
54
+
55
+ # footer
56
+ footer()
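The download path above relies on convert_df from src.utils, which is not part of this diff. A minimal sketch of what such a helper might look like, assuming it only serializes the results to UTF-8 CSV bytes for st.download_button; the real implementation may differ (for example, it may be cached):

import pandas as pd

def convert_df(df: pd.DataFrame) -> bytes:
    # drop the index so the downloaded CSV matches what the app displays
    return df.to_csv(index=False).encode("utf-8")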
notebooks/wordifier_nb.ipynb CHANGED
@@ -1,67 +1,589 @@
1
  {
2
- "metadata": {
3
- "language_info": {
4
- "codemirror_mode": {
5
- "name": "ipython",
6
- "version": 3
7
- },
8
- "file_extension": ".py",
9
- "mimetype": "text/x-python",
10
- "name": "python",
11
- "nbconvert_exporter": "python",
12
- "pygments_lexer": "ipython3",
13
- "version": "3.8.3"
14
- },
15
- "orig_nbformat": 2,
16
- "kernelspec": {
17
- "name": "python383jvsc74a57bd01cb9a1c850fd1d16c5b98054247a74b7b7a12849bcfa00436ba202c2a9e2bbb2",
18
- "display_name": "Python 3.8.3 64-bit ('py38': conda)"
19
- }
20
- },
21
- "nbformat": 4,
22
- "nbformat_minor": 2,
23
  "cells": [
24
  {
25
  "cell_type": "code",
26
- "execution_count": 1,
27
  "metadata": {},
28
  "outputs": [],
29
  "source": [
30
  "import sys\n",
31
- "nb_dir = os.path.split(os.getcwd())[0]\n",
32
- "if nb_dir not in sys.path:\n",
33
- " sys.path.append(nb_dir)\n",
34
  "\n",
35
  "import numpy as np\n",
36
  "import pandas as pd\n",
37
- "# import modin.pandas as mpd\n",
38
- "import spacy\n",
39
- "from src.configs import ModelConfigs, Languages\n",
40
- "from src.utils import wordifier, TextPreprocessor, encode\n",
41
- "\n",
42
- "from textacy.preprocessing import make_pipeline, remove, replace, normalize\n",
43
- "from tqdm import trange\n",
44
- "from sklearn.feature_extraction.text import TfidfVectorizer\n",
45
  "from sklearn.linear_model import LogisticRegression\n",
46
- "from sklearn.preprocessing import LabelEncoder\n",
47
  "from sklearn.utils import resample\n",
48
- "import multiprocessing as mp\n",
49
- "# import dask.dataframe as dask_df\n",
50
- "from stqdm import stqdm\n",
51
- "stqdm.pandas()\n",
52
  "\n",
53
- "from tqdm import trange\n",
54
  "\n",
55
- "import os\n",
56
- "# os.environ[\"MODIN_ENGINE\"] = \"ray\" # Modin will use Ray\n",
57
  "\n",
58
- "import vaex\n",
59
- "pd.set_option(\"display.max_colwidth\", None)"
60
  ]
61
  },
62
  {
63
  "cell_type": "code",
64
- "execution_count": 4,
65
  "metadata": {},
66
  "outputs": [],
67
  "source": [
@@ -70,7 +592,7 @@
70
  },
71
  {
72
  "cell_type": "code",
73
- "execution_count": 28,
74
  "metadata": {},
75
  "outputs": [],
76
  "source": [
@@ -79,7 +601,7 @@
79
  },
80
  {
81
  "cell_type": "code",
82
- "execution_count": 29,
83
  "metadata": {},
84
  "outputs": [],
85
  "source": [
@@ -91,7 +613,7 @@
91
  },
92
  {
93
  "cell_type": "code",
94
- "execution_count": 30,
95
  "metadata": {},
96
  "outputs": [],
97
  "source": [
@@ -104,24 +626,16 @@
104
  },
105
  {
106
  "cell_type": "code",
107
- "execution_count": 31,
108
  "metadata": {},
109
- "outputs": [
110
- {
111
- "output_type": "stream",
112
- "name": "stderr",
113
- "text": [
114
- "100%|██████████| 9939/9939 [00:06<00:00, 1431.09it/s]\n"
115
- ]
116
- }
117
- ],
118
  "source": [
119
  "df[\"p_text\"] = prep.fit_transform(df[\"text\"])"
120
  ]
121
  },
122
  {
123
  "cell_type": "code",
124
- "execution_count": 32,
125
  "metadata": {},
126
  "outputs": [],
127
  "source": [
@@ -130,7 +644,7 @@
130
  },
131
  {
132
  "cell_type": "code",
133
- "execution_count": 21,
134
  "metadata": {},
135
  "outputs": [],
136
  "source": [
@@ -146,28 +660,9 @@
146
  },
147
  {
148
  "cell_type": "code",
149
- "execution_count": 22,
150
  "metadata": {},
151
- "outputs": [
152
- {
153
- "output_type": "stream",
154
- "name": "stdout",
155
- "text": [
156
- "CPU times: user 1.45 s, sys: 10.6 ms, total: 1.46 s\nWall time: 1.46 s\n"
157
- ]
158
- },
159
- {
160
- "output_type": "execute_result",
161
- "data": {
162
- "text/plain": [
163
- "LogisticRegression(C=0.05, class_weight='balanced', max_iter=500, penalty='l1',\n",
164
- " solver='liblinear')"
165
- ]
166
- },
167
- "metadata": {},
168
- "execution_count": 22
169
- }
170
- ],
171
  "source": [
172
  "%%time\n",
173
  "clf.fit(X, y)"
@@ -182,32 +677,9 @@
182
  },
183
  {
184
  "cell_type": "code",
185
- "execution_count": 14,
186
  "metadata": {},
187
- "outputs": [
188
- {
189
- "output_type": "stream",
190
- "name": "stderr",
191
- "text": [
192
- " 6%|▌ | 28/500 [01:01<27:33, 3.50s/it]/Users/49796/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/svm/_base.py:976: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
193
- " warnings.warn(\"Liblinear failed to converge, increase \"\n",
194
- " 31%|███ | 156/500 [06:18<13:54, 2.43s/it]\n"
195
- ]
196
- },
197
- {
198
- "output_type": "error",
199
- "ename": "KeyboardInterrupt",
200
- "evalue": "",
201
- "traceback": [
202
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
203
- "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
204
- "\u001b[0;32m<ipython-input-14-1fef5b7ccf45>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[0;31m# fit\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 40\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 41\u001b[0;31m \u001b[0mclf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 42\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[0;32mcontinue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
205
- "\u001b[0;32m~/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 1354\u001b[0m \u001b[0;34m\" 'solver' is set to 'liblinear'. Got 'n_jobs'\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1355\u001b[0m \" = {}.\".format(effective_n_jobs(self.n_jobs)))\n\u001b[0;32m-> 1356\u001b[0;31m self.coef_, self.intercept_, n_iter_ = _fit_liblinear(\n\u001b[0m\u001b[1;32m 1357\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mC\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_intercept\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mintercept_scaling\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1358\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclass_weight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpenalty\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdual\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mverbose\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
206
- "\u001b[0;32m~/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/svm/_base.py\u001b[0m in \u001b[0;36m_fit_liblinear\u001b[0;34m(X, y, C, fit_intercept, intercept_scaling, class_weight, penalty, dual, verbose, max_iter, tol, random_state, multi_class, loss, epsilon, sample_weight)\u001b[0m\n\u001b[1;32m 964\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 965\u001b[0m \u001b[0msolver_type\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_get_liblinear_solver_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmulti_class\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpenalty\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mloss\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdual\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 966\u001b[0;31m raw_coef_, n_iter_ = liblinear.train_wrap(\n\u001b[0m\u001b[1;32m 967\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_ind\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misspmatrix\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msolver_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtol\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbias\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mC\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 968\u001b[0m \u001b[0mclass_weight_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmax_iter\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrnd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrandint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miinfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'i'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
207
- "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
208
- ]
209
- }
210
- ],
211
  "source": [
212
  "n_instances, n_features = X.shape\n",
213
  "n_classes = len(y_names)\n",
@@ -293,5 +765,30 @@
293
  "outputs": [],
294
  "source": []
295
  }
296
- ]
297
- }
1
  {
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 65,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
9
  "import sys\n",
10
+ "sys.path.insert(0, \"..\")\n",
11
+ "import vaex\n",
12
+ "from vaex.ml import LabelEncoder\n",
13
+ "import spacy\n",
14
+ "import pandas as pd\n",
15
+ "from tqdm import tqdm\n",
16
+ "import os\n",
17
+ "import multiprocessing as mp\n",
18
+ "from src.preprocessing import PreprocessingPipeline, encode\n",
19
+ "from src.wordifier import ModelConfigs\n",
20
+ "from sklearn.pipeline import Pipeline\n",
21
+ "from sklearn.linear_model import LogisticRegression\n",
22
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
23
+ "import numpy as np"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": 67,
29
+ "metadata": {},
30
+ "outputs": [],
31
+ "source": [
32
+ "pipe = PreprocessingPipeline(\n",
33
+ " language=\"English\",\n",
34
+ " pre_steps=list(PreprocessingPipeline.pipeline_components().keys()),\n",
35
+ " lemmatization_step=list(PreprocessingPipeline.lemmatization_component().keys())[1],\n",
36
+ " post_steps=list(PreprocessingPipeline.pipeline_components().keys()),\n",
37
+ ")"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": 68,
43
+ "metadata": {},
44
+ "outputs": [],
45
+ "source": [
46
+ "def fn(t):\n",
47
+ " return pipe.post(pipe.lemma(pipe.nlp(pipe.pre(t))))"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": 69,
53
+ "metadata": {},
54
+ "outputs": [],
55
+ "source": [
56
+ "vdf = vaex.from_pandas(df)\n",
57
+ "vdf[\"processed_text\"] = vdf.apply(fn, arguments=[vdf[\"text\"]], vectorize=False)\n",
58
+ "df = vdf.to_pandas_df()"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": 71,
64
+ "metadata": {},
65
+ "outputs": [
66
+ {
67
+ "name": "stderr",
68
+ "output_type": "stream",
69
+ "text": [
70
+ "2021-11-28 17:01:36.883 \n",
71
+ " \u001b[33m\u001b[1mWarning:\u001b[0m to view this Streamlit app on a browser, run it with the following\n",
72
+ " command:\n",
73
+ "\n",
74
+ " streamlit run /Users/pietrolesci/miniconda3/envs/wordify/lib/python3.7/site-packages/ipykernel_launcher.py [ARGUMENTS]\n"
75
+ ]
76
+ }
77
+ ],
78
+ "source": [
79
+ "import streamlit as st\n",
80
+ "pbar = st.progress(0)\n",
81
+ "N = 100\n",
82
+ "for i, _ in enumerate(range(N)):\n",
83
+ " if i % N == 0:\n",
84
+ " pbar.progress(1)"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": null,
90
+ "metadata": {},
91
+ "outputs": [],
92
+ "source": []
93
+ },
94
+ {
95
+ "cell_type": "code",
96
+ "execution_count": 24,
97
+ "metadata": {},
98
+ "outputs": [],
99
+ "source": [
100
+ "configs = ModelConfigs\n",
101
+ "clf = Pipeline(\n",
102
+ " [\n",
103
+ " (\"tfidf\", TfidfVectorizer()),\n",
104
+ " (\n",
105
+ " \"classifier\",\n",
106
+ " LogisticRegression(\n",
107
+ " penalty=\"l1\",\n",
108
+ " C=configs.PENALTIES.value[np.random.randint(len(configs.PENALTIES.value))],\n",
109
+ " solver=\"liblinear\",\n",
110
+ " multi_class=\"auto\",\n",
111
+ " max_iter=500,\n",
112
+ " class_weight=\"balanced\",\n",
113
+ " ),\n",
114
+ " ),\n",
115
+ " ]\n",
116
+ ")\n"
117
+ ]
118
+ },
119
+ {
120
+ "cell_type": "code",
121
+ "execution_count": 29,
122
+ "metadata": {},
123
+ "outputs": [
124
+ {
125
+ "data": {
126
+ "text/plain": [
127
+ "Pipeline(steps=[('tfidf', TfidfVectorizer()),\n",
128
+ " ('classifier',\n",
129
+ " LogisticRegression(C=1, class_weight='balanced', max_iter=500,\n",
130
+ " penalty='l1', solver='liblinear'))])"
131
+ ]
132
+ },
133
+ "execution_count": 29,
134
+ "metadata": {},
135
+ "output_type": "execute_result"
136
+ }
137
+ ],
138
+ "source": [
139
+ "clf.fit(df[\"text\"], df[\"label\"])"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "code",
144
+ "execution_count": 39,
145
+ "metadata": {},
146
+ "outputs": [
147
+ {
148
+ "data": {
149
+ "text/plain": [
150
+ "array(['00', '000', '00001', ..., 'ís', 'über', 'überwoman'], dtype=object)"
151
+ ]
152
+ },
153
+ "execution_count": 39,
154
+ "metadata": {},
155
+ "output_type": "execute_result"
156
+ }
157
+ ],
158
+ "source": []
159
+ },
160
+ {
161
+ "cell_type": "code",
162
+ "execution_count": 40,
163
+ "metadata": {},
164
+ "outputs": [],
165
+ "source": [
166
+ "def wordifier(df, text_col, label_col, configs=ModelConfigs):\n",
167
+ "\n",
168
+ " n_instances, n_features = X.shape\n",
169
+ " n_classes = np.unique(y)\n",
170
+ "\n",
171
+ " # NOTE: the * 10 / 10 trick is to have \"nice\" round-ups\n",
172
+ " sample_fraction = np.ceil((n_features / n_instances) * 10) / 10\n",
173
+ "\n",
174
+ " sample_size = min(\n",
175
+ " # this is the maximum supported\n",
176
+ " configs.MAX_SELECTION.value,\n",
177
+ " # at minimum you want MIN_SELECTION but in general you want\n",
178
+ " # n_instances * sample_fraction\n",
179
+ " max(configs.MIN_SELECTION.value, int(n_instances * sample_fraction)),\n",
180
+ " # however if previous one is bigger the the available instances take\n",
181
+ " # the number of available instances\n",
182
+ " n_instances,\n",
183
+ " )\n",
184
+ "\n",
185
+ " # TODO: might want to try out something to subsample features at each iteration\n",
186
+ "\n",
187
+ " # initialize coefficient matrices\n",
188
+ " pos_scores = np.zeros((n_classes, n_features), dtype=int)\n",
189
+ " neg_scores = np.zeros((n_classes, n_features), dtype=int)\n",
190
+ "\n",
191
+ " for _ in range(configs.NUM_ITERS.value):\n",
192
+ "\n",
193
+ " # run randomized regression\n",
194
+ " clf = Pipeline([\n",
195
+ " ('tfidf', TfidfVectorizer()), \n",
196
+ " ('classifier', LogisticRegression(\n",
197
+ " penalty=\"l1\",\n",
198
+ " C=configs.PENALTIES.value[\n",
199
+ " np.random.randint(len(configs.PENALTIES.value))\n",
200
+ " ],\n",
201
+ " solver=\"liblinear\",\n",
202
+ " multi_class=\"auto\",\n",
203
+ " max_iter=500,\n",
204
+ " class_weight=\"balanced\",\n",
205
+ " ))]\n",
206
+ " )\n",
207
+ "\n",
208
+ " # sample indices to subsample matrix\n",
209
+ " selection = resample(\n",
210
+ " np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size\n",
211
+ " )\n",
212
+ "\n",
213
+ " # fit\n",
214
+ " try:\n",
215
+ " clf.fit(X[selection], y[selection])\n",
216
+ " except ValueError:\n",
217
+ " continue\n",
218
+ "\n",
219
+ " # record coefficients\n",
220
+ " if n_classes == 2:\n",
221
+ " pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)\n",
222
+ " neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)\n",
223
+ " pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)\n",
224
+ " neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)\n",
225
+ " else:\n",
226
+ " pos_scores += clf.coef_ > 0\n",
227
+ " neg_scores += clf.coef_ < 0\n",
228
+ "\n",
229
+ "\n",
230
+ " # normalize\n",
231
+ " pos_scores = pos_scores / configs.NUM_ITERS.value\n",
232
+ " neg_scores = neg_scores / configs.NUM_ITERS.value\n",
233
+ "\n",
234
+ " # get only active features\n",
235
+ " pos_positions = np.where(\n",
236
+ " pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0\n",
237
+ " )\n",
238
+ " neg_positions = np.where(\n",
239
+ " neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0\n",
240
+ " )\n",
241
+ "\n",
242
+ " # prepare DataFrame\n",
243
+ " X_names = clf.steps[0][1].get_feature_names_out()\n",
244
+ " pos = [\n",
245
+ " (X_names[i], pos_scores[c, i], y_names[c])\n",
246
+ " for c, i in zip(*pos_positions.nonzero())\n",
247
+ " ]\n",
248
+ " neg = [\n",
249
+ " (X_names[i], neg_scores[c, i], y_names[c])\n",
250
+ " for c, i in zip(*neg_positions.nonzero())\n",
251
+ " ]\n",
252
+ "\n",
253
+ " posdf = pd.DataFrame(pos, columns=\"word score label\".split()).sort_values(\n",
254
+ " [\"label\", \"score\"], ascending=False\n",
255
+ " )\n",
256
+ " negdf = pd.DataFrame(neg, columns=\"word score label\".split()).sort_values(\n",
257
+ " [\"label\", \"score\"], ascending=False\n",
258
+ " )\n",
259
+ "\n",
260
+ " return posdf, negdf"
261
+ ]
262
+ },
263
+ {
264
+ "cell_type": "code",
265
+ "execution_count": 41,
266
+ "metadata": {},
267
+ "outputs": [],
268
+ "source": [
269
+ "res = vdf.apply(wordifier, arguments=[vdf.processed_text, vdf.encoded_label], vectorize=False)"
270
+ ]
271
+ },
272
+ {
273
+ "cell_type": "code",
274
+ "execution_count": 45,
275
+ "metadata": {},
276
+ "outputs": [],
277
+ "source": [
278
+ "from vaex.ml.sklearn import Predictor"
279
+ ]
280
+ },
281
+ {
282
+ "cell_type": "code",
283
+ "execution_count": 60,
284
+ "metadata": {},
285
+ "outputs": [],
286
+ "source": [
287
+ "clf = Pipeline(\n",
288
+ " [\n",
289
+ " (\n",
290
+ " \"tfidf\",\n",
291
+ " TfidfVectorizer(\n",
292
+ " input=\"content\", # default: file already in memory\n",
293
+ " encoding=\"utf-8\", # default\n",
294
+ " decode_error=\"strict\", # default\n",
295
+ " strip_accents=None, # do nothing\n",
296
+ " lowercase=False, # do nothing\n",
297
+ " preprocessor=None, # do nothing - default\n",
298
+ " tokenizer=None, # default\n",
299
+ " stop_words=None, # do nothing\n",
300
+ " analyzer=\"word\",\n",
301
+ " ngram_range=(1, 3), # maximum 3-ngrams\n",
302
+ " min_df=0.001,\n",
303
+ " max_df=0.75,\n",
304
+ " sublinear_tf=True,\n",
305
+ " ),\n",
306
+ " ),\n",
307
+ " (\n",
308
+ " \"classifier\",\n",
309
+ " LogisticRegression(\n",
310
+ " penalty=\"l1\",\n",
311
+ " C=configs.PENALTIES.value[np.random.randint(len(configs.PENALTIES.value))],\n",
312
+ " solver=\"liblinear\",\n",
313
+ " multi_class=\"auto\",\n",
314
+ " max_iter=500,\n",
315
+ " class_weight=\"balanced\",\n",
316
+ " ),\n",
317
+ " ),\n",
318
+ " ]\n",
319
+ ")\n",
320
  "\n",
321
+ "vaex_model = Predictor(\n",
322
+ " features=[\"processed_text\"],\n",
323
+ " target=\"encoded_label\",\n",
324
+ " model=clf,\n",
325
+ " prediction_name=\"prediction\",\n",
326
+ ")\n"
327
+ ]
328
+ },
329
+ {
330
+ "cell_type": "code",
331
+ "execution_count": 61,
332
+ "metadata": {},
333
+ "outputs": [
334
+ {
335
+ "ename": "TypeError",
336
+ "evalue": "unhashable type: 'list'",
337
+ "output_type": "error",
338
+ "traceback": [
339
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
340
+ "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
341
+ "\u001b[0;32m/var/folders/b_/m81mmt0s6gv48kdvk44n2l740000gn/T/ipykernel_52217/687453386.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mvaex_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvdf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
342
+ "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/ml/sklearn.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, df, **kwargs)\u001b[0m\n\u001b[1;32m 103\u001b[0m '''\n\u001b[1;32m 104\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 105\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfeatures\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 106\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtarget\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 107\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtarget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
343
+ "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36mvalues\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 6897\u001b[0m \u001b[0mIf\u001b[0m \u001b[0many\u001b[0m \u001b[0mof\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[0mcontain\u001b[0m \u001b[0mmasked\u001b[0m \u001b[0marrays\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mmasks\u001b[0m \u001b[0mare\u001b[0m \u001b[0mignored\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mmasked\u001b[0m \u001b[0melements\u001b[0m \u001b[0mare\u001b[0m \u001b[0mreturned\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mwell\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6898\u001b[0m \"\"\"\n\u001b[0;32m-> 6899\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__array__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6900\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6901\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
344
+ "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36m__array__\u001b[0;34m(self, dtype, parallel)\u001b[0m\n\u001b[1;32m 5989\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcolumn_type\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5990\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Cannot cast %r (of type %r) to %r\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5991\u001b[0;31m \u001b[0mchunks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolumn_names\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparallel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marray_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'numpy'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5992\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0many\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misMaskedArray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunk\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mchunk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5993\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
345
+ "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36mevaluate\u001b[0;34m(self, expression, i1, i2, out, selection, filtered, array_type, parallel, chunk_size, progress)\u001b[0m\n\u001b[1;32m 2962\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate_iterator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpression\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms1\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms2\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfiltered\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfiltered\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marray_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0marray_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparallel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunk_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprogress\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprogress\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2963\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2964\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_evaluate_implementation\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpression\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mi1\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mi2\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfiltered\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfiltered\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marray_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0marray_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparallel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunk_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprogress\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprogress\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2965\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2966\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mdocsubst\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
346
+ "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36m_evaluate_implementation\u001b[0;34m(self, expression, i1, i2, out, selection, filtered, array_type, parallel, chunk_size, raw, progress)\u001b[0m\n\u001b[1;32m 6207\u001b[0m \u001b[0;31m# TODO: For NEP branch: dtype -> dtype_evaluate\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6208\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 6209\u001b[0;31m \u001b[0mexpression_to_evaluate\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpressions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# lets assume we have to do them all\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6210\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6211\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mexpression\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpressions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
347
+ "\u001b[0;31mTypeError\u001b[0m: unhashable type: 'list'"
348
+ ]
349
+ }
350
+ ],
351
+ "source": [
352
+ "vaex_model.fit(vdf)"
353
+ ]
354
+ },
355
+ {
356
+ "cell_type": "code",
357
+ "execution_count": null,
358
+ "metadata": {},
359
+ "outputs": [],
360
+ "source": []
361
+ },
362
+ {
363
+ "cell_type": "code",
364
+ "execution_count": 52,
365
+ "metadata": {},
366
+ "outputs": [
367
+ {
368
+ "data": {
369
+ "text/plain": [
370
+ "b'\\x80\\x03c__main__\\nwordifier\\nq\\x00.'"
371
+ ]
372
+ },
373
+ "execution_count": 52,
374
+ "metadata": {},
375
+ "output_type": "execute_result"
376
+ }
377
+ ],
378
+ "source": [
379
+ "import pickle\n",
380
+ "pickle.dumps(wordifier)"
381
+ ]
382
+ },
383
+ {
384
+ "cell_type": "code",
385
+ "execution_count": 47,
386
+ "metadata": {},
387
+ "outputs": [
388
+ {
389
+ "ename": "TypeError",
390
+ "evalue": "unhashable type: 'list'",
391
+ "output_type": "error",
392
+ "traceback": [
393
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
394
+ "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
395
+ "\u001b[0;32m/var/folders/b_/m81mmt0s6gv48kdvk44n2l740000gn/T/ipykernel_52217/687453386.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mvaex_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvdf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
396
+ "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/ml/sklearn.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, df, **kwargs)\u001b[0m\n\u001b[1;32m 103\u001b[0m '''\n\u001b[1;32m 104\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 105\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfeatures\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 106\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtarget\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 107\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtarget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
397
+ "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36mvalues\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 6897\u001b[0m \u001b[0mIf\u001b[0m \u001b[0many\u001b[0m \u001b[0mof\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[0mcontain\u001b[0m \u001b[0mmasked\u001b[0m \u001b[0marrays\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mmasks\u001b[0m \u001b[0mare\u001b[0m \u001b[0mignored\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mmasked\u001b[0m \u001b[0melements\u001b[0m \u001b[0mare\u001b[0m \u001b[0mreturned\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mwell\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6898\u001b[0m \"\"\"\n\u001b[0;32m-> 6899\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__array__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6900\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6901\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
398
+ "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36m__array__\u001b[0;34m(self, dtype, parallel)\u001b[0m\n\u001b[1;32m 5989\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcolumn_type\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5990\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Cannot cast %r (of type %r) to %r\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5991\u001b[0;31m \u001b[0mchunks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolumn_names\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparallel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marray_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'numpy'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5992\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0many\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misMaskedArray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunk\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mchunk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5993\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
399
+ "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36mevaluate\u001b[0;34m(self, expression, i1, i2, out, selection, filtered, array_type, parallel, chunk_size, progress)\u001b[0m\n\u001b[1;32m 2962\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate_iterator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpression\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms1\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms2\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfiltered\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfiltered\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marray_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0marray_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparallel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunk_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprogress\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprogress\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2963\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2964\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_evaluate_implementation\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpression\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mi1\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mi2\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfiltered\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfiltered\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marray_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0marray_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparallel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunk_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprogress\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprogress\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2965\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2966\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mdocsubst\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
400
+ "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36m_evaluate_implementation\u001b[0;34m(self, expression, i1, i2, out, selection, filtered, array_type, parallel, chunk_size, raw, progress)\u001b[0m\n\u001b[1;32m 6207\u001b[0m \u001b[0;31m# TODO: For NEP branch: dtype -> dtype_evaluate\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6208\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 6209\u001b[0;31m \u001b[0mexpression_to_evaluate\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpressions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# lets assume we have to do them all\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6210\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6211\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mexpression\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpressions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
401
+ "\u001b[0;31mTypeError\u001b[0m: unhashable type: 'list'"
402
+ ]
403
+ }
404
+ ],
405
+ "source": []
406
+ },
407
+ {
408
+ "cell_type": "code",
409
+ "execution_count": null,
410
+ "metadata": {},
411
+ "outputs": [],
412
+ "source": []
413
+ },
414
+ {
415
+ "cell_type": "code",
416
+ "execution_count": null,
417
+ "metadata": {},
418
+ "outputs": [],
419
+ "source": [
420
+ "res = []\n",
421
+ "with tqdm(total=len(df)) as pbar:\n",
422
+ " for doc in tqdm(nlp.pipe(df[\"text\"].values, batch_size=500, n_process=n_cpus)):\n",
423
+ " res.append([i.lemma_ for i in doc])\n",
424
+ " pbar.update(1)"
425
+ ]
426
+ },
427
+ {
428
+ "cell_type": "code",
429
+ "execution_count": null,
430
+ "metadata": {},
431
+ "outputs": [],
432
+ "source": [
433
+ "import pickle"
434
+ ]
435
+ },
436
+ {
437
+ "cell_type": "code",
438
+ "execution_count": null,
439
+ "metadata": {},
440
+ "outputs": [],
441
+ "source": [
442
+ "def fn(t):\n",
443
+ " return "
444
+ ]
445
+ },
446
+ {
447
+ "cell_type": "code",
448
+ "execution_count": null,
449
+ "metadata": {},
450
+ "outputs": [],
451
+ "source": [
452
+ "%%timeit\n",
453
+ "with mp.Pool(mp.cpu_count()) as pool:\n",
454
+ " new_s = pool.map(nlp, df[\"text\"].values)"
455
+ ]
456
+ },
457
+ {
458
+ "cell_type": "code",
459
+ "execution_count": null,
460
+ "metadata": {},
461
+ "outputs": [],
462
+ "source": []
463
+ },
464
+ {
465
+ "cell_type": "code",
466
+ "execution_count": null,
467
+ "metadata": {},
468
+ "outputs": [],
469
+ "source": []
470
+ },
471
+ {
472
+ "cell_type": "code",
473
+ "execution_count": null,
474
+ "metadata": {},
475
+ "outputs": [],
476
+ "source": [
477
+ "from typing import List\n",
478
  "import numpy as np\n",
479
  "import pandas as pd\n",
480
+ "import streamlit as st\n",
 
 
 
 
 
 
 
481
  "from sklearn.linear_model import LogisticRegression\n",
 
482
  "from sklearn.utils import resample\n",
 
 
 
 
483
  "\n",
484
+ "from src.configs import ModelConfigs\n",
485
  "\n",
 
 
486
  "\n",
487
+ "def wordifier(X, y, X_names: List[str], y_names: List[str], configs=ModelConfigs):\n",
488
+ "\n",
489
+ " n_instances, n_features = X.shape\n",
490
+ " n_classes = len(y_names)\n",
491
+ "\n",
492
+ " # NOTE: the * 10 / 10 trick is to have \"nice\" round-ups\n",
493
+ " sample_fraction = np.ceil((n_features / n_instances) * 10) / 10\n",
494
+ "\n",
495
+ " sample_size = min(\n",
496
+ " # this is the maximum supported\n",
497
+ " configs.MAX_SELECTION.value,\n",
498
+ " # at minimum you want MIN_SELECTION but in general you want\n",
499
+ " # n_instances * sample_fraction\n",
500
+ " max(configs.MIN_SELECTION.value, int(n_instances * sample_fraction)),\n",
501
+ " # however if previous one is bigger the the available instances take\n",
502
+ " # the number of available instances\n",
503
+ " n_instances,\n",
504
+ " )\n",
505
+ "\n",
506
+ " # TODO: might want to try out something to subsample features at each iteration\n",
507
+ "\n",
508
+ " # initialize coefficient matrices\n",
509
+ " pos_scores = np.zeros((n_classes, n_features), dtype=int)\n",
510
+ " neg_scores = np.zeros((n_classes, n_features), dtype=int)\n",
511
+ "\n",
512
+ " with st.spinner(\"Wordifying!\"):\n",
513
+ " pbar = st.progress(0)\n",
514
+ "\n",
515
+ " for i, _ in enumerate(range(configs.NUM_ITERS.value)):\n",
516
+ "\n",
517
+ " # run randomized regression\n",
518
+ " clf = LogisticRegression(\n",
519
+ " penalty=\"l1\",\n",
520
+ " C=configs.PENALTIES.value[\n",
521
+ " np.random.randint(len(configs.PENALTIES.value))\n",
522
+ " ],\n",
523
+ " solver=\"liblinear\",\n",
524
+ " multi_class=\"auto\",\n",
525
+ " max_iter=500,\n",
526
+ " class_weight=\"balanced\",\n",
527
+ " )\n",
528
+ "\n",
529
+ " # sample indices to subsample matrix\n",
530
+ " selection = resample(\n",
531
+ " np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size\n",
532
+ " )\n",
533
+ "\n",
534
+ " # fit\n",
535
+ " try:\n",
536
+ " clf.fit(X[selection], y[selection])\n",
537
+ " except ValueError:\n",
538
+ " continue\n",
539
+ "\n",
540
+ " # record coefficients\n",
541
+ " if n_classes == 2:\n",
542
+ " pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)\n",
543
+ " neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)\n",
544
+ " pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)\n",
545
+ " neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)\n",
546
+ " else:\n",
547
+ " pos_scores += clf.coef_ > 0\n",
548
+ " neg_scores += clf.coef_ < 0\n",
549
+ "\n",
550
+ " pbar.progress(i + 1)\n",
551
+ "\n",
552
+ " # normalize\n",
553
+ " pos_scores = pos_scores / configs.NUM_ITERS.value\n",
554
+ " neg_scores = neg_scores / configs.NUM_ITERS.value\n",
555
+ "\n",
556
+ " # get only active features\n",
557
+ " pos_positions = np.where(\n",
558
+ " pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0\n",
559
+ " )\n",
560
+ " neg_positions = np.where(\n",
561
+ " neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0\n",
562
+ " )\n",
563
+ "\n",
564
+ " # prepare DataFrame\n",
565
+ " pos = [\n",
566
+ " (X_names[i], pos_scores[c, i], y_names[c])\n",
567
+ " for c, i in zip(*pos_positions.nonzero())\n",
568
+ " ]\n",
569
+ " neg = [\n",
570
+ " (X_names[i], neg_scores[c, i], y_names[c])\n",
571
+ " for c, i in zip(*neg_positions.nonzero())\n",
572
+ " ]\n",
573
+ "\n",
574
+ " posdf = pd.DataFrame(pos, columns=\"word score label\".split()).sort_values(\n",
575
+ " [\"label\", \"score\"], ascending=False\n",
576
+ " )\n",
577
+ " negdf = pd.DataFrame(neg, columns=\"word score label\".split()).sort_values(\n",
578
+ " [\"label\", \"score\"], ascending=False\n",
579
+ " )\n",
580
+ "\n",
581
+ " return posdf, negdf\n"
582
  ]
583
  },
584
  {
585
  "cell_type": "code",
586
+ "execution_count": null,
587
  "metadata": {},
588
  "outputs": [],
589
  "source": [
 
592
  },
593
  {
594
  "cell_type": "code",
595
+ "execution_count": null,
596
  "metadata": {},
597
  "outputs": [],
598
  "source": [
 
601
  },
602
  {
603
  "cell_type": "code",
604
+ "execution_count": null,
605
  "metadata": {},
606
  "outputs": [],
607
  "source": [
 
613
  },
614
  {
615
  "cell_type": "code",
616
+ "execution_count": null,
617
  "metadata": {},
618
  "outputs": [],
619
  "source": [
 
626
  },
627
  {
628
  "cell_type": "code",
629
+ "execution_count": null,
630
  "metadata": {},
631
+ "outputs": [],
 
 
 
 
 
 
 
 
632
  "source": [
633
  "df[\"p_text\"] = prep.fit_transform(df[\"text\"])"
634
  ]
635
  },
636
  {
637
  "cell_type": "code",
638
+ "execution_count": null,
639
  "metadata": {},
640
  "outputs": [],
641
  "source": [
 
644
  },
645
  {
646
  "cell_type": "code",
647
+ "execution_count": null,
648
  "metadata": {},
649
  "outputs": [],
650
  "source": [
 
660
  },
661
  {
662
  "cell_type": "code",
663
+ "execution_count": null,
664
  "metadata": {},
665
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
666
  "source": [
667
  "%%time\n",
668
  "clf.fit(X, y)"
 
677
  },
678
  {
679
  "cell_type": "code",
680
+ "execution_count": null,
681
  "metadata": {},
682
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
683
  "source": [
684
  "n_instances, n_features = X.shape\n",
685
  "n_classes = len(y_names)\n",
 
765
  "outputs": [],
766
  "source": []
767
  }
768
+ ],
769
+ "metadata": {
770
+ "interpreter": {
771
+ "hash": "aa7efd0b3ada76bb0689aa8ed0b61d7de788847e3d11d2d142fc5800c765982f"
772
+ },
773
+ "kernelspec": {
774
+ "display_name": "Python 3.8.3 64-bit ('py38': conda)",
775
+ "language": "python",
776
+ "name": "python3"
777
+ },
778
+ "language_info": {
779
+ "codemirror_mode": {
780
+ "name": "ipython",
781
+ "version": 3
782
+ },
783
+ "file_extension": ".py",
784
+ "mimetype": "text/x-python",
785
+ "name": "python",
786
+ "nbconvert_exporter": "python",
787
+ "pygments_lexer": "ipython3",
788
+ "version": "3.7.11"
789
+ },
790
+ "orig_nbformat": 2
791
+ },
792
+ "nbformat": 4,
793
+ "nbformat_minor": 2
794
+ }
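As a side note on the scratch cells above: a minimal sketch (not part of this PR) of the parallel lemmatization pattern they are probing. spaCy's `nlp.pipe(n_process=...)` is the supported way to parallelize, rather than mapping the `nlp` object through `multiprocessing.Pool`; the model name and toy texts below are assumptions.

```python
# Hypothetical standalone sketch: parallel lemmatization with spaCy's built-in
# multiprocessing instead of multiprocessing.Pool (assumes en_core_web_sm is installed).
import os

import spacy

nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
texts = ["This is a first document.", "And here is another one."] * 1000  # made-up toy data

lemmas = [
    " ".join(tok.lemma_ for tok in doc)
    for doc in nlp.pipe(texts, batch_size=500, n_process=os.cpu_count())
]
print(lemmas[0])
```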
src/components.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ from src.configs import Languages, PreprocessingConfigs, SupportedFiles
4
+ from src.preprocessing import PreprocessingPipeline
5
+ from src.wordifier import input_transform, output_transform, wordifier
6
+
7
+
8
+ def form(df):
9
+ with st.form("my_form"):
10
+ col1, col2 = st.columns([1, 2])
11
+ with col1:
12
+
13
+ cols = [""] + df.columns.tolist()
14
+ label_column = st.selectbox(
15
+ "Select label column",
16
+ cols,
17
+ index=0,
18
+ help="Select the column containing the labels",
19
+ )
20
+ text_column = st.selectbox(
21
+ "Select text column",
22
+ cols,
23
+ index=0,
24
+ help="Select the column containing the text",
25
+ )
26
+ language = st.selectbox(
27
+ "Select language",
28
+ [i.name for i in Languages],
29
+ help="""
30
+                 Select the language of your texts among the supported ones. If we currently do
31
+                 not support it, feel free to open an issue.
32
+ """,
33
+ )
34
+
35
+ with col2:
36
+ steps_options = list(PreprocessingPipeline.pipeline_components().keys())
37
+ pre_steps = st.multiselect(
38
+ "Select pre-lemmatization processing steps (ordered)",
39
+ options=steps_options,
40
+ default=[
41
+ steps_options[i] for i in PreprocessingConfigs.DEFAULT_PRE.value
42
+ ],
43
+ format_func=lambda x: x.replace("_", " ").title(),
44
+ help="Select the processing steps to apply before the text is lemmatized",
45
+ )
46
+
47
+             lemmatization_options = list(
48
+ PreprocessingPipeline.lemmatization_component().keys()
49
+ )
50
+ lemmatization_step = st.selectbox(
51
+ "Select lemmatization",
52
+                 options=lemmatization_options,
53
+ index=PreprocessingConfigs.DEFAULT_LEMMA.value,
54
+ help="Select lemmatization procedure",
55
+ )
56
+
57
+ post_steps = st.multiselect(
58
+ "Select post-lemmatization processing steps (ordered)",
59
+ options=steps_options,
60
+ default=[
61
+ steps_options[i] for i in PreprocessingConfigs.DEFAULT_POST.value
62
+ ],
63
+ format_func=lambda x: x.replace("_", " ").title(),
64
+ help="Select the processing steps to apply after the text is lemmatized",
65
+ )
66
+
67
+ # Every form must have a submit button.
68
+ submitted = st.form_submit_button("Submit")
69
+ if submitted:
70
+
71
+ # preprocess
72
+ with st.spinner("Step 1/4: Preprocessing text"):
73
+ pipe = PreprocessingPipeline(
74
+ language, pre_steps, lemmatization_step, post_steps
75
+ )
76
+ df = pipe.vaex_process(df, text_column)
77
+
78
+ # prepare input
79
+ with st.spinner("Step 2/4: Preparing inputs"):
80
+                 input_dict = input_transform(df["processed_text"], df[label_column])
81
+
82
+ # wordify
83
+ with st.spinner("Step 3/4: Wordifying"):
84
+ pos, neg = wordifier(**input_dict)
85
+
86
+ # prepare output
87
+ with st.spinner("Step 4/4: Preparing outputs"):
88
+ new_df = output_transform(pos, neg)
89
+
90
+ # col1, col2, col3 = st.columns(3)
91
+ # with col1:
92
+ # st.metric("Total number of words processed", 3, delta_color="normal")
93
+ # with col2:
94
+ # st.metric("Texts processed", 3, delta_color="normal")
95
+ # with col3:
96
+ # st.metric("Texts processed", 3, delta_color="normal")
97
+
98
+ return new_df
99
+
100
+
101
+ def faq():
102
+ st.subheader("Frequently Asked Questions")
103
+ with st.expander("What is Wordify?"):
104
+ st.markdown(
105
+ """
106
+ __Wordify__ is a way to find out which n-grams (i.e., words and concatenations of words) are most indicative for each of your dependent
107
+ variable values.
108
+ """
109
+ )
110
+
111
+ with st.expander("What happens to my data?"):
112
+ st.markdown(
113
+ """
114
+ Nothing. We never store the data you upload on disk: it is only kept in memory for the
115
+ duration of the modeling, and then deleted. We do not retain any copies or traces of
116
+ your data.
117
+ """
118
+ )
119
+
120
+ with st.expander("What input formats do you support?"):
121
+ st.markdown(
122
+ f"""
123
+ We currently support {", ".join([i.name for i in SupportedFiles])}.
124
+ """
125
+ )
126
+
127
+ with st.expander("What languages are supported?"):
128
+ st.markdown(
129
+ f"""
130
+ Currently we support: {", ".join([i.name for i in Languages])}.
131
+ """
132
+ )
133
+
134
+ with st.expander("How does it work?"):
135
+ st.markdown(
136
+ """
137
+ It uses a variant of the Stability Selection algorithm
138
+ [(Meinshausen and Bühlmann, 2010)](https://rss.onlinelibrary.wiley.com/doi/full/10.1111/j.1467-9868.2010.00740.x)
139
+ to fit hundreds of logistic regression models on random subsets of the data, using
140
+ different L1 penalties to drive as many of the term coefficients to 0. Any terms that
141
+ receive a non-zero coefficient in at least 30% of all model runs can be seen as stable
142
+ indicators.
143
+ """
144
+ )
145
+
146
+ with st.expander("What libraries do you use?"):
147
+ st.markdown(
148
+ """
149
+ We leverage the power of many great libraries in the Python ecosystem:
150
+ - `Streamlit`
151
+ - `Pandas`
152
+ - `Numpy`
153
+ - `Spacy`
154
+ - `Scikit-learn`
155
+ - `Vaex`
156
+ """
157
+ )
158
+
159
+ with st.expander("How much data do I need?"):
160
+ st.markdown(
161
+ """
162
+             We recommend at least 2000 instances; the more, the better. With fewer instances, the
163
+ results are less replicable and reliable.
164
+ """
165
+ )
166
+
167
+ with st.expander("Is there a paper I can cite?"):
168
+ st.markdown(
169
+ """
170
+ Yes, please! Cite [Wordify: A Tool for Discovering and Differentiating Consumer Vocabularies](https://academic.oup.com/jcr/article/48/3/394/6199426)
171
+ ```
172
+ @article{10.1093/jcr/ucab018,
173
+ author = {Hovy, Dirk and Melumad, Shiri and Inman, J Jeffrey},
174
+ title = "{Wordify: A Tool for Discovering and Differentiating Consumer Vocabularies}",
175
+ journal = {Journal of Consumer Research},
176
+ volume = {48},
177
+ number = {3},
178
+ pages = {394-414},
179
+ year = {2021},
180
+ month = {03},
181
+ abstract = "{This work describes and illustrates a free and easy-to-use online text-analysis tool for understanding how consumer word use varies across contexts. The tool, Wordify, uses randomized logistic regression (RLR) to identify the words that best discriminate texts drawn from different pre-classified corpora, such as posts written by men versus women, or texts containing mostly negative versus positive valence. We present illustrative examples to show how the tool can be used for such diverse purposes as (1) uncovering the distinctive vocabularies that consumers use when writing reviews on smartphones versus PCs, (2) discovering how the words used in Tweets differ between presumed supporters and opponents of a controversial ad, and (3) expanding the dictionaries of dictionary-based sentiment-measurement tools. We show empirically that Wordify’s RLR algorithm performs better at discriminating vocabularies than support vector machines and chi-square selectors, while offering significant advantages in computing time. A discussion is also provided on the use of Wordify in conjunction with other text-analysis tools, such as probabilistic topic modeling and sentiment analysis, to gain more profound knowledge of the role of language in consumer behavior.}",
182
+ issn = {0093-5301},
183
+ doi = {10.1093/jcr/ucab018},
184
+ url = {https://doi.org/10.1093/jcr/ucab018},
185
+ eprint = {https://academic.oup.com/jcr/article-pdf/48/3/394/40853499/ucab018.pdf},
186
+ }
187
+ ```
188
+ """
189
+ )
190
+
191
+ with st.expander("How can I reach out to the Wordify team?"):
192
+ st.markdown(contacts(), unsafe_allow_html=True)
193
+
194
+
195
+ def presentation():
196
+ st.markdown(
197
+ """
198
+ Wordify makes it easy to identify words that discriminate categories in textual data.
199
+
200
+ :point_left: Start by uploading a file. *Once you upload the file, __Wordify__ will
201
+ show an interactive UI*.
202
+ """
203
+ )
204
+
205
+ st.subheader("Input format")
206
+ st.markdown(
207
+ """
208
+ Please note that your file must have a column with the texts and a column with the labels,
209
+ for example
210
+ """
211
+ )
212
+ st.table(
213
+ {
214
+ "text": ["A review", "Another review", "Yet another one", "etc"],
215
+ "label": ["Good", "Bad", "Good", "etc"],
216
+ }
217
+ )
218
+
219
+ st.subheader("Output format")
220
+ st.markdown(
221
+ """
222
+ As a result of the process, you will get a file containing 4 columns:
223
+ - `Word`: the n-gram (i.e., a word or a concatenation of words) considered
224
+         - `Score`: the Wordify score, between 0 and 1, indicating how important `Word` is to discriminate `Label`
225
+ - `Label`: the label that `Word` is discriminating
226
+ - `Correlation`: how `Word` is correlated with `Label` (e.g., "negative" means that if `Word` is present in the text then the label is less likely to be `Label`)
227
+ """
228
+ )
229
+
230
+
231
+ def footer():
232
+ st.sidebar.markdown(
233
+ """
234
+ <span style="font-size: 0.75em">Built with &hearts; by [`Pietro Lesci`](https://pietrolesci.github.io/) and [`MilaNLP`](https://twitter.com/MilaNLProc?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor).</span>
235
+ """,
236
+ unsafe_allow_html=True,
237
+ )
238
+
239
+
240
+ def contacts():
241
+ return """
242
+     You can reach out to us via email, phone, or mail
243
+
244
+ - :email: wordify@unibocconi.it
245
+
246
+ - :telephone_receiver: +39 02 5836 2604
247
+
248
+ - :postbox: Via Röntgen n. 1, Milan 20136 (ITALY)
249
+
250
+
251
+ <iframe src="https://www.google.com/maps/embed?pb=!1m18!1m12!1m3!1d2798.949796165441!2d9.185730115812493!3d45.450667779100726!2m3!1f0!2f0!3f0!3m2!1i1024!2i768!4f13.1!3m3!1m2!1s0x4786c405ae6543c9%3A0xf2bb2313b36af88c!2sVia%20Guglielmo%20R%C3%B6ntgen%2C%201%2C%2020136%20Milano%20MI!5e0!3m2!1sit!2sit!4v1569325279433!5m2!1sit!2sit" frameborder="0" style="border:0; width: 100%; height: 312px;" allowfullscreen></iframe>
252
+ """
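For orientation, a minimal sketch of how these components might be wired together in a Streamlit entry point. The `app.py` file name and the exact layout are assumptions (the entry point is not shown in this diff); only `form`, `faq`, `presentation`, `footer`, `read_file`, and `download_button` come from the repository.

```python
# Hypothetical app.py: one way the new components could be assembled.
import streamlit as st

from src.components import faq, footer, form, presentation
from src.utils import download_button, read_file

st.set_page_config(page_title="Wordify", layout="wide")

uploaded_file = st.sidebar.file_uploader("Upload a file")  # CSV, XLSX, or PARQUET
footer()

if uploaded_file is None:
    presentation()
    faq()
else:
    df = read_file(uploaded_file)
    new_df = form(df)  # returns the wordified DataFrame once the form is submitted
    if new_df is not None:
        st.dataframe(new_df)
        download_button(new_df, "wordify_results")
```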
src/configs.py CHANGED
@@ -1,4 +1,5 @@
1
  from enum import Enum
 
2
  import pandas as pd
3
 
4
 
@@ -10,6 +11,19 @@ class ModelConfigs(Enum):
10
  MIN_SELECTION = 10_000
11
 
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  class Languages(Enum):
14
  English = "en_core_web_sm"
15
  Italian = "it_core_news_sm"
 
1
  from enum import Enum
2
+
3
  import pandas as pd
4
 
5
 
 
11
  MIN_SELECTION = 10_000
12
 
13
 
14
+ class InputTransformConfigs(Enum):
15
+ NGRAM_RANGE = (1, 3)
16
+ MIN_DF = 0.001
17
+ MAX_DF = 0.75
18
+ SUBLINEAR = True
19
+
20
+
21
+ class PreprocessingConfigs(Enum):
22
+ DEFAULT_PRE = [1, 3, 5, 15, 21, 22, 18, 19, 0, 20, -1]
23
+ DEFAULT_LEMMA = 1
24
+ DEFAULT_POST = [20, -1]
25
+
26
+
27
  class Languages(Enum):
28
  English = "en_core_web_sm"
29
  Italian = "it_core_news_sm"
src/pages/about.py DELETED
@@ -1,34 +0,0 @@
1
- import streamlit as st
2
-
3
-
4
- def write(*args):
5
- # ==== Contacts ==== #
6
- with st.beta_container():
7
- st.markdown("")
8
- st.markdown("")
9
- st.header(":rocket:About us")
10
-
11
- st.markdown(
12
- """
13
- You can reach out to us via email, phone, or - if you are old-fashioned - via mail
14
- """
15
- )
16
- with st.beta_expander("Contacts"):
17
-
18
- _, col2 = st.beta_columns([0.5, 3])
19
- col2.markdown(
20
- """
21
- :email: wordify@unibocconi.it
22
-
23
- :telephone_receiver: +39 02 5836 2604
24
-
25
- :postbox: Via Röntgen n. 1, Milan 20136 (ITALY)
26
- """
27
- )
28
-
29
- st.write(
30
- """
31
- <iframe src="https://www.google.com/maps/embed?pb=!1m18!1m12!1m3!1d2798.949796165441!2d9.185730115812493!3d45.450667779100726!2m3!1f0!2f0!3f0!3m2!1i1024!2i768!4f13.1!3m3!1m2!1s0x4786c405ae6543c9%3A0xf2bb2313b36af88c!2sVia%20Guglielmo%20R%C3%B6ntgen%2C%201%2C%2020136%20Milano%20MI!5e0!3m2!1sit!2sit!4v1569325279433!5m2!1sit!2sit" frameborder="0" style="border:0; width: 100%; height: 312px;" allowfullscreen></iframe>
32
- """,
33
- unsafe_allow_html=True,
34
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/pages/faq.py DELETED
@@ -1,126 +0,0 @@
1
- import streamlit as st
2
- from src.configs import Languages
3
-
4
-
5
- def write(*args):
6
-
7
- # ==== HOW IT WORKS ==== #
8
- with st.beta_container():
9
- st.markdown("")
10
- st.markdown("")
11
- st.markdown(
12
- """
13
- Wordify makes it easy to identify words that discriminate categories in textual data.
14
-
15
- Let's explain Wordify with an example. Imagine you are thinking about having a glass
16
- of wine :wine_glass: with your friends :man-man-girl-girl: and you have to buy a bottle.
17
- You know you like `bold`, `woody` wine but are unsure which one to choose.
18
- You wonder whether there are some words that describe each type of wine.
19
- Since you are a researcher :female-scientist: :male-scientist:, you decide to approach
20
- the problem scientifically :microscope:. That's where Wordify comes to the rescue!
21
- """
22
- )
23
- st.markdown("")
24
- st.markdown("")
25
- st.header("Steps")
26
- st.subheader("Step 1 - Prepare your data")
27
- st.markdown(
28
- """
29
- Create an Excel or CSV file with two columns for each row:
30
-
31
- - a column with the name or the label identifying a specific object or class (e.g., in our
32
- wine example above it would be the type of wine or the name of a specific brand). It is
33
- common practice naming this column `label`
34
-
35
- - a column with the text describing that specific object or class (e.g., in the wine example
36
- above it could be the description that you find on the rear of the bottle label). It is
37
- common practice naming this column `text`
38
-
39
- To have reliable results, we suggest providing at least 2000 labelled texts. If you provide
40
- less we will still wordify your file, but the results should then be taken with a grain of
41
- salt.
42
-
43
- Consider that we also support multi-language texts, therefore you'll be able to
44
- automatically discriminate between international wines, even if your preferred Italian
45
- producer does not provide you with a description written in English!
46
- """
47
- )
48
-
49
- st.subheader("Step 2 - Upload your file and Wordify!")
50
- st.markdown(
51
- """
52
- Once you have prepared your Excel or CSV file, click the "Browse File" button.
53
- Browse for your file.
54
- Choose the language of your texts (select multi-language if your file contains text in
55
- different languages).
56
- Push the "Wordify|" button, set back, and wait for wordify to do its tricks.
57
-
58
- Depending on the size of your data, the process can take from 1 minute to 5 minutes
59
- """
60
- )
61
-
62
- # ==== FAQ ==== #
63
- with st.beta_container():
64
- st.markdown("")
65
- st.markdown("")
66
- st.header(":question:Frequently Asked Questions")
67
- with st.beta_expander("What is Wordify?"):
68
- st.markdown(
69
- """
70
- Wordify is a way to find out which terms are most indicative for each of your dependent
71
- variable values.
72
- """
73
- )
74
-
75
- with st.beta_expander("What happens to my data?"):
76
- st.markdown(
77
- """
78
- Nothing. We never store the data you upload on disk: it is only kept in memory for the
79
- duration of the modeling, and then deleted. We do not retain any copies or traces of
80
- your data.
81
- """
82
- )
83
-
84
- with st.beta_expander("What input formats do you support?"):
85
- st.markdown(
86
- """
87
- The file you upload should be .xlsx, with two columns: the first should be labeled
88
- 'text' and contain all your documents (e.g., tweets, reviews, patents, etc.), one per
89
- line. The second column should be labeled 'label', and contain the dependent variable
90
- label associated with each text (e.g., rating, author gender, company, etc.).
91
- """
92
- )
93
-
94
- with st.beta_expander("How does it work?"):
95
- st.markdown(
96
- """
97
- It uses a variant of the Stability Selection algorithm
98
- [(Meinshausen and Bühlmann, 2010)](https://rss.onlinelibrary.wiley.com/doi/full/10.1111/j.1467-9868.2010.00740.x)
99
- to fit hundreds of logistic regression models on random subsets of the data, using
100
- different L1 penalties to drive as many of the term coefficients to 0. Any terms that
101
- receive a non-zero coefficient in at least 30% of all model runs can be seen as stable
102
- indicators.
103
- """
104
- )
105
-
106
- with st.beta_expander("How much data do I need?"):
107
- st.markdown(
108
- """
109
- We recommend at least 2000 instances, the more, the better. With fewer instances, the
110
- results are less replicable and reliable.
111
- """
112
- )
113
-
114
- with st.beta_expander("Is there a paper I can cite?"):
115
- st.markdown(
116
- """
117
- Yes please! Reference coming soon...
118
- """
119
- )
120
-
121
- with st.beta_expander("What languages are supported?"):
122
- st.markdown(
123
- f"""
124
- Currently we support: {", ".join([i.name for i in Languages])}.
125
- """
126
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/pages/home.py DELETED
@@ -1,240 +0,0 @@
1
- from src.configs import Languages
2
- from src.utils import read_file, download_button
3
- from src.plotting import plot_labels_prop, plot_nchars, plot_score
4
- from src.preprocessing import Lemmatizer, PreprocessingPipeline, encode
5
- from src.wordifier import wordifier
6
- import streamlit as st
7
-
8
-
9
- def write(session, uploaded_file):
10
-
11
- if not uploaded_file:
12
- st.markdown(
13
- """
14
- Hi, welcome to __Wordify__! :rocket:
15
-
16
- Start by uploading a file - CSV, XLSX (avoid Strict Open XML Spreadsheet format [here](https://stackoverflow.com/questions/62800822/openpyxl-cannot-read-strict-open-xml-spreadsheet-format-userwarning-file-conta)),
17
- or PARQUET are currently supported.
18
-
19
- Once you have uploaded the file, __Wordify__ will show an interactive UI through which
20
- you'll be able to interactively decide the text preprocessing steps, their order, and
21
- proceed to Wordify your text.
22
-
23
- If you're ready, let's jump in:
24
-
25
- :point_left: upload a file via the upload widget in the sidebar!
26
-
27
- NOTE: whenever you want to reset everything, simply refresh the page.
28
- """
29
- )
30
-
31
- elif uploaded_file:
32
-
33
- # ==== 1. READ FILE ==== #
34
- with st.spinner("Reading file"):
35
- # TODO: write parser function that automatically understands format
36
- data = read_file(uploaded_file)
37
-
38
- # 2. CREATE UI TO SELECT COLUMNS
39
- col1, col2, col3 = st.beta_columns(3)
40
- with col1:
41
- language = st.selectbox("Select language", [i.name for i in Languages])
42
- with st.beta_expander("Description"):
43
- st.markdown(
44
- f"Select a language amongst those supported: {', '.join([f'`{i.name}`' for i in Languages])}. This will be used to lemmatize and remove stopwords."
45
- )
46
- with col2:
47
- cols_options = [""] + data.columns.tolist()
48
- label_column = st.selectbox(
49
- "Select label column name", cols_options, index=0
50
- )
51
- with st.beta_expander("Description"):
52
- st.markdown("Select the column containing the labels.")
53
-
54
- if label_column:
55
- plot = plot_labels_prop(data, label_column)
56
- if plot:
57
- st.altair_chart(plot, use_container_width=True)
58
-
59
- with col3:
60
- text_column = st.selectbox("Select text column name", cols_options, index=0)
61
- with st.beta_expander("Description"):
62
- st.markdown("Select the column containing the texts.")
63
-
64
- if text_column:
65
- st.altair_chart(
66
- plot_nchars(data, text_column), use_container_width=True
67
- )
68
-
69
- # ==== 2.1 CREATE UI FOR ADVANCED OPTIONS ==== #
70
- with st.beta_expander("Advanced options"):
71
-
72
- steps_options = list(PreprocessingPipeline.pipeline_components().keys())
73
-
74
- # stopwords option and
75
- col1, col2 = st.beta_columns([1, 3])
76
- with col1:
77
- st.markdown("Remove stopwords (uses Spacy vocabulary)")
78
- with col2:
79
- remove_stopwords_elem = st.empty()
80
-
81
- # lemmatization option
82
- col1, col2 = st.beta_columns([1, 3])
83
- with col1:
84
- st.markdown("Lemmatizes text (uses Spacy)")
85
- with col2:
86
- lemmatization_elem = st.empty()
87
-
88
- # pre-lemmatization cleaning steps and
89
- # post-lemmatization cleaning steps
90
- col1, col2 = st.beta_columns([1, 3])
91
- with col1:
92
- st.markdown(
93
- f"""
94
- Define a pipeline of cleaning steps that is applied before and/or after lemmatization.
95
- The available cleaning steps are:\n
96
- {", ".join([f"`{x.replace('_', ' ').title()}`" for x in steps_options])}
97
- """
98
- )
99
- with col2:
100
- pre_steps_elem = st.empty()
101
- post_steps_elem = st.empty()
102
- reset_button = st.empty()
103
-
104
- # implement reset logic
105
- if reset_button.button("Reset steps"):
106
- session.run_id += 1
107
-
108
- pre_steps = pre_steps_elem.multiselect(
109
- "Select pre-lemmatization preprocessing steps (ordered)",
110
- options=steps_options,
111
- default=steps_options,
112
- format_func=lambda x: x.replace("_", " ").title(),
113
- key=session.run_id,
114
- )
115
- post_steps = post_steps_elem.multiselect(
116
- "Select post-lemmatization processing steps (ordered)",
117
- options=steps_options,
118
- default=steps_options[-4:],
119
- format_func=lambda x: x.replace("_", " ").title(),
120
- key=session.run_id,
121
- )
122
- remove_stopwords = remove_stopwords_elem.checkbox(
123
- "Remove stopwords",
124
- value=True,
125
- key=session.run_id,
126
- )
127
- lemmatization = lemmatization_elem.checkbox(
128
- "Lemmatize text",
129
- value=True,
130
- key=session.run_id,
131
- )
132
-
133
- # show sample checkbox
134
- col1, col2 = st.beta_columns([1, 2])
135
- with col1:
136
- show_sample = st.checkbox("Show sample of preprocessed text")
137
-
138
- # initialize text preprocessor
139
- preprocessing_pipeline = PreprocessingPipeline(
140
- pre_steps=pre_steps,
141
- lemmatizer=Lemmatizer(
142
- language=language,
143
- remove_stop=remove_stopwords,
144
- lemmatization=lemmatization,
145
- ),
146
- post_steps=post_steps,
147
- )
148
-
149
- print(preprocessing_pipeline.pre_steps)
150
-
151
- # ==== 3. PROVIDE FEEDBACK ON OPTIONS ==== #
152
- if show_sample and not (label_column and text_column):
153
- st.warning("Please select `label` and `text` columns")
154
-
155
- elif show_sample and (label_column and text_column):
156
- sample_data = data.sample(5)
157
- sample_data[f"preprocessed_{text_column}"] = preprocessing_pipeline(
158
- sample_data[text_column]
159
- ).values
160
-
161
- print(sample_data)
162
- st.table(
163
- sample_data.loc[
164
- :, [label_column, text_column, f"preprocessed_{text_column}"]
165
- ]
166
- )
167
-
168
- # ==== 4. RUN ==== #
169
- run_button = st.button("Wordify!")
170
- if run_button and not (label_column and text_column):
171
- st.warning("Please select `label` and `text` columns")
172
-
173
- elif run_button and (label_column and text_column) and not session.process:
174
-
175
- with st.spinner("Process started"):
176
- # data = data.head()
177
- data[f"preprocessed_{text_column}"] = preprocessing_pipeline(
178
- data[text_column]
179
- ).values
180
-
181
- print(data.head())
182
-
183
- inputs = encode(data[f"preprocessed_{text_column}"], data[label_column])
184
- session.posdf, session.negdf = wordifier(**inputs)
185
- st.success("Wordified!")
186
-
187
- # session.posdf, session.negdf = process(data, text_column, label_column)
188
- session.process = True
189
-
190
- # ==== 5. RESULTS ==== #
191
- if session.process and (label_column and text_column):
192
- st.markdown("")
193
- st.markdown("")
194
- st.header("Results")
195
-
196
- # col1, col2, _ = st.beta_columns(3)
197
- col1, col2, col3 = st.beta_columns([2, 3, 3])
198
-
199
- with col1:
200
- label = st.selectbox(
201
- "Select label", data[label_column].unique().tolist()
202
- )
203
- # # with col2:
204
- # thres = st.slider(
205
- # "Select threshold",
206
- # min_value=0,
207
- # max_value=100,
208
- # step=1,
209
- # format="%f",
210
- # value=30,
211
- # )
212
- show_plots = st.checkbox("Show plots of top 100")
213
-
214
- with col2:
215
- st.subheader(f"Words __positively__ identifying label `{label}`")
216
- st.write(
217
- session.posdf[session.posdf[label_column] == label].sort_values(
218
- "score", ascending=False
219
- )
220
- )
221
- download_button(session.posdf, "positive_data")
222
- if show_plots:
223
- st.altair_chart(
224
- plot_score(session.posdf, label_column, label),
225
- use_container_width=True,
226
- )
227
-
228
- with col3:
229
- st.subheader(f"Words __negatively__ identifying label `{label}`")
230
- st.write(
231
- session.negdf[session.negdf[label_column] == label].sort_values(
232
- "score", ascending=False
233
- )
234
- )
235
- download_button(session.negdf, "negative_data")
236
- if show_plots:
237
- st.altair_chart(
238
- plot_score(session.negdf, label_column, label),
239
- use_container_width=True,
240
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/plotting.py DELETED
@@ -1,84 +0,0 @@
1
- import altair as alt
2
- import pandas as pd
3
- import streamlit as st
4
- from stqdm import stqdm
5
-
6
- stqdm.pandas()
7
-
8
-
9
- def plot_labels_prop(data: pd.DataFrame, label_column: str):
10
-
11
- unique_value_limit = 100
12
-
13
- if data[label_column].nunique() > unique_value_limit:
14
-
15
- st.warning(
16
- f"""
17
- The column you selected has more than {unique_value_limit}.
18
- Are you sure it's the right column? If it is, please note that
19
- this will impact __Wordify__ performance.
20
- """
21
- )
22
-
23
- return
24
-
25
- source = (
26
- data[label_column]
27
- .value_counts()
28
- .reset_index()
29
- .rename(columns={"index": "Labels", label_column: "Counts"})
30
- )
31
- source["Props"] = source["Counts"] / source["Counts"].sum()
32
- source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"
33
-
34
- bars = (
35
- alt.Chart(source)
36
- .mark_bar()
37
- .encode(
38
- x=alt.X("Labels:O", sort="-y"),
39
- y="Counts:Q",
40
- )
41
- )
42
-
43
- text = bars.mark_text(align="center", baseline="middle", dy=15).encode(
44
- text="Proportions:O"
45
- )
46
-
47
- return (bars + text).properties(height=300)
48
-
49
-
50
- def plot_nchars(data: pd.DataFrame, text_column: str):
51
- source = data[text_column].str.len().to_frame()
52
-
53
- plot = (
54
- alt.Chart(source)
55
- .mark_bar()
56
- .encode(
57
- alt.X(
58
- f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")
59
- ),
60
- alt.Y("count()", axis=alt.Axis(title="")),
61
- )
62
- )
63
-
64
- return plot.properties(height=300)
65
-
66
-
67
- def plot_score(data: pd.DataFrame, label_col: str, label: str):
68
-
69
- source = (
70
- data.loc[data[label_col] == label]
71
- .sort_values("score", ascending=False)
72
- .head(100)
73
- )
74
-
75
- plot = (
76
- alt.Chart(source)
77
- .mark_bar()
78
- .encode(
79
- y=alt.Y("word:O", sort="-x"),
80
- x="score:Q",
81
- )
82
- )
83
-
84
- return plot.properties(height=max(30 * source.shape[0], 50))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/preprocessing.py CHANGED
@@ -1,56 +1,20 @@
 
 
1
  import re
2
  import string
3
  from collections import OrderedDict
4
- from typing import Callable, List, Optional, Tuple
5
 
6
- import numpy as np
7
  import pandas as pd
8
  import spacy
9
  import streamlit as st
 
 
10
  from pandas.core.series import Series
11
- from sklearn.feature_extraction.text import TfidfVectorizer
12
- from sklearn.preprocessing import LabelEncoder
13
- from stqdm import stqdm
14
  from textacy.preprocessing import make_pipeline, normalize, remove, replace
15
 
16
  from .configs import Languages
17
 
18
- stqdm.pandas()
19
-
20
-
21
- def encode(text: pd.Series, labels: pd.Series):
22
- """
23
- Encodes text in mathematical object ameanable to training algorithm
24
- """
25
- tfidf_vectorizer = TfidfVectorizer(
26
- input="content", # default: file already in memory
27
- encoding="utf-8", # default
28
- decode_error="strict", # default
29
- strip_accents=None, # do nothing
30
- lowercase=False, # do nothing
31
- preprocessor=None, # do nothing - default
32
- tokenizer=None, # default
33
- stop_words=None, # do nothing
34
- analyzer="word",
35
- ngram_range=(1, 3), # maximum 3-ngrams
36
- min_df=0.001,
37
- max_df=0.75,
38
- sublinear_tf=True,
39
- )
40
- label_encoder = LabelEncoder()
41
-
42
- with st.spinner("Encoding text using TF-IDF and Encoding labels"):
43
- X = tfidf_vectorizer.fit_transform(text.values)
44
- y = label_encoder.fit_transform(labels.values)
45
-
46
- return {
47
- "X": X,
48
- "y": y,
49
- "X_names": np.array(tfidf_vectorizer.get_feature_names()),
50
- "y_names": label_encoder.classes_,
51
- }
52
-
53
-
54
  # more [here](https://github.com/fastai/fastai/blob/master/fastai/text/core.py#L42)
55
  # and [here](https://textacy.readthedocs.io/en/latest/api_reference/preprocessing.html)
56
  # fmt: off
@@ -87,118 +51,105 @@ def normalize_repeating_words(t):
87
  return _re_wrep.sub(_replace_wrep, t)
88
 
89
 
90
- # fmt: on
91
- class Lemmatizer:
92
- """Creates lemmatizer based on spacy"""
93
 
94
- def __init__(
95
- self, language: str, remove_stop: bool = True, lemmatization: bool = True
96
- ) -> None:
97
- self.language = language
98
- self.nlp = spacy.load(
99
- Languages[language].value, exclude=["parser", "ner", "pos", "tok2vec"]
100
- )
101
- self._lemmatizer_fn = self._get_lemmatization_fn(remove_stop, lemmatization)
102
- self.lemmatization = lemmatization
103
 
104
- def _get_lemmatization_fn(
105
- self, remove_stop: bool, lemmatization: bool
106
- ) -> Optional[Callable]:
107
- """Return the correct spacy Doc-level lemmatizer"""
108
- if remove_stop and lemmatization:
109
 
110
- def lemmatizer_fn(doc: spacy.tokens.doc.Doc) -> str:
111
- return " ".join(
112
- [t.lemma_ for t in doc if t.lemma_ != "-PRON-" and not t.is_stop]
113
- )
114
 
115
- elif remove_stop and not lemmatization:
 
 
 
116
 
117
- def lemmatizer_fn(doc: spacy.tokens.doc.Doc) -> str:
118
- return " ".join([t.text for t in doc if not t.is_stop])
119
 
120
- elif lemmatization and not remove_stop:
 
121
 
122
- def lemmatizer_fn(doc: spacy.tokens.doc.Doc) -> str:
123
- return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-"])
124
 
125
- else:
126
- self.status = False
127
- return
128
-
129
- return lemmatizer_fn
130
-
131
- def __call__(self, series: Series) -> Series:
132
- """
133
- Apply spacy pipeline to transform string to spacy Doc and applies lemmatization
134
- """
135
- res = []
136
- pbar = stqdm(total=len(series), desc="Lemmatizing")
137
- for doc in self.nlp.pipe(series, batch_size=500):
138
- res.append(self._lemmatizer_fn(doc))
139
- pbar.update(1)
140
- pbar.close()
141
- return pd.Series(res)
142
 
143
 
 
144
  class PreprocessingPipeline:
145
  def __init__(
146
- self, pre_steps: List[str], lemmatizer: Lemmatizer, post_steps: List[str]
 
 
 
 
147
  ):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
- # build pipeline
150
- self.pre_pipeline, self.lemmatizer, self.post_pipeline = self.make_pipeline(
151
- pre_steps, lemmatizer, post_steps
 
 
 
 
152
  )
153
 
154
- def __call__(self, series: Series) -> Series:
155
- with st.spinner("Pre-lemmatization cleaning"):
156
- res = series.progress_map(self.pre_pipeline)
157
 
158
- with st.spinner("Lemmatizing"):
159
- res = self.lemmatizer(series)
 
160
 
161
- with st.spinner("Post-lemmatization cleaning"):
162
- res = series.progress_map(self.post_pipeline)
 
 
 
 
 
 
163
 
164
- return res
 
165
 
166
- def make_pipeline(
167
- self, pre_steps: List[str], lemmatizer: Lemmatizer, post_steps: List[str]
168
- ) -> Tuple[Callable]:
169
 
170
- # pre-lemmatization steps
171
- pre_steps = [
172
- self.pipeline_components()[step]
173
- for step in pre_steps
174
- if step in self.pipeline_components()
175
- ]
176
- pre_steps = make_pipeline(*pre_steps) if pre_steps else lambda x: x
177
 
178
- # lemmatization
179
- lemmatizer = lemmatizer if lemmatizer.lemmatization else lambda x: x
180
 
181
- # post lemmatization steps
182
- post_steps = [
183
- self.pipeline_components()[step]
184
- for step in post_steps
185
- if step in self.pipeline_components()
186
- ]
187
- post_steps = make_pipeline(*post_steps) if post_steps else lambda x: x
188
 
189
- return pre_steps, lemmatizer, post_steps
190
 
191
  @staticmethod
192
  def pipeline_components() -> "OrderedDict[str, Callable]":
193
  """Returns available cleaning steps in order"""
194
  return OrderedDict(
195
  [
196
- ("lower", lambda x: x.lower()),
197
  ("normalize_unicode", normalize.unicode),
198
  ("normalize_bullet_points", normalize.bullet_points),
199
  ("normalize_hyphenated_words", normalize.hyphenated_words),
200
  ("normalize_quotation_marks", normalize.quotation_marks),
201
- ("normalize_whitespace", normalize.whitespace),
202
  ("replace_urls", replace.urls),
203
  ("replace_currency_symbols", replace.currency_symbols),
204
  ("replace_emails", replace.emails),
@@ -216,6 +167,17 @@ class PreprocessingPipeline:
216
  ("normalize_useless_spaces", normalize_useless_spaces),
217
  ("normalize_repeating_chars", normalize_repeating_chars),
218
  ("normalize_repeating_words", normalize_repeating_words),
219
- ("strip", lambda x: x.strip()),
 
 
 
 
 
 
 
 
 
 
 
220
  ]
221
  )
 
1
+ import multiprocessing as mp
2
+ import os
3
  import re
4
  import string
5
  from collections import OrderedDict
6
+ from typing import Callable, List, Optional
7
 
 
8
  import pandas as pd
9
  import spacy
10
  import streamlit as st
11
+ import vaex
12
+ from pandas.core.frame import DataFrame
13
  from pandas.core.series import Series
 
 
 
14
  from textacy.preprocessing import make_pipeline, normalize, remove, replace
15
 
16
  from .configs import Languages
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  # more [here](https://github.com/fastai/fastai/blob/master/fastai/text/core.py#L42)
19
  # and [here](https://textacy.readthedocs.io/en/latest/api_reference/preprocessing.html)
20
  # fmt: off
 
51
  return _re_wrep.sub(_replace_wrep, t)
52
 
53
 
54
+ def lowercase(t: str) -> str:
55
+ return t.lower()
 
56
 
 
 
 
 
 
 
 
 
 
57
 
58
+ def strip(t: str) -> str:
59
+ return t.strip()
 
 
 
60
 
 
 
 
 
61
 
62
+ def lemmatize_remove_stopwords(doc: spacy.tokens.doc.Doc) -> str:
63
+ return " ".join(
64
+ [t.lemma_ for t in doc if t.lemma_ != "-PRON-" and not t.is_stop]
65
+ )
66
 
 
 
67
 
68
+ def remove_stopwords(doc: spacy.tokens.doc.Doc) -> str:
69
+ return " ".join([t.text for t in doc if not t.is_stop])
70
 
 
 
71
 
72
+ def lemmatize_keep_stopwords(doc: spacy.tokens.doc.Doc) -> str:
73
+ return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
 
76
+ # fmt: on
77
  class PreprocessingPipeline:
78
  def __init__(
79
+ self,
80
+ language: str,
81
+ pre_steps: Optional[List[str]],
82
+ lemmatization_step: Optional[str],
83
+ post_steps: Optional[List[str]],
84
  ):
85
+ self.language = language
86
+ self.pre_steps = pre_steps
87
+ self.lemmatization_step = lemmatization_step
88
+ self.post_steps = post_steps
89
+
90
+ self.nlp = spacy.load(Languages[language].value, disable=["parser", "ner"])
91
+ self.pre = self.make_pre_post_component(self.pre_steps)
92
+ self.post = self.make_pre_post_component(self.post_steps)
93
+ self.lemma = self.lemmatization_component()[self.lemmatization_step]
94
+
95
+     @staticmethod
+     def apply_multiproc(fn, series):
96
+ with mp.Pool(mp.cpu_count()) as pool:
97
+ new_series = pool.map(fn, series)
98
+
99
+ return new_series
100
 
101
+ def vaex_process(self, df: DataFrame, text_column: str) -> DataFrame:
102
+ def fn(t):
103
+ return self.post(self.lemma(self.nlp(self.pre(t))))
104
+
105
+ vdf = vaex.from_pandas(df)
106
+ vdf["processed_text"] = vdf.apply(
107
+ fn, arguments=[vdf[text_column]], vectorize=False
108
  )
109
 
110
+ return vdf.to_pandas_df()
 
 
111
 
112
+ def __call__(self, series: Series) -> Series:
113
+ if self.pre:
114
+ series = series.map(self.pre)
115
 
116
+ if self.lemma:
117
+             total_steps = max(len(series) // 100, 1)
118
+ res = []
119
+ pbar = st.progress(0)
120
+ for i, doc in enumerate(
121
+ self.nlp.pipe(series, batch_size=500, n_process=os.cpu_count())
122
+ ):
123
+ res.append(self.lemma(doc))
124
 
125
+ if i % total_steps == 0:
126
+                     pbar.progress(min(i // total_steps, 100))
127
 
128
+ series = pd.Series(res)
 
 
129
 
130
+ if self.post:
131
+ series = series.map(self.post)
 
 
 
 
 
132
 
133
+ return series
 
134
 
135
+ def make_pre_post_component(self, steps: Optional[List[str]]) -> Optional[Callable]:
136
+ if not steps:
137
+ return
138
+ components = [self.pipeline_components()[step] for step in steps]
 
 
 
139
 
140
+ return make_pipeline(*components)
141
 
142
  @staticmethod
143
  def pipeline_components() -> "OrderedDict[str, Callable]":
144
  """Returns available cleaning steps in order"""
145
  return OrderedDict(
146
  [
147
+ ("lowercase", lowercase),
148
  ("normalize_unicode", normalize.unicode),
149
  ("normalize_bullet_points", normalize.bullet_points),
150
  ("normalize_hyphenated_words", normalize.hyphenated_words),
151
  ("normalize_quotation_marks", normalize.quotation_marks),
152
+ ("normalize_whitespaces", normalize.whitespace),
153
  ("replace_urls", replace.urls),
154
  ("replace_currency_symbols", replace.currency_symbols),
155
  ("replace_emails", replace.emails),
 
167
  ("normalize_useless_spaces", normalize_useless_spaces),
168
  ("normalize_repeating_chars", normalize_repeating_chars),
169
  ("normalize_repeating_words", normalize_repeating_words),
170
+ ("strip", strip),
171
+ ]
172
+ )
173
+
174
+ @staticmethod
175
+ def lemmatization_component() -> "OrderedDict[str, Optional[Callable]]":
176
+ return OrderedDict(
177
+ [
178
+ ("Spacy lemmatizer (keep stopwords)", lemmatize_keep_stopwords),
179
+ ("Spacy lemmatizer (no stopwords)", lemmatize_remove_stopwords),
180
+ ("Disable lemmatizer", None),
181
+ ("Remove stopwords", remove_stopwords),
182
  ]
183
  )
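A minimal sketch of driving the refactored `PreprocessingPipeline` directly, outside the Streamlit form. It assumes the spaCy English model is installed; the step names must match the keys of `pipeline_components()` and `lemmatization_component()`, and the sample texts are made up.

```python
import pandas as pd

from src.preprocessing import PreprocessingPipeline

pipe = PreprocessingPipeline(
    language="English",  # a key of the Languages enum
    pre_steps=["lowercase", "normalize_unicode", "replace_urls"],
    lemmatization_step="Spacy lemmatizer (no stopwords)",
    post_steps=["normalize_useless_spaces", "strip"],
)

texts = pd.Series(["I LOVED this wine!!", "Visit https://example.com for more."])
print(pipe(texts))  # Streamlit progress calls run in bare mode outside an app
```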
src/session_state.py DELETED
@@ -1,121 +0,0 @@
1
- """Hack to add per-session state to Streamlit.
2
-
3
- Usage
4
- -----
5
-
6
- >>> import SessionState
7
- >>>
8
- >>> session_state = SessionState.get(user_name='', favorite_color='black')
9
- >>> session_state.user_name
10
- ''
11
- >>> session_state.user_name = 'Mary'
12
- >>> session_state.favorite_color
13
- 'black'
14
-
15
- Since you set user_name above, next time your script runs this will be the
16
- result:
17
- >>> session_state = get(user_name='', favorite_color='black')
18
- >>> session_state.user_name
19
- 'Mary'
20
-
21
- """
22
- try:
23
- import streamlit.ReportThread as ReportThread
24
- from streamlit.server.Server import Server
25
- except Exception:
26
- # Streamlit >= 0.65.0
27
- import streamlit.report_thread as ReportThread
28
- from streamlit.server.server import Server
29
-
30
-
31
- class SessionState(object):
32
- def __init__(self, **kwargs):
33
- """A new SessionState object.
34
-
35
- Parameters
36
- ----------
37
- **kwargs : any
38
- Default values for the session state.
39
-
40
- Example
41
- -------
42
- >>> session_state = SessionState(user_name='', favorite_color='black')
43
- >>> session_state.user_name = 'Mary'
44
- ''
45
- >>> session_state.favorite_color
46
- 'black'
47
-
48
- """
49
- for key, val in kwargs.items():
50
- setattr(self, key, val)
51
-
52
-
53
- def get(**kwargs):
54
- """Gets a SessionState object for the current session.
55
-
56
- Creates a new object if necessary.
57
-
58
- Parameters
59
- ----------
60
- **kwargs : any
61
- Default values you want to add to the session state, if we're creating a
62
- new one.
63
-
64
- Example
65
- -------
66
- >>> session_state = get(user_name='', favorite_color='black')
67
- >>> session_state.user_name
68
- ''
69
- >>> session_state.user_name = 'Mary'
70
- >>> session_state.favorite_color
71
- 'black'
72
-
73
- Since you set user_name above, next time your script runs this will be the
74
- result:
75
- >>> session_state = get(user_name='', favorite_color='black')
76
- >>> session_state.user_name
77
- 'Mary'
78
-
79
- """
80
- # Hack to get the session object from Streamlit.
81
-
82
- ctx = ReportThread.get_report_ctx()
83
-
84
- this_session = None
85
-
86
- current_server = Server.get_current()
87
- if hasattr(current_server, "_session_infos"):
88
- # Streamlit < 0.56
89
- session_infos = Server.get_current()._session_infos.values()
90
- else:
91
- session_infos = Server.get_current()._session_info_by_id.values()
92
-
93
- for session_info in session_infos:
94
- s = session_info.session
95
- if (
96
- # Streamlit < 0.54.0
97
- (hasattr(s, "_main_dg") and s._main_dg == ctx.main_dg)
98
- or
99
- # Streamlit >= 0.54.0
100
- (not hasattr(s, "_main_dg") and s.enqueue == ctx.enqueue)
101
- or
102
- # Streamlit >= 0.65.2
103
- (
104
- not hasattr(s, "_main_dg")
105
- and s._uploaded_file_mgr == ctx.uploaded_file_mgr
106
- )
107
- ):
108
- this_session = s
109
-
110
- if this_session is None:
111
- raise RuntimeError(
112
- "Oh noes. Couldn't get your Streamlit Session object. "
113
- "Are you doing something fancy with threads?"
114
- )
115
-
116
- # Got the session object! Now let's attach some state into it.
117
-
118
- if not hasattr(this_session, "_custom_session_state"):
119
- this_session._custom_session_state = SessionState(**kwargs)
120
-
121
- return this_session._custom_session_state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/utils.py CHANGED
@@ -1,34 +1,31 @@
1
  import base64
 
2
  import altair as alt
3
  import pandas as pd
4
  import streamlit as st
5
  from PIL import Image
6
- from stqdm import stqdm
7
 
8
  from .configs import SupportedFiles
9
 
10
- stqdm.pandas()
11
-
12
 
13
  @st.cache
14
  def get_logo(path):
15
  return Image.open(path)
16
 
17
 
18
- # @st.cache(suppress_st_warning=True)
19
- @st.cache(allow_output_mutation=True)
20
  def read_file(uploaded_file) -> pd.DataFrame:
21
-
22
  file_type = uploaded_file.name.split(".")[-1]
23
- if file_type in set(i.name for i in SupportedFiles):
24
- read_f = SupportedFiles[file_type].value[0]
25
- df = read_f(uploaded_file)
26
- # remove any NA
27
- df = df.dropna()
28
- return df
29
-
30
- else:
31
- st.error("File type not supported")
 
32
 
33
 
34
  def download_button(dataframe: pd.DataFrame, name: str):
 
1
  import base64
2
+
3
  import altair as alt
4
  import pandas as pd
5
  import streamlit as st
6
  from PIL import Image
 
7
 
8
  from .configs import SupportedFiles
9
 
 
 
10
 
11
  @st.cache
12
  def get_logo(path):
13
  return Image.open(path)
14
 
15
 
16
+ @st.experimental_memo
 
17
  def read_file(uploaded_file) -> pd.DataFrame:
 
18
  file_type = uploaded_file.name.split(".")[-1]
19
+ read_fn = SupportedFiles[file_type].value[0]
20
+ df = read_fn(uploaded_file)
21
+ df = df.dropna()
22
+ return df
23
+
24
+
25
+ @st.cache
26
+ def convert_df(df):
27
+ # IMPORTANT: Cache the conversion to prevent computation on every rerun
28
+ return df.to_csv(index=False, sep=";").encode("utf-8")
29
 
30
 
31
  def download_button(dataframe: pd.DataFrame, name: str):
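The new cached `convert_df` pairs naturally with Streamlit's built-in download widget. Whether the repo's own `download_button` helper does exactly this is not visible in the diff, so the snippet below is only a sketch:

```python
import pandas as pd
import streamlit as st

from src.utils import convert_df

df = pd.DataFrame({"word": ["bold", "woody"], "score": [0.8, 0.6]})
csv_bytes = convert_df(df)  # cached ';'-separated CSV, UTF-8 encoded
st.download_button("Download results", data=csv_bytes, file_name="wordify.csv", mime="text/csv")
```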
src/wordifier.py CHANGED
@@ -1,17 +1,58 @@
1
- from typing import List
 
2
  import numpy as np
3
  import pandas as pd
4
  import streamlit as st
 
 
5
  from sklearn.linear_model import LogisticRegression
 
6
  from sklearn.utils import resample
7
- from stqdm import stqdm
8
 
9
- from .configs import ModelConfigs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
- stqdm.pandas()
 
 
 
 
 
12
 
13
 
14
- def wordifier(X, y, X_names: List[str], y_names: List[str], configs=ModelConfigs):
 
 
 
 
 
 
15
 
16
  n_instances, n_features = X.shape
17
  n_classes = len(y_names)
@@ -36,70 +77,80 @@ def wordifier(X, y, X_names: List[str], y_names: List[str], configs=ModelConfigs
36
  pos_scores = np.zeros((n_classes, n_features), dtype=int)
37
  neg_scores = np.zeros((n_classes, n_features), dtype=int)
38
 
39
- with st.spinner("Wordifying!"):
40
-
41
- for _ in stqdm(range(configs.NUM_ITERS.value)):
42
-
43
- # run randomized regression
44
- clf = LogisticRegression(
45
- penalty="l1",
46
- C=configs.PENALTIES.value[
47
- np.random.randint(len(configs.PENALTIES.value))
48
- ],
49
- solver="liblinear",
50
- multi_class="auto",
51
- max_iter=500,
52
- class_weight="balanced",
53
- )
54
-
55
- # sample indices to subsample matrix
56
- selection = resample(
57
- np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size
58
- )
59
-
60
- # fit
61
- try:
62
- clf.fit(X[selection], y[selection])
63
- except ValueError:
64
- continue
65
-
66
- # record coefficients
67
- if n_classes == 2:
68
- pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)
69
- neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)
70
- pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)
71
- neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)
72
- else:
73
- pos_scores += clf.coef_ > 0
74
- neg_scores += clf.coef_ < 0
75
-
76
- # normalize
77
- pos_scores = pos_scores / configs.NUM_ITERS.value
78
- neg_scores = neg_scores / configs.NUM_ITERS.value
79
-
80
- # get only active features
81
- pos_positions = np.where(
82
- pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0
83
  )
84
- neg_positions = np.where(
85
- neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0
 
 
86
  )
87
 
88
- # prepare DataFrame
89
- pos = [
90
- (X_names[i], pos_scores[c, i], y_names[c])
91
- for c, i in zip(*pos_positions.nonzero())
92
- ]
93
- neg = [
94
- (X_names[i], neg_scores[c, i], y_names[c])
95
- for c, i in zip(*neg_positions.nonzero())
96
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
 
 
 
 
98
  posdf = pd.DataFrame(pos, columns="word score label".split()).sort_values(
99
  ["label", "score"], ascending=False
100
  )
 
101
  negdf = pd.DataFrame(neg, columns="word score label".split()).sort_values(
102
  ["label", "score"], ascending=False
103
  )
 
 
 
 
104
 
105
- return posdf, negdf
 
1
+ from typing import Dict, List, Tuple
2
+
3
  import numpy as np
4
  import pandas as pd
5
  import streamlit as st
6
+ from pandas.core.frame import DataFrame
7
+ from sklearn.feature_extraction.text import TfidfVectorizer
8
  from sklearn.linear_model import LogisticRegression
9
+ from sklearn.preprocessing import LabelEncoder
10
  from sklearn.utils import resample
 
11
 
12
+ from .configs import InputTransformConfigs, ModelConfigs
13
+
14
+
15
+ def input_transform(
16
+ text: pd.Series, labels: pd.Series, configs=InputTransformConfigs
17
+ ) -> Dict[str, np.ndarray]:
18
+ """
19
+     Encodes text into a mathematical object amenable to the training algorithm
20
+ """
21
+ tfidf_vectorizer = TfidfVectorizer(
22
+ input="content", # default: file already in memory
23
+ encoding="utf-8", # default
24
+ decode_error="strict", # default
25
+ strip_accents=None, # do nothing
26
+ lowercase=False, # do nothing
27
+ preprocessor=None, # do nothing - default
28
+ tokenizer=None, # default
29
+ stop_words=None, # do nothing
30
+ analyzer="word",
31
+         ngram_range=configs.NGRAM_RANGE.value,  # by default, up to 3-grams
32
+ min_df=configs.MIN_DF.value,
33
+ max_df=configs.MAX_DF.value,
34
+ sublinear_tf=configs.SUBLINEAR.value,
35
+ )
36
+ label_encoder = LabelEncoder()
37
+
38
+ X = tfidf_vectorizer.fit_transform(text.values)
39
+ y = label_encoder.fit_transform(labels.values)
40
 
41
+ return {
42
+ "X": X,
43
+ "y": y,
44
+ "X_names": np.array(tfidf_vectorizer.get_feature_names_out()),
45
+ "y_names": label_encoder.classes_,
46
+ }
47
 
48
 
49
+ def wordifier(
50
+ X: np.ndarray,
51
+ y: np.ndarray,
52
+ X_names: List[str],
53
+ y_names: List[str],
54
+ configs=ModelConfigs,
55
+ ) -> List[Tuple[str, float, str]]:
56
 
57
  n_instances, n_features = X.shape
58
  n_classes = len(y_names)
 
77
  pos_scores = np.zeros((n_classes, n_features), dtype=int)
78
  neg_scores = np.zeros((n_classes, n_features), dtype=int)
79
 
80
+ pbar = st.progress(0)
81
+     for i in range(configs.NUM_ITERS.value):
82
+
83
+ # run randomized regression
84
+ clf = LogisticRegression(
85
+ penalty="l1",
86
+ C=configs.PENALTIES.value[np.random.randint(len(configs.PENALTIES.value))],
87
+ solver="liblinear",
88
+ multi_class="auto",
89
+ max_iter=500,
90
+ class_weight="balanced",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  )
92
+
93
+ # sample indices to subsample matrix
94
+ selection = resample(
95
+ np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size
96
  )
97
 
98
+ # fit
99
+ try:
100
+ clf.fit(X[selection], y[selection])
101
+ except ValueError:
102
+ continue
103
+
104
+ # record coefficients
105
+ if n_classes == 2:
106
+ pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)
107
+ neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)
108
+ pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)
109
+ neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)
110
+ else:
111
+ pos_scores += clf.coef_ > 0
112
+ neg_scores += clf.coef_ < 0
113
+
114
+ pbar.progress(round(i / configs.NUM_ITERS.value, 1))
115
+
116
+ # normalize
117
+ pos_scores = pos_scores / configs.NUM_ITERS.value
118
+ neg_scores = neg_scores / configs.NUM_ITERS.value
119
+
120
+ # get only active features
121
+ pos_positions = np.where(
122
+ pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0
123
+ )
124
+ neg_positions = np.where(
125
+ neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0
126
+ )
127
+
128
+ # prepare DataFrame
129
+ pos = [
130
+ (X_names[i], pos_scores[c, i], y_names[c])
131
+ for c, i in zip(*pos_positions.nonzero())
132
+ ]
133
+ neg = [
134
+ (X_names[i], neg_scores[c, i], y_names[c])
135
+ for c, i in zip(*neg_positions.nonzero())
136
+ ]
137
+
138
+ return pos, neg
139
 
140
+
141
+ def output_transform(
142
+ pos: List[Tuple[str, float, str]], neg: List[Tuple[str, float, str]]
143
+ ) -> DataFrame:
144
  posdf = pd.DataFrame(pos, columns="word score label".split()).sort_values(
145
  ["label", "score"], ascending=False
146
  )
147
+ posdf["correlation"] = "positive"
148
  negdf = pd.DataFrame(neg, columns="word score label".split()).sort_values(
149
  ["label", "score"], ascending=False
150
  )
151
+ negdf["correlation"] = "negative"
152
+
153
+ output = pd.concat([posdf, negdf], ignore_index=False, axis=0)
154
+ output.columns = output.columns.str.title()
155
 
156
+ return output
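Putting the refactored pieces together, a minimal end-to-end sketch of the new wordifier API on made-up toy data (far below the recommended 2000 instances, so it only illustrates the call signatures, not meaningful scores):

```python
import pandas as pd

from src.wordifier import input_transform, output_transform, wordifier

texts = pd.Series(["great bold wine", "flat and sour", "lovely woody notes", "tastes like vinegar"])
labels = pd.Series(["Good", "Bad", "Good", "Bad"])

inputs = input_transform(texts, labels)  # TF-IDF features + encoded labels
pos, neg = wordifier(**inputs)           # stability selection over L1 logistic regressions
results = output_transform(pos, neg)     # columns: Word, Score, Label, Correlation
print(results.head())
```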