Pietro Lesci committed on
Commit
c700823
1 Parent(s): 51cab9d

enhanced UI

Browse files
Files changed (2) hide show
  1. src/pages/home.py +64 -61
  2. src/preprocessing.py +74 -63
src/pages/home.py CHANGED
@@ -1,13 +1,7 @@
1
  from src.configs import Languages
2
- from src.utils import (
3
- encode,
4
- download_button,
5
- TextPreprocessor,
6
- plot_labels_prop,
7
- plot_nchars,
8
- plot_score,
9
- read_file,
10
- )
11
  from src.wordifier import wordifier
12
  import streamlit as st
13
 
@@ -36,7 +30,7 @@ def write(session, uploaded_file):
36
 
37
  elif uploaded_file:
38
 
39
- # 1. READ FILE
40
  with st.spinner("Reading file"):
41
  # TODO: write parser function that automatically understands format
42
  data = read_file(uploaded_file)
@@ -47,15 +41,13 @@ def write(session, uploaded_file):
47
  language = st.selectbox("Select language", [i.name for i in Languages])
48
  with st.beta_expander("Description"):
49
  st.markdown(
50
- f"Select a language of text amongst those supported: {', '.join([f'`{i.name}`' for i in Languages])}"
51
  )
52
  with col2:
53
  cols_options = [""] + data.columns.tolist()
54
- label_column = st.selectbox(
55
- "Select label column name", cols_options, index=0
56
- )
57
  with st.beta_expander("Description"):
58
- st.markdown("Select the column containing the label")
59
 
60
  if label_column:
61
  plot = plot_labels_prop(data, label_column)
@@ -65,90 +57,103 @@ def write(session, uploaded_file):
65
  with col3:
66
  text_column = st.selectbox("Select text column name", cols_options, index=0)
67
  with st.beta_expander("Description"):
68
- st.markdown("Select the column containing the text")
69
 
70
  if text_column:
71
- st.altair_chart(
72
- plot_nchars(data, text_column), use_container_width=True
73
- )
74
 
 
75
  with st.beta_expander("Advanced options"):
76
- # Lemmatization option
 
 
 
77
  col1, col2 = st.beta_columns([1, 3])
78
  with col1:
79
- lemmatization_when_elem = st.empty()
80
  with col2:
81
- st.markdown("Choose lemmatization option")
82
 
83
- # stopwords option
84
  col1, col2 = st.beta_columns([1, 3])
85
  with col1:
86
- remove_stopwords_elem = st.empty()
87
  with col2:
88
- st.markdown("Choose stopword option")
89
 
90
- # cleaning steps
 
91
  col1, col2 = st.beta_columns([1, 3])
92
  with col1:
93
- cleaning_steps_elem = st.empty()
94
- reset_button = st.empty()
 
 
 
 
 
95
  with col2:
96
- st.markdown("Choose cleaning steps")
 
 
97
 
98
  # implement reset logic
99
  if reset_button.button("Reset steps"):
100
  session.run_id += 1
101
 
102
- steps_options = list(TextPreprocessor._cleaning_options().keys())
103
- cleaning_steps = cleaning_steps_elem.multiselect(
104
- "Select text processing steps (ordered)",
105
  options=steps_options,
106
- default=steps_options,
107
  format_func=lambda x: x.replace("_", " ").title(),
108
  key=session.run_id,
109
  )
110
- lemmatization_options = list(
111
- TextPreprocessor._lemmatization_options().keys()
112
- )
113
- lemmatization_when = lemmatization_when_elem.selectbox(
114
- "Select when lemmatization happens",
115
- options=lemmatization_options,
116
- index=0,
117
  key=session.run_id,
118
  )
119
  remove_stopwords = remove_stopwords_elem.checkbox(
120
- "Remove stopwords", value=True, key=session.run_id
 
 
 
 
 
 
 
121
  )
122
 
123
- # Show sample checkbox
124
  col1, col2 = st.beta_columns([1, 2])
125
  with col1:
126
  show_sample = st.checkbox("Show sample of preprocessed text")
127
 
128
  # initialize text preprocessor
129
- preprocessor = TextPreprocessor(
130
- language=language,
131
- cleaning_steps=cleaning_steps,
132
- lemmatizer_when=lemmatization_when,
133
- remove_stop=remove_stopwords,
 
 
 
134
  )
135
 
136
- # 3. PROVIDE FEEDBACK ON OPTIONS
137
  if show_sample and not (label_column and text_column):
138
  st.warning("Please select `label` and `text` columns")
139
 
140
  elif show_sample and (label_column and text_column):
141
- sample_data = data.sample(10)
142
- sample_data[f"preprocessed_{text_column}"] = preprocessor.fit_transform(
143
  sample_data[text_column]
144
  ).values
145
- st.table(
146
- sample_data.loc[
147
- :, [label_column, text_column, f"preprocessed_{text_column}"]
148
- ]
149
- )
150
 
151
- # 4. RUN
152
  run_button = st.button("Wordify!")
153
  if run_button and not (label_column and text_column):
154
  st.warning("Please select `label` and `text` columns")
@@ -157,7 +162,7 @@ def write(session, uploaded_file):
157
 
158
  with st.spinner("Process started"):
159
  # data = data.head()
160
- data[f"preprocessed_{text_column}"] = preprocessor.fit_transform(
161
  data[text_column]
162
  ).values
163
 
@@ -168,7 +173,7 @@ def write(session, uploaded_file):
168
  # session.posdf, session.negdf = process(data, text_column, label_column)
169
  session.process = True
170
 
171
- # 5. RESULTS
172
  if session.process and (label_column and text_column):
173
  st.markdown("")
174
  st.markdown("")
@@ -178,9 +183,7 @@ def write(session, uploaded_file):
178
  col1, col2, col3 = st.beta_columns([2, 3, 3])
179
 
180
  with col1:
181
- label = st.selectbox(
182
- "Select label", data[label_column].unique().tolist()
183
- )
184
  # # with col2:
185
  # thres = st.slider(
186
  # "Select threshold",
 
1
  from src.configs import Languages
2
+ from src.utils import read_file, download_button
3
+ from src.plotting import plot_labels_prop, plot_nchars, plot_score
4
+ from src.preprocessing import Lemmatizer, PreprocessingPipeline, encode
 
 
 
 
 
 
5
  from src.wordifier import wordifier
6
  import streamlit as st
7
 
 
30
 
31
  elif uploaded_file:
32
 
33
+ # ==== 1. READ FILE ==== #
34
  with st.spinner("Reading file"):
35
  # TODO: write parser function that automatically understands format
36
  data = read_file(uploaded_file)
 
41
  language = st.selectbox("Select language", [i.name for i in Languages])
42
  with st.beta_expander("Description"):
43
  st.markdown(
44
+ f"Select a language amongst those supported: {', '.join([f'`{i.name}`' for i in Languages])}. This will be used to lemmatize and remove stopwords."
45
  )
46
  with col2:
47
  cols_options = [""] + data.columns.tolist()
48
+ label_column = st.selectbox("Select label column name", cols_options, index=0)
 
 
49
  with st.beta_expander("Description"):
50
+ st.markdown("Select the column containing the labels.")
51
 
52
  if label_column:
53
  plot = plot_labels_prop(data, label_column)
 
57
  with col3:
58
  text_column = st.selectbox("Select text column name", cols_options, index=0)
59
  with st.beta_expander("Description"):
60
+ st.markdown("Select the column containing the texts.")
61
 
62
  if text_column:
63
+ st.altair_chart(plot_nchars(data, text_column), use_container_width=True)
 
 
64
 
65
+ # ==== 2.1 CREATE UI FOR ADVANCED OPTIONS ==== #
66
  with st.beta_expander("Advanced options"):
67
+
68
+ steps_options = list(PreprocessingPipeline.pipeline_components().keys())
69
+
70
+ # stopwords option and
71
  col1, col2 = st.beta_columns([1, 3])
72
  with col1:
73
+ st.markdown("Remove stopwords (uses Spacy vocabulary)")
74
  with col2:
75
+ remove_stopwords_elem = st.empty()
76
 
77
+ # lemmatization option
78
  col1, col2 = st.beta_columns([1, 3])
79
  with col1:
80
+ st.markdown("Lemmatizes text (uses Spacy)")
81
  with col2:
82
+ lemmatization_elem = st.empty()
83
 
84
+ # pre-lemmatization cleaning steps and
85
+ # post-lemmatization cleaning steps
86
  col1, col2 = st.beta_columns([1, 3])
87
  with col1:
88
+ st.markdown(
89
+ f"""
90
+ Define a pipeline of cleaning steps that is applied before and/or after lemmatization.
91
+ The available cleaning steps are:\n
92
+ {", ".join([f"`{x.replace('_', ' ').title()}`" for x in steps_options])}
93
+ """
94
+ )
95
  with col2:
96
+ pre_steps_elem = st.empty()
97
+ post_steps_elem = st.empty()
98
+ reset_button = st.empty()
99
 
100
  # implement reset logic
101
  if reset_button.button("Reset steps"):
102
  session.run_id += 1
103
 
104
+ pre_steps = pre_steps_elem.multiselect(
105
+ "Select pre-lemmatization preprocessing steps (ordered)",
 
106
  options=steps_options,
107
+ default=steps_options[1:],
108
  format_func=lambda x: x.replace("_", " ").title(),
109
  key=session.run_id,
110
  )
111
+ post_steps = post_steps_elem.multiselect(
112
+ "Select post-lemmatization processing steps (ordered)",
113
+ options=steps_options,
114
+ default=steps_options[-4:],
115
+ format_func=lambda x: x.replace("_", " ").title(),
 
 
116
  key=session.run_id,
117
  )
118
  remove_stopwords = remove_stopwords_elem.checkbox(
119
+ "Remove stopwords",
120
+ value=True,
121
+ key=session.run_id,
122
+ )
123
+ lemmatization = lemmatization_elem.checkbox(
124
+ "Lemmatize text",
125
+ value=True,
126
+ key=session.run_id,
127
  )
128
 
129
+ # show sample checkbox
130
  col1, col2 = st.beta_columns([1, 2])
131
  with col1:
132
  show_sample = st.checkbox("Show sample of preprocessed text")
133
 
134
  # initialize text preprocessor
135
+ preprocessing_pipeline = PreprocessingPipeline(
136
+ pre_steps=pre_steps,
137
+ lemmatizer=Lemmatizer(
138
+ language=language,
139
+ remove_stop=remove_stopwords,
140
+ lemmatization=lemmatization,
141
+ ),
142
+ post_steps=post_steps,
143
  )
144
 
145
+ # ==== 3. PROVIDE FEEDBACK ON OPTIONS ==== #
146
  if show_sample and not (label_column and text_column):
147
  st.warning("Please select `label` and `text` columns")
148
 
149
  elif show_sample and (label_column and text_column):
150
+ sample_data = data.sample(5)
151
+ sample_data[f"preprocessed_{text_column}"] = preprocessing_pipeline(
152
  sample_data[text_column]
153
  ).values
154
+ st.table(sample_data.loc[:, [label_column, text_column, f"preprocessed_{text_column}"]])
 
 
 
 
155
 
156
+ # ==== 4. RUN ==== #
157
  run_button = st.button("Wordify!")
158
  if run_button and not (label_column and text_column):
159
  st.warning("Please select `label` and `text` columns")
 
162
 
163
  with st.spinner("Process started"):
164
  # data = data.head()
165
+ data[f"preprocessed_{text_column}"] = preprocessing_pipeline(
166
  data[text_column]
167
  ).values
168
 
 
173
  # session.posdf, session.negdf = process(data, text_column, label_column)
174
  session.process = True
175
 
176
+ # ==== 5. RESULTS ==== #
177
  if session.process and (label_column and text_column):
178
  st.markdown("")
179
  st.markdown("")
 
183
  col1, col2, col3 = st.beta_columns([2, 3, 3])
184
 
185
  with col1:
186
+ label = st.selectbox("Select label", data[label_column].unique().tolist())
 
 
187
  # # with col2:
188
  # thres = st.slider(
189
  # "Select threshold",
src/preprocessing.py CHANGED
@@ -1,7 +1,7 @@
1
  import re
2
  import string
3
  from collections import OrderedDict
4
- from typing import Callable, Dict, List
5
 
6
  import numpy as np
7
  import pandas as pd
@@ -86,75 +86,102 @@ def normalize_repeating_words(t):
86
 
87
  return _re_wrep.sub(_replace_wrep, t)
88
 
 
89
  # fmt: on
90
- class TextPreprocessor:
91
- def __init__(
92
- self,
93
- language: str,
94
- cleaning_steps: List[str],
95
- lemmatizer_when: str = "last",
96
- remove_stop: bool = True,
97
- ) -> None:
98
-
99
- # prepare lemmatizer
100
  self.language = language
101
  self.nlp = spacy.load(
102
  Languages[language].value, exclude=["parser", "ner", "pos", "tok2vec"]
103
  )
104
- self.lemmatizer_when = self._lemmatization_options().get(lemmatizer_when, None)
105
- self.remove_stop = remove_stop
106
- self._lemmatize = self._get_lemmatizer()
107
-
108
- # prepare cleaning
109
- self.cleaning_steps = [
110
- self._cleaning_options()[step]
111
- for step in cleaning_steps
112
- if step in self._cleaning_options()
113
- ]
114
- self.cleaning_pipeline = (
115
- make_pipeline(*self.cleaning_steps) if self.cleaning_steps else lambda x: x
116
- )
117
 
118
- def _get_lemmatizer(self) -> Callable:
119
  """Return the correct spacy Doc-level lemmatizer"""
120
- if self.remove_stop:
121
 
122
- def lemmatizer(doc: spacy.tokens.doc.Doc) -> str:
123
- """Lemmatizes spacy Doc and removes stopwords"""
124
- return " ".join(
125
- [t.lemma_ for t in doc if t.lemma_ != "-PRON-" and not t.is_stop]
126
- )
127
 
128
- else:
 
 
 
 
 
129
 
130
- def lemmatizer(doc: spacy.tokens.doc.Doc) -> str:
131
- """Lemmatizes spacy Doc"""
132
  return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-"])
133
 
134
- return lemmatizer
 
 
135
 
136
- @staticmethod
137
- def _lemmatization_options() -> Dict[str, str]:
138
- return {
139
- "Before preprocessing": "first",
140
- "After preprocessing": "last",
141
- "Never! Let's do it quick and dirty": None,
142
- }
143
-
144
- def lemmatizer(self, series: pd.Series) -> pd.Series:
145
  """
146
  Apply spacy pipeline to transform string to spacy Doc and applies lemmatization
147
  """
148
  res = []
149
- pbar = stqdm(total=len(series))
150
  for doc in self.nlp.pipe(series, batch_size=500):
151
- res.append(self._lemmatize(doc))
152
  pbar.update(1)
153
  pbar.close()
154
  return pd.Series(res)
155
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  @staticmethod
157
- def _cleaning_options():
158
  """Returns available cleaning steps in order"""
159
  return OrderedDict(
160
  [
@@ -184,19 +211,3 @@ class TextPreprocessor:
184
  ("strip", lambda x: x.strip()),
185
  ]
186
  )
187
-
188
- def fit_transform(self, series: pd.Series) -> Series:
189
- """Applies text preprocessing"""
190
-
191
- if self.lemmatizer_when == "first":
192
- with st.spinner("Lemmatizing"):
193
- series = self.lemmatizer(series)
194
-
195
- with st.spinner("Cleaning"):
196
- series = series.progress_map(self.cleaning_pipeline)
197
-
198
- if self.lemmatizer_when == "last":
199
- with st.spinner("Lemmatizing"):
200
- series = self.lemmatizer(series)
201
-
202
- return series
 
1
  import re
2
  import string
3
  from collections import OrderedDict
4
+ from typing import Callable, List, Optional, Tuple
5
 
6
  import numpy as np
7
  import pandas as pd
 
86
 
87
  return _re_wrep.sub(_replace_wrep, t)
88
 
89
+
90
  # fmt: on
91
class Lemmatizer:
    """Creates lemmatizer based on spacy.

    Wraps a spaCy pipeline and turns every input text into a single
    space-joined string of tokens, optionally lemmatized and/or with
    stopwords removed.
    """

    def __init__(self, language: str, remove_stop: bool = True, lemmatization: bool = True) -> None:
        """Load the spaCy model for ``language`` and pick the Doc-level transformation.

        Args:
            language: Name of a member of ``Languages``; the member's value is
                passed to ``spacy.load``.
            remove_stop: Drop tokens whose ``is_stop`` flag is set.
            lemmatization: Replace every token by its lemma.
        """
        self.language = language
        self.nlp = spacy.load(
            Languages[language].value, exclude=["parser", "ner", "pos", "tok2vec"]
        )
        # ``None`` when neither lemmatization nor stopword removal is requested.
        self._lemmatizer_fn = self._get_lemmatization_fn(remove_stop, lemmatization)
        self.lemmatization = lemmatization

    def _get_lemmatization_fn(self, remove_stop: bool, lemmatization: bool) -> Optional[Callable]:
        """Return the correct spacy Doc-level lemmatizer.

        Args:
            remove_stop: Whether stopwords are filtered out.
            lemmatization: Whether tokens are replaced by their lemma.

        Returns:
            A function mapping a spaCy ``Doc`` to a string, or ``None`` when
            both options are disabled (there is nothing to do).
        """
        if remove_stop and lemmatization:

            def lemmatizer_fn(doc: "spacy.tokens.doc.Doc") -> str:
                # Skip tokens whose lemma is the "-PRON-" placeholder and stopwords.
                return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-" and not t.is_stop])

        elif remove_stop:

            def lemmatizer_fn(doc: "spacy.tokens.doc.Doc") -> str:
                # ``str.join`` only accepts strings: use the token text, not
                # the Token object itself (which raises TypeError).
                return " ".join([t.text for t in doc if not t.is_stop])

        elif lemmatization:

            def lemmatizer_fn(doc: "spacy.tokens.doc.Doc") -> str:
                return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-"])

        else:
            # NOTE(review): ``status`` is only ever assigned in this branch;
            # kept for backward compatibility - confirm whether anything reads it.
            self.status = False
            return None

        return lemmatizer_fn

    def __call__(self, series: Series) -> Series:
        """
        Apply spacy pipeline to transform string to spacy Doc and applies lemmatization

        Args:
            series: The texts to process.

        Returns:
            A new ``pd.Series`` (fresh default index) with one processed
            string per input text. When neither lemmatization nor stopword
            removal is enabled the texts are returned unchanged.
        """
        if self._lemmatizer_fn is None:
            # Nothing to apply: avoid calling ``None`` and skip the spaCy pass.
            return pd.Series(list(series))

        res = []
        pbar = stqdm(total=len(series), desc="Lemmatizing")
        try:
            for doc in self.nlp.pipe(series, batch_size=500):
                res.append(self._lemmatizer_fn(doc))
                pbar.update(1)
        finally:
            # Always release the progress bar, even if processing fails midway.
            pbar.close()
        return pd.Series(res)
136
 
137
+
138
+ class PreprocessingPipeline:
139
def __init__(self, pre_steps: List[str], lemmatizer: Lemmatizer, post_steps: List[str]):
    """Build the three pipeline stages and store them on the instance.

    Args:
        pre_steps: Names of the cleaning steps to run before lemmatization.
        lemmatizer: The lemmatizer stage.
        post_steps: Names of the cleaning steps to run after lemmatization.
    """
    # ``make_pipeline`` returns the stages in execution order.
    pre_stage, lemma_stage, post_stage = self.make_pipeline(pre_steps, lemmatizer, post_steps)

    self.pre_pipeline = pre_stage
    self.lemmatizer = lemma_stage
    self.post_pipeline = post_stage
145
+
146
def __call__(self, series: Series) -> Series:
    """Apply pre-cleaning, lemmatization and post-cleaning in sequence.

    Each stage consumes the output of the previous one, so the returned
    series reflects all three stages rather than only the last.

    Args:
        series: The raw texts.

    Returns:
        The fully preprocessed texts.
    """
    # NOTE(review): ``progress_map`` is presumably tqdm's pandas integration,
    # registered elsewhere in the project - confirm.
    with st.spinner("Pre-lemmatization cleaning"):
        res = series.progress_map(self.pre_pipeline)

    with st.spinner("Lemmatizing"):
        # Feed the pre-cleaned texts (not the raw input) to the lemmatizer.
        res = self.lemmatizer(res)

    with st.spinner("Post-lemmatization cleaning"):
        # Post-clean the lemmatized texts (not the raw input).
        res = res.progress_map(self.post_pipeline)

    return res
157
+
158
def make_pipeline(
    self, pre_steps: List[str], lemmatizer: "Lemmatizer", post_steps: List[str]
) -> Tuple[Callable, Callable, Callable]:
    """Turn step names and a lemmatizer into three callables.

    Args:
        pre_steps: Names of cleaning steps applied before lemmatization.
            Names not present in ``pipeline_components()`` are ignored.
        lemmatizer: Lemmatizer stage; replaced by an identity function when
            its ``lemmatization`` attribute is falsy.
        post_steps: Names of cleaning steps applied after lemmatization.
            Names not present in ``pipeline_components()`` are ignored.

    Returns:
        ``(pre_pipeline, lemmatizer, post_pipeline)``. A stage with no
        (known) steps is an identity function.
    """

    def _compose(step_names: List[str]) -> Callable:
        # Resolve step names to functions and chain them into one callable.
        if not step_names:
            return lambda x: x
        # Build the component mapping once instead of once per lookup.
        available = self.pipeline_components()
        funcs = [available[name] for name in step_names if name in available]
        # The bare name ``make_pipeline`` resolves to the module-level
        # function of that name (imported elsewhere in the file), not to
        # this method.
        return make_pipeline(*funcs) if funcs else (lambda x: x)

    # pre-lemmatization steps
    pre_pipeline = _compose(pre_steps)

    # lemmatization
    # NOTE(review): this gate only looks at ``lemmatization``; a lemmatizer
    # configured for stopword removal without lemmatization is bypassed
    # entirely here - confirm whether that is intended.
    lemmatizer_stage = lemmatizer if lemmatizer.lemmatization else (lambda x: x)

    # post lemmatization steps
    post_pipeline = _compose(post_steps)

    return pre_pipeline, lemmatizer_stage, post_pipeline
182
+
183
  @staticmethod
184
+ def pipeline_components() -> "OrderedDict[str, Callable]":
185
  """Returns available cleaning steps in order"""
186
  return OrderedDict(
187
  [
 
211
  ("strip", lambda x: x.strip()),
212
  ]
213
  )