Pietro Lesci committed
Commit dc4ad9e
1 Parent(s): 13c6837

formatting

Files changed (4)
  1. Makefile +9 -1
  2. src/components.py +46 -14
  3. src/preprocessing.py +0 -26
  4. src/utils.py +0 -80
Makefile CHANGED
@@ -1,6 +1,7 @@
 # Docker image build info
 PROJECT:=wordify
 BUILD_TAG?=v2.0
+sources = src

 ########################################################
 ## Local development
@@ -21,4 +22,11 @@ run:
 	docker run -d --name $(PROJECT)-${BUILD_TAG}-container -it --rm -p 4321:8501 $(PROJECT):${BUILD_TAG}

 stop:
-	docker stop $(PROJECT)-${BUILD_TAG}-container
+	docker stop $(PROJECT)-${BUILD_TAG}-container
+
+format:
+	isort $(sources)
+	black $(sources)
+
+lint:
+	flake8 $(sources)
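
The new format target runs isort before black so that black gets the final pass over layout; lint only checks. On machines without make, the same sequence can be mirrored with a short Python script (a sketch, assuming isort, black, and flake8 are installed; only the tool names and the src path come from the Makefile above, everything else is illustrative):

# mirror_make.py - hypothetical helper mirroring the Makefile's format/lint targets.
import subprocess
import sys

SOURCES = "src"  # the directory the Makefile's `sources` variable points at


def run(*cmd: str) -> int:
    # Echo the command, run it, and hand back its exit code.
    print("->", " ".join(cmd))
    return subprocess.call(cmd)


def main() -> None:
    if "lint" in sys.argv[1:]:
        # Equivalent of `make lint`: flake8 reports problems but never rewrites files.
        sys.exit(run("flake8", SOURCES))
    # Equivalent of `make format`: import order first, then code layout,
    # so black gets the last word on line wrapping.
    run("isort", SOURCES)
    run("black", SOURCES)


if __name__ == "__main__":
    main()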
src/components.py CHANGED
@@ -65,12 +65,16 @@ def form(df):
     pre_steps = st.multiselect(
         "Select pre-lemmatization processing steps (ordered)",
         options=steps_options,
-        default=[steps_options[i] for i in PreprocessingConfigs.DEFAULT_PRE.value],
+        default=[
+            steps_options[i] for i in PreprocessingConfigs.DEFAULT_PRE.value
+        ],
         format_func=lambda x: x.replace("_", " ").title(),
         help="Select the processing steps to apply before the text is lemmatized",
     )

-    lammatization_options = list(PreprocessingPipeline.lemmatization_component().keys())
+    lammatization_options = list(
+        PreprocessingPipeline.lemmatization_component().keys()
+    )
     lemmatization_step = st.selectbox(
         "Select lemmatization",
         options=lammatization_options,
@@ -81,7 +85,10 @@ def form(df):
     post_steps = st.multiselect(
         "Select post-lemmatization processing steps (ordered)",
         options=steps_options,
-        default=[steps_options[i] for i in PreprocessingConfigs.DEFAULT_POST.value],
+        default=[
+            steps_options[i]
+            for i in PreprocessingConfigs.DEFAULT_POST.value
+        ],
         format_func=lambda x: x.replace("_", " ").title(),
         help="Select the processing steps to apply after the text is lemmatized",
     )
@@ -93,21 +100,31 @@ def form(df):
         start_time = time.time()

         # warnings about inputs
-        language_specific_warnings(pre_steps, post_steps, lemmatization_step, language)
+        language_specific_warnings(
+            pre_steps, post_steps, lemmatization_step, language
+        )

         # preprocess
         if not disable_preprocessing:
             with st.spinner("Step 1/4: Preprocessing text"):
-                pipe = PreprocessingPipeline(language, pre_steps, lemmatization_step, post_steps)
+                pipe = PreprocessingPipeline(
+                    language, pre_steps, lemmatization_step, post_steps
+                )
                 df = pipe.vaex_process(df, text_column)
         else:
-            with st.spinner("Step 1/4: Preprocessing has been disabled - doing nothing"):
-                df = df.rename(columns={text_column: ColumnNames.PROCESSED_TEXT.value})
+            with st.spinner(
+                "Step 1/4: Preprocessing has been disabled - doing nothing"
+            ):
+                df = df.rename(
+                    columns={text_column: ColumnNames.PROCESSED_TEXT.value}
+                )
                 time.sleep(1.2)

         # prepare input
         with st.spinner("Step 2/4: Preparing inputs"):
-            input_dict = input_transform(df[ColumnNames.PROCESSED_TEXT.value], df[label_column])
+            input_dict = input_transform(
+                df[ColumnNames.PROCESSED_TEXT.value], df[label_column]
+            )

         # wordify
         with st.spinner("Step 3/4: Wordifying"):
@@ -217,7 +234,13 @@ def how_it_works():
                 "Wine light cherry",
                 "Chardonnay wine oak buttery",
             ],
-            "Label": ["Italy", "United States", "United States", "Italy", "United States"],
+            "Label": [
+                "Italy",
+                "United States",
+                "United States",
+                "Italy",
+                "United States",
+            ],
         }
     )
@@ -268,7 +291,9 @@
         vectors of coefficients reported in table 3 (indicators that are not present in a run are listed as 0 here):
         """
     )
-    st.caption("Table 3: Coefficients for frequency of indicators in each of the four runs for US wines.")
+    st.caption(
+        "Table 3: Coefficients for frequency of indicators in each of the four runs for US wines."
+    )
     st.table(table3)

     st.markdown(
@@ -278,7 +303,9 @@ def how_it_works():
         that are positively and negatively correlated with the US wines.
         """
    )
-    st.caption("Table 4: Final set of indicators that are positively versus negatively correlated with US wines.")
+    st.caption(
+        "Table 4: Final set of indicators that are positively versus negatively correlated with US wines."
+    )
     st.table(table4)
     st.markdown(
         """
@@ -459,11 +486,15 @@ def analysis(outputs):
     )

     with st.expander("Vocabulary"):
-        st.markdown("The table below shows all candidate n-grams that Wordify considered")
+        st.markdown(
+            "The table below shows all candidate n-grams that Wordify considered"
+        )
         st.write(meta_data["vocabulary"])

     with st.expander("Labels"):
-        st.markdown("The table below summarizes the labels that your file contained")
+        st.markdown(
+            "The table below summarizes the labels that your file contained"
+        )
         st.write(meta_data["labels"])

     return subset_df
@@ -493,5 +524,6 @@ def language_specific_warnings(pre_steps, post_steps, lemmatization_step, language):
         "Chinese",
     ):
         st.info(
-            msg + " However we will still remove stopwords since you selected `Spacy lemmatizer (remove stopwords)`."
+            msg
+            + " However we will still remove stopwords since you selected `Spacy lemmatizer (remove stopwords)`."
         )
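
Every hunk above is a pure line-wrapping change of the kind isort and black produce; nothing alters behavior. That is mechanically checkable, since the wrapped and unwrapped spellings parse to the same syntax tree. A standalone check built around one line from this diff:

# ast_check.py - wrapped and unwrapped forms of one reformatted line parse identically.
import ast

before = (
    "input_dict = input_transform("
    "df[ColumnNames.PROCESSED_TEXT.value], df[label_column])"
)
after = """\
input_dict = input_transform(
    df[ColumnNames.PROCESSED_TEXT.value], df[label_column]
)
"""

# ast.dump omits line and column offsets by default, so equality here
# means the reformat cannot have changed what the code does.
assert ast.dump(ast.parse(before)) == ast.dump(ast.parse(after))
print("identical ASTs")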
src/preprocessing.py CHANGED
@@ -1,5 +1,3 @@
-import multiprocessing as mp
-import os
 import re
 import string
 from collections import OrderedDict
@@ -8,7 +6,6 @@ from typing import Callable, List, Optional, Union
 import spacy
 import vaex
 from pandas.core.frame import DataFrame
-from pandas.core.series import Series
 from textacy.preprocessing import make_pipeline, normalize, remove, replace

 from .configs import Languages
@@ -119,29 +116,6 @@ class PreprocessingPipeline:

         return df

-    # def __call__(self, series: Series) -> Series:
-    #     if self.pre:
-    #         series = series.map(self.pre)
-
-    #     if self.lemma:
-    #         total_steps = len(series) // 100
-    #         res = []
-    #         pbar = st.progress(0)
-    #         for i, doc in enumerate(
-    #             self.nlp.pipe(series, batch_size=500, n_process=os.cpu_count())
-    #         ):
-    #             res.append(self.lemma(doc))
-
-    #             if i % total_steps == 0:
-    #                 pbar.progress(1)
-
-    #         series = pd.Series(res)
-
-    #     if self.post:
-    #         series = series.map(self.post)
-
-    #     return series
-
     @classmethod
     def make_pipe_component(cls, steps: Optional[List[str]], language: str) -> Callable:
         if not steps:
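
The deleted block was a commented-out, pandas-based __call__ that the vaex_process method (kept in the context above) appears to have superseded. For reference, the spaCy batching idiom it relied on, in isolation (a sketch: the model name is illustrative and must be installed; the batch_size and n_process arguments mirror the deleted code):

# pipe_sketch.py - the spaCy batching idiom the removed __call__ relied on.
import os

import spacy

# Any installed spaCy pipeline works here; only tagging/lemmas are needed.
nlp = spacy.load("en_core_web_sm", exclude=["parser", "ner"])

texts = ["Wine light cherry", "Chardonnay wine oak buttery"]

# nlp.pipe streams texts through the model in batches and can fan out
# over processes, which is where the deleted code got its throughput.
lemmas = [
    " ".join(token.lemma_ for token in doc)
    for doc in nlp.pipe(texts, batch_size=500, n_process=os.cpu_count() or 1)
]
print(lemmas)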
src/utils.py CHANGED
@@ -7,8 +7,6 @@ from PIL import Image

 from .configs import ColumnNames, SupportedFiles

-# import altair as alt
-

 def get_col_indices(cols: List) -> Tuple[int, int]:
     """Ugly but works"""
@@ -52,81 +50,3 @@ def download_button(dataframe: DataFrame, name: str) -> None:
     b64 = base64.b64encode(csv.encode()).decode()
     href = f'<a href="data:file/csv;base64,{b64}" download="{name}.csv">Download</a>'
     st.write(href, unsafe_allow_html=True)
-
-
-# def plot_labels_prop(data: DataFrame, label_column: str):
-
-#     unique_value_limit = 100
-
-#     if data[label_column].nunique() > unique_value_limit:
-
-#         st.warning(
-#             f"""
-#             The column you selected has more than {unique_value_limit}.
-#             Are you sure it's the right column? If it is, please note that
-#             this will impact __Wordify__ performance.
-#             """
-#         )
-
-#         return
-
-#     source = (
-#         data[label_column]
-#         .value_counts()
-#         .reset_index()
-#         .rename(columns={"index": "Labels", label_column: "Counts"})
-#     )
-#     source["Props"] = source["Counts"] / source["Counts"].sum()
-#     source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"
-
-#     bars = (
-#         alt.Chart(source)
-#         .mark_bar()
-#         .encode(
-#             x=alt.X("Labels:O", sort="-y"),
-#             y="Counts:Q",
-#         )
-#     )
-
-#     text = bars.mark_text(align="center", baseline="middle", dy=15).encode(
-#         text="Proportions:O"
-#     )
-
-#     return (bars + text).properties(height=300)
-
-
-# def plot_nchars(data: DataFrame, text_column: str):
-#     source = data[text_column].str.len().to_frame()
-
-#     plot = (
-#         alt.Chart(source)
-#         .mark_bar()
-#         .encode(
-#             alt.X(
-#                 f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")
-#             ),
-#             alt.Y("count()", axis=alt.Axis(title="")),
-#         )
-#     )
-
-#     return plot.properties(height=300)
-
-
-# def plot_score(data: DataFrame, label_col: str, label: str):
-
-#     source = (
-#         data.loc[data[label_col] == label]
-#         .sort_values("score", ascending=False)
-#         .head(100)
-#     )
-
-#     plot = (
-#         alt.Chart(source)
-#         .mark_bar()
-#         .encode(
-#             y=alt.Y("word:O", sort="-x"),
-#             x="score:Q",
-#         )
-#     )
-
-#     return plot.properties(height=max(30 * source.shape[0], 50))
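
Everything removed here is commented-out Altair plotting code; the live download_button helper shown in the surrounding context is untouched. Its data-URI trick, reproduced as a self-contained sketch (the example frame and file name are made up):

# download_link_sketch.py - the base64 data-URI pattern download_button uses.
import base64

import pandas as pd


def csv_download_link(dataframe: pd.DataFrame, name: str) -> str:
    # Embed the CSV directly in the href so no file is written server-side.
    csv = dataframe.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()
    return f'<a href="data:file/csv;base64,{b64}" download="{name}.csv">Download</a>'


df = pd.DataFrame({"word": ["oak", "cherry"], "score": [0.9, 0.7]})
print(csv_download_link(df, "wordify_results"))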