Pietro Lesci committed on
Commit a97ba6f
1 Parent(s): bd07b6e

enhance UI (non-functional)

Files changed (3)
  1. src/components.py +42 -22
  2. src/configs.py +5 -0
  3. src/utils.py +21 -18
src/components.py CHANGED
@@ -3,6 +3,7 @@ import streamlit as st
from src.configs import Languages, PreprocessingConfigs, SupportedFiles
from src.preprocessing import PreprocessingPipeline
from src.wordifier import input_transform, output_transform, wordifier
+ from src.utils import get_col_indices


def form(df):
@@ -11,16 +12,18 @@ def form(df):
    with col1:

        cols = [""] + df.columns.tolist()
+         text_index, label_index = get_col_indices(cols)
+
        label_column = st.selectbox(
            "Select label column",
            cols,
-             index=0,
+             index=label_index,
            help="Select the column containing the labels",
        )
        text_column = st.selectbox(
            "Select text column",
            cols,
-             index=0,
+             index=text_index,
            help="Select the column containing the text",
        )
        language = st.selectbox(
@@ -37,16 +40,12 @@ def form(df):
        pre_steps = st.multiselect(
            "Select pre-lemmatization processing steps (ordered)",
            options=steps_options,
-             default=[
-                 steps_options[i] for i in PreprocessingConfigs.DEFAULT_PRE.value
-             ],
+             default=[steps_options[i] for i in PreprocessingConfigs.DEFAULT_PRE.value],
            format_func=lambda x: x.replace("_", " ").title(),
            help="Select the processing steps to apply before the text is lemmatized",
        )

-         lammatization_options = list(
-             PreprocessingPipeline.lemmatization_component().keys()
-         )
+         lammatization_options = list(PreprocessingPipeline.lemmatization_component().keys())
        lemmatization_step = st.selectbox(
            "Select lemmatization",
            options=lammatization_options,
@@ -57,9 +56,7 @@
        post_steps = st.multiselect(
            "Select post-lemmatization processing steps (ordered)",
            options=steps_options,
-             default=[
-                 steps_options[i] for i in PreprocessingConfigs.DEFAULT_POST.value
-             ],
+             default=[steps_options[i] for i in PreprocessingConfigs.DEFAULT_POST.value],
            format_func=lambda x: x.replace("_", " ").title(),
            help="Select the processing steps to apply after the text is lemmatized",
        )
@@ -70,9 +67,7 @@ def form(df):

    # preprocess
    with st.spinner("Step 1/4: Preprocessing text"):
-         pipe = PreprocessingPipeline(
-             language, pre_steps, lemmatization_step, post_steps
-         )
+         pipe = PreprocessingPipeline(language, pre_steps, lemmatization_step, post_steps)
        df = pipe.vaex_process(df, text_column)

    # prepare input
@@ -87,14 +82,6 @@ def form(df):
    with st.spinner("Step 4/4: Preparing outputs"):
        new_df = output_transform(pos, neg)

-     # col1, col2, col3 = st.columns(3)
-     # with col1:
-     #     st.metric("Total number of words processed", 3, delta_color="normal")
-     # with col2:
-     #     st.metric("Texts processed", 3, delta_color="normal")
-     # with col3:
-     #     st.metric("Texts processed", 3, delta_color="normal")
-
    return new_df


@@ -124,6 +111,15 @@ def faq():
            """
        )

+     with st.expander("Do I need to preprocess my data?"):
+         st.markdown(
+             """
+             No, there is no need to preprocess your text; we will take care of it.
+             However, if you wish to do it yourself, turn off preprocessing in the `Advanced
+             Settings` in the interactive UI.
+             """
+         )
+
    with st.expander("What languages are supported?"):
        st.markdown(
            f"""
@@ -202,6 +198,19 @@ def presentation():
        """
    )

+     st.subheader("Quickstart")
+     st.markdown(
+         """
+         - There is no need to preprocess your text; we will take care of it. However, if you wish to
+         do it yourself, turn off preprocessing in the `Advanced Settings` in the interactive UI.
+
+         - We expect a file with two columns: `label` with the labels and `text` with the texts (the names are case-insensitive). If
+         you provide a file following this naming convention, Wordify will automatically select the
+         correct columns. However, if you wish to use a different nomenclature, you will be asked to
+         provide the column names in the interactive UI.
+         """
+     )
+
    st.subheader("Input format")
    st.markdown(
        """
@@ -224,9 +233,20 @@ def presentation():
        - `Score`: the wordify score, between 0 and 1, of how important `Word` is to discriminate `Label`
        - `Label`: the label that `Word` is discriminating
        - `Correlation`: how `Word` is correlated with `Label` (e.g., "negative" means that if `Word` is present in the text then the label is less likely to be `Label`)
+
+         for example:
        """
    )

+     st.table(
+         {
+             "Word": ["good", "awful", "bad service", "etc"],
+             "Score": ["0.52", "0.49", "0.35", "etc"],
+             "Label": ["Good", "Bad", "Good", "etc"],
+             "Correlation": ["positive", "positive", "negative", "etc"],
+         }
+     )
+

def footer():
    st.sidebar.markdown(
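
Note: the key change to `form` above is the new `index=` wiring, which preselects the text and label columns instead of defaulting to the empty placeholder. A minimal sketch of the idea outside the Wordify codebase (the `cols` list and the two indices are made-up stand-ins for what `get_col_indices` would return):

import streamlit as st

# Stand-in for [""] + df.columns.tolist(); "" is the placeholder option.
cols = ["", "Text", "Label"]

# Made-up values standing in for get_col_indices(cols) when the file
# already uses the expected "text"/"label" names (case-insensitive).
text_index, label_index = 1, 2

# st.selectbox's `index` argument controls the preselected option, so users
# with conventionally named columns do not have to pick anything manually.
label_column = st.selectbox("Select label column", cols, index=label_index)
text_column = st.selectbox("Select text column", cols, index=text_index)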
src/configs.py CHANGED
@@ -3,6 +3,11 @@ from enum import Enum
import pandas as pd


+ class ColumnNames(Enum):
+     LABEL = "label"
+     TEXT = "text"
+
+
class ModelConfigs(Enum):
    NUM_ITERS = 500
    SELECTION_THRESHOLD = 0.0
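
The new `ColumnNames` enum gives the expected column names a single home, so the auto-detection helper in utils.py and the naming convention described in the new Quickstart section refer to the same strings. A small illustration of how an Enum member's `.value` is matched case-insensitively against user-provided columns (the `user_cols` list is a made-up example):

from enum import Enum


class ColumnNames(Enum):
    LABEL = "label"
    TEXT = "text"


# Hypothetical uploaded columns; matching happens on the lowercased names.
user_cols = ["ID", "Text", "LABEL"]
lowered = [c.lower() for c in user_cols]

print(ColumnNames.TEXT.value in lowered)   # True: "text" is found at index 1
print(ColumnNames.LABEL.value in lowered)  # True: "label" is found at index 2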
src/utils.py CHANGED
@@ -5,7 +5,23 @@ import pandas as pd
import streamlit as st
from PIL import Image

- from .configs import SupportedFiles
+ from .configs import SupportedFiles, ColumnNames
+
+
+ def get_col_indices(cols):
+     """Return (text_index, label_index), falling back to 0 when a column is missing. Ugly but works."""
+     cols = [i.lower() for i in cols]
+     try:
+         label_index = cols.index(ColumnNames.LABEL.value)
+     except ValueError:
+         label_index = 0
+
+     try:
+         text_index = cols.index(ColumnNames.TEXT.value)
+     except ValueError:
+         text_index = 0
+
+     return text_index, label_index


@st.cache
@@ -52,12 +68,7 @@ def plot_labels_prop(data: pd.DataFrame, label_column: str):

        return

-     source = (
-         data[label_column]
-         .value_counts()
-         .reset_index()
-         .rename(columns={"index": "Labels", label_column: "Counts"})
-     )
+     source = data[label_column].value_counts().reset_index().rename(columns={"index": "Labels", label_column: "Counts"})
    source["Props"] = source["Counts"] / source["Counts"].sum()
    source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"

@@ -70,9 +81,7 @@ def plot_labels_prop(data: pd.DataFrame, label_column: str):
        )
    )

-     text = bars.mark_text(align="center", baseline="middle", dy=15).encode(
-         text="Proportions:O"
-     )
+     text = bars.mark_text(align="center", baseline="middle", dy=15).encode(text="Proportions:O")

    return (bars + text).properties(height=300)

@@ -84,9 +93,7 @@ def plot_nchars(data: pd.DataFrame, text_column: str):
        alt.Chart(source)
        .mark_bar()
        .encode(
-             alt.X(
-                 f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")
-             ),
+             alt.X(f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")),
            alt.Y("count()", axis=alt.Axis(title="")),
        )
    )
@@ -96,11 +103,7 @@

def plot_score(data: pd.DataFrame, label_col: str, label: str):

-     source = (
-         data.loc[data[label_col] == label]
-         .sort_values("score", ascending=False)
-         .head(100)
-     )
+     source = data.loc[data[label_col] == label].sort_values("score", ascending=False).head(100)

    plot = (
        alt.Chart(source)
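
For reference, here is how the new helper behaves on a couple of hand-made inputs (assuming `get_col_indices` is importable from `src.utils` as added above); the fallback to index 0 lands on the empty placeholder entry that `form` prepends to the column list:

from src.utils import get_col_indices

# Columns follow the expected naming convention (matching is case-insensitive).
print(get_col_indices(["", "Text", "Label"]))    # (1, 2): text at 1, label at 2

# No recognizable names: both indices fall back to 0, i.e. the "" placeholder.
print(get_col_indices(["", "review", "stars"]))  # (0, 0)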