Spaces:

pietrolesci
/

wordify

Build error

App Files Files Community

Pietro Lesci committed on Dec 15, 2021

Commit

a97ba6f

•

1 Parent(s): bd07b6e

enhance UI (non-functional)

Browse files

Files changed (3) hide show

src/components.py +42 -22
src/configs.py +5 -0
src/utils.py +21 -18

src/components.py CHANGED Viewed

@@ -3,6 +3,7 @@ import streamlit as st
 from src.configs import Languages, PreprocessingConfigs, SupportedFiles
 from src.preprocessing import PreprocessingPipeline
 from src.wordifier import input_transform, output_transform, wordifier
 def form(df):
@@ -11,16 +12,18 @@ def form(df):
         with col1:
             cols = [""] + df.columns.tolist()
             label_column = st.selectbox(
                 "Select label column",
                 cols,
-                index=0,
                 help="Select the column containing the labels",
             )
             text_column = st.selectbox(
                 "Select text column",
                 cols,
-                index=0,
                 help="Select the column containing the text",
             )
             language = st.selectbox(
@@ -37,16 +40,12 @@ def form(df):
             pre_steps = st.multiselect(
                 "Select pre-lemmatization processing steps (ordered)",
                 options=steps_options,
-                default=[
-                    steps_options[i] for i in PreprocessingConfigs.DEFAULT_PRE.value
-                ],
                 format_func=lambda x: x.replace("_", " ").title(),
                 help="Select the processing steps to apply before the text is lemmatized",
             )
-            lammatization_options = list(
-                PreprocessingPipeline.lemmatization_component().keys()
-            )
             lemmatization_step = st.selectbox(
                 "Select lemmatization",
                 options=lammatization_options,
@@ -57,9 +56,7 @@ def form(df):
             post_steps = st.multiselect(
                 "Select post-lemmatization processing steps (ordered)",
                 options=steps_options,
-                default=[
-                    steps_options[i] for i in PreprocessingConfigs.DEFAULT_POST.value
-                ],
                 format_func=lambda x: x.replace("_", " ").title(),
                 help="Select the processing steps to apply after the text is lemmatized",
             )
@@ -70,9 +67,7 @@ def form(df):
             # preprocess
             with st.spinner("Step 1/4: Preprocessing text"):
-                pipe = PreprocessingPipeline(
-                    language, pre_steps, lemmatization_step, post_steps
-                )
                 df = pipe.vaex_process(df, text_column)
             # prepare input
@@ -87,14 +82,6 @@ def form(df):
             with st.spinner("Step 4/4: Preparing outputs"):
                 new_df = output_transform(pos, neg)
-            # col1, col2, col3 = st.columns(3)
-            # with col1:
-            #     st.metric("Total number of words processed", 3, delta_color="normal")
-            # with col2:
-            #     st.metric("Texts processed", 3, delta_color="normal")
-            # with col3:
-            #     st.metric("Texts processed", 3, delta_color="normal")
             return new_df
@@ -124,6 +111,15 @@ def faq():
             """
         )
     with st.expander("What languages are supported?"):
         st.markdown(
             f"""
@@ -202,6 +198,19 @@ def presentation():
         """
     )
     st.subheader("Input format")
     st.markdown(
         """
@@ -224,9 +233,20 @@ def presentation():
         - `Score`: the wordify score, between 0 and 1, of how important is `Word` to discrimitate `Label`
         - `Label`: the label that `Word` is discriminating
         - `Correlation`: how `Word` is correlated with `Label` (e.g., "negative" means that if `Word` is present in the text then the label is less likely to be `Label`)
         """
     )
 def footer():
     st.sidebar.markdown(

 from src.configs import Languages, PreprocessingConfigs, SupportedFiles
 from src.preprocessing import PreprocessingPipeline
 from src.wordifier import input_transform, output_transform, wordifier
+from src.utils import get_col_indices
 def form(df):
         with col1:
             cols = [""] + df.columns.tolist()
+            text_index, label_index = get_col_indices(cols)
             label_column = st.selectbox(
                 "Select label column",
                 cols,
+                index=label_index,
                 help="Select the column containing the labels",
             )
             text_column = st.selectbox(
                 "Select text column",
                 cols,
+                index=text_index,
                 help="Select the column containing the text",
             )
             language = st.selectbox(
             pre_steps = st.multiselect(
                 "Select pre-lemmatization processing steps (ordered)",
                 options=steps_options,
+                default=[steps_options[i] for i in PreprocessingConfigs.DEFAULT_PRE.value],
                 format_func=lambda x: x.replace("_", " ").title(),
                 help="Select the processing steps to apply before the text is lemmatized",
             )
+            lammatization_options = list(PreprocessingPipeline.lemmatization_component().keys())
             lemmatization_step = st.selectbox(
                 "Select lemmatization",
                 options=lammatization_options,
             post_steps = st.multiselect(
                 "Select post-lemmatization processing steps (ordered)",
                 options=steps_options,
+                default=[steps_options[i] for i in PreprocessingConfigs.DEFAULT_POST.value],
                 format_func=lambda x: x.replace("_", " ").title(),
                 help="Select the processing steps to apply after the text is lemmatized",
             )
             # preprocess
             with st.spinner("Step 1/4: Preprocessing text"):
+                pipe = PreprocessingPipeline(language, pre_steps, lemmatization_step, post_steps)
                 df = pipe.vaex_process(df, text_column)
             # prepare input
             with st.spinner("Step 4/4: Preparing outputs"):
                 new_df = output_transform(pos, neg)
             return new_df
             """
         )
+    with st.expander("Do I need to preprocess my data?"):
+        st.markdown(
+            """
+            No, there is no need to preprocess your text, we will take of it.
+            However, if you wish to do so, turn off preprocessing in the `Advanced
+            Settings` in the interactive UI.
+        """
+        )
     with st.expander("What languages are supported?"):
         st.markdown(
             f"""
         """
     )
+    st.subheader("Quickstart")
+    st.markdown(
+        """
+        - There is no need to preprocess your text, we will take care of it. However, if you wish to
+        do so, turn off preprocessing in the `Advanced Settings` in the interactive UI.
+        - We expect a file with two columns: `label` with the labels and `text` with the texts (the names are case insensitive). If
+        you provide a file following this naming convention, Wordify will automatically select the
+        correct columns. However, if you wish to use a different nomenclature, you will be asked to
+        provide the column names in the interactive UI.
+        """
+    )
     st.subheader("Input format")
     st.markdown(
         """
         - `Score`: the wordify score, between 0 and 1, of how important is `Word` to discrimitate `Label`
         - `Label`: the label that `Word` is discriminating
         - `Correlation`: how `Word` is correlated with `Label` (e.g., "negative" means that if `Word` is present in the text then the label is less likely to be `Label`)
+        for example
         """
     )
+    st.table(
+        {
+            "Word": ["good", "awful", "bad service", "etc"],
+            "Score": ["0.52", "0.49", "0.35", "etc"],
+            "Label": ["Good", "Bad", "Good", "etc"],
+            "Correlation": ["positive", "positive", "negative", "etc"],
+        }
+    )
 def footer():
     st.sidebar.markdown(

src/configs.py CHANGED Viewed

@@ -3,6 +3,11 @@ from enum import Enum
 import pandas as pd
 class ModelConfigs(Enum):
     NUM_ITERS = 500
     SELECTION_THRESHOLD = 0.0

 import pandas as pd
+class ColumnNames(Enum):
+    """Default column names looked up when pre-selecting input columns."""
+    # Name matched (after lower-casing the candidates) for the label column.
+    LABEL = "label"
+    # Name matched (after lower-casing the candidates) for the text column.
+    TEXT = "text"
 class ModelConfigs(Enum):
     NUM_ITERS = 500
     SELECTION_THRESHOLD = 0.0

src/utils.py CHANGED Viewed

@@ -5,7 +5,23 @@ import pandas as pd
 import streamlit as st
 from PIL import Image
-from .configs import SupportedFiles
 @st.cache
@@ -52,12 +68,7 @@ def plot_labels_prop(data: pd.DataFrame, label_column: str):
         return
-    source = (
-        data[label_column]
-        .value_counts()
-        .reset_index()
-        .rename(columns={"index": "Labels", label_column: "Counts"})
-    )
     source["Props"] = source["Counts"] / source["Counts"].sum()
     source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"
@@ -70,9 +81,7 @@ def plot_labels_prop(data: pd.DataFrame, label_column: str):
         )
     )
-    text = bars.mark_text(align="center", baseline="middle", dy=15).encode(
-        text="Proportions:O"
-    )
     return (bars + text).properties(height=300)
@@ -84,9 +93,7 @@ def plot_nchars(data: pd.DataFrame, text_column: str):
         alt.Chart(source)
         .mark_bar()
         .encode(
-            alt.X(
-                f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")
-            ),
             alt.Y("count()", axis=alt.Axis(title="")),
         )
     )
@@ -96,11 +103,7 @@ def plot_nchars(data: pd.DataFrame, text_column: str):
 def plot_score(data: pd.DataFrame, label_col: str, label: str):
-    source = (
-        data.loc[data[label_col] == label]
-        .sort_values("score", ascending=False)
-        .head(100)
-    )
     plot = (
         alt.Chart(source)

 import streamlit as st
 from PIL import Image
+from .configs import SupportedFiles, ColumnNames
+def get_col_indices(cols):
+    """Find the positions of the default text and label columns in ``cols``.
+
+    Each entry of ``cols`` is compared case-insensitively against
+    ``ColumnNames.TEXT.value`` and ``ColumnNames.LABEL.value``.
+
+    Args:
+        cols: sequence of column names; entries that are not strings are
+            converted with ``str()`` before the comparison.
+
+    Returns:
+        A ``(text_index, label_index)`` tuple. An index is ``0`` when the
+        corresponding default name is not present in ``cols``.
+    """
+    # str() keeps non-string headers (e.g. integer column names) from raising
+    # AttributeError on .lower().
+    lowered = [str(col).lower() for col in cols]
+
+    def _index_or_zero(name):
+        # list.index raises ValueError only when the name is absent; catching
+        # just that avoids hiding unrelated errors.
+        try:
+            return lowered.index(name)
+        except ValueError:
+            return 0
+
+    text_index = _index_or_zero(ColumnNames.TEXT.value)
+    label_index = _index_or_zero(ColumnNames.LABEL.value)
+    return text_index, label_index
 @st.cache
         return
+    source = data[label_column].value_counts().reset_index().rename(columns={"index": "Labels", label_column: "Counts"})
     source["Props"] = source["Counts"] / source["Counts"].sum()
     source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"
         )
     )
+    text = bars.mark_text(align="center", baseline="middle", dy=15).encode(text="Proportions:O")
     return (bars + text).properties(height=300)
         alt.Chart(source)
         .mark_bar()
         .encode(
+            alt.X(f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")),
             alt.Y("count()", axis=alt.Axis(title="")),
         )
     )
 def plot_score(data: pd.DataFrame, label_col: str, label: str):
+    source = data.loc[data[label_col] == label].sort_values("score", ascending=False).head(100)
     plot = (
         alt.Chart(source)