Spaces:
Build error
Build error
Pietro Lesci
committed on
Commit
•
ab15c62
1
Parent(s):
c700823
reformat code
Browse files
- src/pages/home.py +14 -4
- src/preprocessing.py +14 -6
src/pages/home.py
CHANGED
@@ -45,7 +45,9 @@ def write(session, uploaded_file):
|
|
45 |
)
|
46 |
with col2:
|
47 |
cols_options = [""] + data.columns.tolist()
|
48 |
-
label_column = st.selectbox(
|
|
|
|
|
49 |
with st.beta_expander("Description"):
|
50 |
st.markdown("Select the column containing the labels.")
|
51 |
|
@@ -60,7 +62,9 @@ def write(session, uploaded_file):
|
|
60 |
st.markdown("Select the column containing the texts.")
|
61 |
|
62 |
if text_column:
|
63 |
-
st.altair_chart(
|
|
|
|
|
64 |
|
65 |
# ==== 2.1 CREATE UI FOR ADVANCED OPTIONS ==== #
|
66 |
with st.beta_expander("Advanced options"):
|
@@ -151,7 +155,11 @@ def write(session, uploaded_file):
|
|
151 |
sample_data[f"preprocessed_{text_column}"] = preprocessing_pipeline(
|
152 |
sample_data[text_column]
|
153 |
).values
|
154 |
-
st.table(
|
|
|
|
|
|
|
|
|
155 |
|
156 |
# ==== 4. RUN ==== #
|
157 |
run_button = st.button("Wordify!")
|
@@ -183,7 +191,9 @@ def write(session, uploaded_file):
|
|
183 |
col1, col2, col3 = st.beta_columns([2, 3, 3])
|
184 |
|
185 |
with col1:
|
186 |
-
label = st.selectbox(
|
|
|
|
|
187 |
# # with col2:
|
188 |
# thres = st.slider(
|
189 |
# "Select threshold",
|
|
|
45 |
)
|
46 |
with col2:
|
47 |
cols_options = [""] + data.columns.tolist()
|
48 |
+
label_column = st.selectbox(
|
49 |
+
"Select label column name", cols_options, index=0
|
50 |
+
)
|
51 |
with st.beta_expander("Description"):
|
52 |
st.markdown("Select the column containing the labels.")
|
53 |
|
|
|
62 |
st.markdown("Select the column containing the texts.")
|
63 |
|
64 |
if text_column:
|
65 |
+
st.altair_chart(
|
66 |
+
plot_nchars(data, text_column), use_container_width=True
|
67 |
+
)
|
68 |
|
69 |
# ==== 2.1 CREATE UI FOR ADVANCED OPTIONS ==== #
|
70 |
with st.beta_expander("Advanced options"):
|
|
|
155 |
sample_data[f"preprocessed_{text_column}"] = preprocessing_pipeline(
|
156 |
sample_data[text_column]
|
157 |
).values
|
158 |
+
st.table(
|
159 |
+
sample_data.loc[
|
160 |
+
:, [label_column, text_column, f"preprocessed_{text_column}"]
|
161 |
+
]
|
162 |
+
)
|
163 |
|
164 |
# ==== 4. RUN ==== #
|
165 |
run_button = st.button("Wordify!")
|
|
|
191 |
col1, col2, col3 = st.beta_columns([2, 3, 3])
|
192 |
|
193 |
with col1:
|
194 |
+
label = st.selectbox(
|
195 |
+
"Select label", data[label_column].unique().tolist()
|
196 |
+
)
|
197 |
# # with col2:
|
198 |
# thres = st.slider(
|
199 |
# "Select threshold",
|
src/preprocessing.py
CHANGED
@@ -91,7 +91,9 @@ def normalize_repeating_words(t):
|
|
91 |
class Lemmatizer:
|
92 |
"""Creates lemmatizer based on spacy"""
|
93 |
|
94 |
-
def __init__(
|
|
|
|
|
95 |
self.language = language
|
96 |
self.nlp = spacy.load(
|
97 |
Languages[language].value, exclude=["parser", "ner", "pos", "tok2vec"]
|
@@ -99,12 +101,16 @@ class Lemmatizer:
|
|
99 |
self._lemmatizer_fn = self._get_lemmatization_fn(remove_stop, lemmatization)
|
100 |
self.lemmatization = lemmatization
|
101 |
|
102 |
-
def _get_lemmatization_fn(
|
|
|
|
|
103 |
"""Return the correct spacy Doc-level lemmatizer"""
|
104 |
if remove_stop and lemmatization:
|
105 |
|
106 |
def lemmatizer_fn(doc: spacy.tokens.doc.Doc) -> str:
|
107 |
-
return " ".join(
|
|
|
|
|
108 |
|
109 |
elif remove_stop and not lemmatization:
|
110 |
|
@@ -136,7 +142,9 @@ class Lemmatizer:
|
|
136 |
|
137 |
|
138 |
class PreprocessingPipeline:
|
139 |
-
def __init__(
|
|
|
|
|
140 |
|
141 |
# build pipeline
|
142 |
self.pre_pipeline, self.lemmatizer, self.post_pipeline = self.make_pipeline(
|
@@ -146,10 +154,10 @@ class PreprocessingPipeline:
|
|
146 |
def __call__(self, series: Series) -> Series:
|
147 |
with st.spinner("Pre-lemmatization cleaning"):
|
148 |
res = series.progress_map(self.pre_pipeline)
|
149 |
-
|
150 |
with st.spinner("Lemmatizing"):
|
151 |
res = self.lemmatizer(series)
|
152 |
-
|
153 |
with st.spinner("Post-lemmatization cleaning"):
|
154 |
res = series.progress_map(self.post_pipeline)
|
155 |
|
|
|
91 |
class Lemmatizer:
|
92 |
"""Creates lemmatizer based on spacy"""
|
93 |
|
94 |
+
def __init__(
|
95 |
+
self, language: str, remove_stop: bool = True, lemmatization: bool = True
|
96 |
+
) -> None:
|
97 |
self.language = language
|
98 |
self.nlp = spacy.load(
|
99 |
Languages[language].value, exclude=["parser", "ner", "pos", "tok2vec"]
|
|
|
101 |
self._lemmatizer_fn = self._get_lemmatization_fn(remove_stop, lemmatization)
|
102 |
self.lemmatization = lemmatization
|
103 |
|
104 |
+
def _get_lemmatization_fn(
|
105 |
+
self, remove_stop: bool, lemmatization: bool
|
106 |
+
) -> Optional[Callable]:
|
107 |
"""Return the correct spacy Doc-level lemmatizer"""
|
108 |
if remove_stop and lemmatization:
|
109 |
|
110 |
def lemmatizer_fn(doc: spacy.tokens.doc.Doc) -> str:
|
111 |
+
return " ".join(
|
112 |
+
[t.lemma_ for t in doc if t.lemma_ != "-PRON-" and not t.is_stop]
|
113 |
+
)
|
114 |
|
115 |
elif remove_stop and not lemmatization:
|
116 |
|
|
|
142 |
|
143 |
|
144 |
class PreprocessingPipeline:
|
145 |
+
def __init__(
|
146 |
+
self, pre_steps: List[str], lemmatizer: Lemmatizer, post_steps: List[str]
|
147 |
+
):
|
148 |
|
149 |
# build pipeline
|
150 |
self.pre_pipeline, self.lemmatizer, self.post_pipeline = self.make_pipeline(
|
|
|
154 |
def __call__(self, series: Series) -> Series:
|
155 |
with st.spinner("Pre-lemmatization cleaning"):
|
156 |
res = series.progress_map(self.pre_pipeline)
|
157 |
+
|
158 |
with st.spinner("Lemmatizing"):
|
159 |
res = self.lemmatizer(series)
|
160 |
+
|
161 |
with st.spinner("Post-lemmatization cleaning"):
|
162 |
res = series.progress_map(self.post_pipeline)
|
163 |
|