Pietro Lesci committed on
Commit
51cab9d
1 Parent(s): 02c2d7e
app.py CHANGED
@@ -16,7 +16,9 @@ st.set_page_config(
16
  )
17
 
18
  # session state
19
- session = session_state.get(process=False, run_id=0, posdf=None, negdf=None, uploaded_file_id=0)
 
 
20
 
21
 
22
  # ==== SIDEBAR ==== #
@@ -42,7 +44,9 @@ st.sidebar.markdown("")
42
  st.sidebar.markdown("")
43
  st.sidebar.header("Upload file")
44
  # with st.sidebar.beta_container():
45
- uploaded_file = st.sidebar.file_uploader("Select file", type=[i.name for i in SupportedFiles])
 
 
46
 
47
 
48
  # FOOTER
@@ -62,4 +66,4 @@ with st.beta_container():
62
  st.title("Wordify")
63
 
64
 
65
- page.write(session, uploaded_file)
 
16
  )
17
 
18
  # session state
19
+ session = session_state.get(
20
+ process=False, run_id=0, posdf=None, negdf=None, uploaded_file_id=0
21
+ )
22
 
23
 
24
  # ==== SIDEBAR ==== #
 
44
  st.sidebar.markdown("")
45
  st.sidebar.header("Upload file")
46
  # with st.sidebar.beta_container():
47
+ uploaded_file = st.sidebar.file_uploader(
48
+ "Select file", type=[i.name for i in SupportedFiles]
49
+ )
50
 
51
 
52
  # FOOTER
 
66
  st.title("Wordify")
67
 
68
 
69
+ page.write(session, uploaded_file)
notebooks/wordifier_nb.ipynb CHANGED
@@ -61,11 +61,29 @@
61
  },
62
  {
63
  "cell_type": "code",
64
- "execution_count": 2,
 
 
 
 
 
 
 
 
 
65
  "metadata": {},
66
  "outputs": [],
67
  "source": [
68
- "df = pd.read_excel(\"../data/test_de.xlsx\")\n",
 
 
 
 
 
 
 
 
 
69
  "# mdf = mpd.read_csv(\"../data/test_en.csv\")\n",
70
  "language = \"English\"\n",
71
  "nlp = spacy.load(Languages[language].value, exclude=[\"parser\", \"ner\", \"pos\", \"tok2vec\"])"
@@ -73,7 +91,7 @@
73
  },
74
  {
75
  "cell_type": "code",
76
- "execution_count": 3,
77
  "metadata": {},
78
  "outputs": [],
79
  "source": [
@@ -86,19 +104,14 @@
86
  },
87
  {
88
  "cell_type": "code",
89
- "execution_count": 4,
90
  "metadata": {},
91
  "outputs": [
92
  {
93
  "output_type": "stream",
94
  "name": "stderr",
95
  "text": [
96
- "2021-05-10 18:34:49.425 WARNING root: \n",
97
- " \u001b[33m\u001b[1mWarning:\u001b[0m to view this Streamlit app on a browser, run it with the following\n",
98
- " command:\n",
99
- "\n",
100
- " streamlit run /Users/49796/miniconda3/envs/py38/lib/python3.8/site-packages/ipykernel_launcher.py [ARGUMENTS]\n",
101
- "100%|██████████| 6269/6269 [00:02<00:00, 2750.45it/s]\n"
102
  ]
103
  }
104
  ],
@@ -108,7 +121,7 @@
108
  },
109
  {
110
  "cell_type": "code",
111
- "execution_count": 5,
112
  "metadata": {},
113
  "outputs": [],
114
  "source": [
 
61
  },
62
  {
63
  "cell_type": "code",
64
+ "execution_count": 4,
65
+ "metadata": {},
66
+ "outputs": [],
67
+ "source": [
68
+ "path = \"../../../../Downloads/wordify_10000_copy.xlsx\""
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": 28,
74
  "metadata": {},
75
  "outputs": [],
76
  "source": [
77
+ "df = pd.read_excel(path, dtype=str).dropna()"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "code",
82
+ "execution_count": 29,
83
+ "metadata": {},
84
+ "outputs": [],
85
+ "source": [
86
+ "# df = pd.read_excel(\"../data/test_de.xlsx\")\n",
87
  "# mdf = mpd.read_csv(\"../data/test_en.csv\")\n",
88
  "language = \"English\"\n",
89
  "nlp = spacy.load(Languages[language].value, exclude=[\"parser\", \"ner\", \"pos\", \"tok2vec\"])"
 
91
  },
92
  {
93
  "cell_type": "code",
94
+ "execution_count": 30,
95
  "metadata": {},
96
  "outputs": [],
97
  "source": [
 
104
  },
105
  {
106
  "cell_type": "code",
107
+ "execution_count": 31,
108
  "metadata": {},
109
  "outputs": [
110
  {
111
  "output_type": "stream",
112
  "name": "stderr",
113
  "text": [
114
+ "100%|██████████| 9939/9939 [00:06<00:00, 1431.09it/s]\n"
 
 
 
 
 
115
  ]
116
  }
117
  ],
 
121
  },
122
  {
123
  "cell_type": "code",
124
+ "execution_count": 32,
125
  "metadata": {},
126
  "outputs": [],
127
  "source": [
src/pages/about.py CHANGED
@@ -31,4 +31,4 @@ def write(*args):
31
  <iframe src="https://www.google.com/maps/embed?pb=!1m18!1m12!1m3!1d2798.949796165441!2d9.185730115812493!3d45.450667779100726!2m3!1f0!2f0!3f0!3m2!1i1024!2i768!4f13.1!3m3!1m2!1s0x4786c405ae6543c9%3A0xf2bb2313b36af88c!2sVia%20Guglielmo%20R%C3%B6ntgen%2C%201%2C%2020136%20Milano%20MI!5e0!3m2!1sit!2sit!4v1569325279433!5m2!1sit!2sit" frameborder="0" style="border:0; width: 100%; height: 312px;" allowfullscreen></iframe>
32
  """,
33
  unsafe_allow_html=True,
34
- )
 
31
  <iframe src="https://www.google.com/maps/embed?pb=!1m18!1m12!1m3!1d2798.949796165441!2d9.185730115812493!3d45.450667779100726!2m3!1f0!2f0!3f0!3m2!1i1024!2i768!4f13.1!3m3!1m2!1s0x4786c405ae6543c9%3A0xf2bb2313b36af88c!2sVia%20Guglielmo%20R%C3%B6ntgen%2C%201%2C%2020136%20Milano%20MI!5e0!3m2!1sit!2sit!4v1569325279433!5m2!1sit!2sit" frameborder="0" style="border:0; width: 100%; height: 312px;" allowfullscreen></iframe>
32
  """,
33
  unsafe_allow_html=True,
34
+ )
src/pages/home.py CHANGED
@@ -1,7 +1,6 @@
1
  from src.configs import Languages
2
  from src.utils import (
3
  encode,
4
- wordifier,
5
  download_button,
6
  TextPreprocessor,
7
  plot_labels_prop,
@@ -9,28 +8,33 @@ from src.utils import (
9
  plot_score,
10
  read_file,
11
  )
 
12
  import streamlit as st
13
 
14
 
15
  def write(session, uploaded_file):
16
 
17
- st.markdown(
18
- """
19
- Hi! Welcome to __Wordify__. Start by uploading a file - CSV, XLSX (avoid Strict Open XML Spreadsheet format [here](https://stackoverflow.com/questions/62800822/openpyxl-cannot-read-strict-open-xml-spreadsheet-format-userwarning-file-conta)),
20
- or PARQUET are currently supported.
21
 
22
- Once you have uploaded the file, __Wordify__ will show an interactive UI through which
23
- you'll be able to interactively decide the text preprocessing steps, their order, and
24
- proceed to Wordify your text.
25
 
26
- If you're ready, let's jump in:
 
 
27
 
28
- :point_left: upload a file via the upload widget in the sidebar!
29
 
30
- NOTE: whenever you want to reset everything, simply refresh the page
31
- """
32
- )
33
- if uploaded_file:
 
 
 
34
 
35
  # 1. READ FILE
36
  with st.spinner("Reading file"):
@@ -38,10 +42,6 @@ def write(session, uploaded_file):
38
  data = read_file(uploaded_file)
39
 
40
  # 2. CREATE UI TO SELECT COLUMNS
41
- st.markdown("")
42
- st.markdown("")
43
- st.header("Process")
44
-
45
  col1, col2, col3 = st.beta_columns(3)
46
  with col1:
47
  language = st.selectbox("Select language", [i.name for i in Languages])
@@ -51,13 +51,16 @@ def write(session, uploaded_file):
51
  )
52
  with col2:
53
  cols_options = [""] + data.columns.tolist()
54
- label_column = st.selectbox("Select label column name", cols_options, index=0)
 
 
55
  with st.beta_expander("Description"):
56
  st.markdown("Select the column containing the label")
57
 
58
  if label_column:
59
  plot = plot_labels_prop(data, label_column)
60
- if plot: st.altair_chart(plot, use_container_width=True)
 
61
 
62
  with col3:
63
  text_column = st.selectbox("Select text column name", cols_options, index=0)
@@ -65,7 +68,9 @@ def write(session, uploaded_file):
65
  st.markdown("Select the column containing the text")
66
 
67
  if text_column:
68
- st.altair_chart(plot_nchars(data, text_column), use_container_width=True)
 
 
69
 
70
  with st.beta_expander("Advanced options"):
71
  # Lemmatization option
@@ -102,14 +107,18 @@ def write(session, uploaded_file):
102
  format_func=lambda x: x.replace("_", " ").title(),
103
  key=session.run_id,
104
  )
105
- lemmatization_options = list(TextPreprocessor._lemmatization_options().keys())
 
 
106
  lemmatization_when = lemmatization_when_elem.selectbox(
107
  "Select when lemmatization happens",
108
  options=lemmatization_options,
109
  index=0,
110
  key=session.run_id,
111
  )
112
- remove_stopwords = remove_stopwords_elem.checkbox("Remove stopwords", value=True, key=session.run_id)
 
 
113
 
114
  # Show sample checkbox
115
  col1, col2 = st.beta_columns([1, 2])
@@ -130,8 +139,14 @@ def write(session, uploaded_file):
130
 
131
  elif show_sample and (label_column and text_column):
132
  sample_data = data.sample(10)
133
- sample_data[f"preprocessed_{text_column}"] = preprocessor.fit_transform(sample_data[text_column]).values
134
- st.table(sample_data.loc[:, [label_column, text_column, f"preprocessed_{text_column}"]])
 
 
 
 
 
 
135
 
136
  # 4. RUN
137
  run_button = st.button("Wordify!")
@@ -142,7 +157,9 @@ def write(session, uploaded_file):
142
 
143
  with st.spinner("Process started"):
144
  # data = data.head()
145
- data[f"preprocessed_{text_column}"] = preprocessor.fit_transform(data[text_column]).values
 
 
146
 
147
  inputs = encode(data[f"preprocessed_{text_column}"], data[label_column])
148
  session.posdf, session.negdf = wordifier(**inputs)
@@ -161,7 +178,9 @@ def write(session, uploaded_file):
161
  col1, col2, col3 = st.beta_columns([2, 3, 3])
162
 
163
  with col1:
164
- label = st.selectbox("Select label", data[label_column].unique().tolist())
 
 
165
  # # with col2:
166
  # thres = st.slider(
167
  # "Select threshold",
@@ -175,14 +194,28 @@ def write(session, uploaded_file):
175
 
176
  with col2:
177
  st.subheader(f"Words __positively__ identifying label `{label}`")
178
- st.write(session.posdf[session.posdf[label_column] == label].sort_values("score", ascending=False))
 
 
 
 
179
  download_button(session.posdf, "positive_data")
180
  if show_plots:
181
- st.altair_chart(plot_score(session.posdf, label_column, label), use_container_width=True)
 
 
 
182
 
183
  with col3:
184
  st.subheader(f"Words __negatively__ identifying label `{label}`")
185
- st.write(session.negdf[session.negdf[label_column] == label].sort_values("score", ascending=False))
 
 
 
 
186
  download_button(session.negdf, "negative_data")
187
  if show_plots:
188
- st.altair_chart(plot_score(session.negdf, label_column, label), use_container_width=True)
 
 
 
 
1
  from src.configs import Languages
2
  from src.utils import (
3
  encode,
 
4
  download_button,
5
  TextPreprocessor,
6
  plot_labels_prop,
 
8
  plot_score,
9
  read_file,
10
  )
11
+ from src.wordifier import wordifier
12
  import streamlit as st
13
 
14
 
15
  def write(session, uploaded_file):
16
 
17
+ if not uploaded_file:
18
+ st.markdown(
19
+ """
20
+ Hi, welcome to __Wordify__! :rocket:
21
 
22
+ Start by uploading a file - CSV, XLSX (avoid Strict Open XML Spreadsheet format [here](https://stackoverflow.com/questions/62800822/openpyxl-cannot-read-strict-open-xml-spreadsheet-format-userwarning-file-conta)),
23
+ or PARQUET are currently supported.
 
24
 
25
+ Once you have uploaded the file, __Wordify__ will show an interactive UI through which
26
+ you'll be able to interactively decide the text preprocessing steps, their order, and
27
+ proceed to Wordify your text.
28
 
29
+ If you're ready, let's jump in:
30
 
31
+ :point_left: upload a file via the upload widget in the sidebar!
32
+
33
+ NOTE: whenever you want to reset everything, simply refresh the page.
34
+ """
35
+ )
36
+
37
+ elif uploaded_file:
38
 
39
  # 1. READ FILE
40
  with st.spinner("Reading file"):
 
42
  data = read_file(uploaded_file)
43
 
44
  # 2. CREATE UI TO SELECT COLUMNS
 
 
 
 
45
  col1, col2, col3 = st.beta_columns(3)
46
  with col1:
47
  language = st.selectbox("Select language", [i.name for i in Languages])
 
51
  )
52
  with col2:
53
  cols_options = [""] + data.columns.tolist()
54
+ label_column = st.selectbox(
55
+ "Select label column name", cols_options, index=0
56
+ )
57
  with st.beta_expander("Description"):
58
  st.markdown("Select the column containing the label")
59
 
60
  if label_column:
61
  plot = plot_labels_prop(data, label_column)
62
+ if plot:
63
+ st.altair_chart(plot, use_container_width=True)
64
 
65
  with col3:
66
  text_column = st.selectbox("Select text column name", cols_options, index=0)
 
68
  st.markdown("Select the column containing the text")
69
 
70
  if text_column:
71
+ st.altair_chart(
72
+ plot_nchars(data, text_column), use_container_width=True
73
+ )
74
 
75
  with st.beta_expander("Advanced options"):
76
  # Lemmatization option
 
107
  format_func=lambda x: x.replace("_", " ").title(),
108
  key=session.run_id,
109
  )
110
+ lemmatization_options = list(
111
+ TextPreprocessor._lemmatization_options().keys()
112
+ )
113
  lemmatization_when = lemmatization_when_elem.selectbox(
114
  "Select when lemmatization happens",
115
  options=lemmatization_options,
116
  index=0,
117
  key=session.run_id,
118
  )
119
+ remove_stopwords = remove_stopwords_elem.checkbox(
120
+ "Remove stopwords", value=True, key=session.run_id
121
+ )
122
 
123
  # Show sample checkbox
124
  col1, col2 = st.beta_columns([1, 2])
 
139
 
140
  elif show_sample and (label_column and text_column):
141
  sample_data = data.sample(10)
142
+ sample_data[f"preprocessed_{text_column}"] = preprocessor.fit_transform(
143
+ sample_data[text_column]
144
+ ).values
145
+ st.table(
146
+ sample_data.loc[
147
+ :, [label_column, text_column, f"preprocessed_{text_column}"]
148
+ ]
149
+ )
150
 
151
  # 4. RUN
152
  run_button = st.button("Wordify!")
 
157
 
158
  with st.spinner("Process started"):
159
  # data = data.head()
160
+ data[f"preprocessed_{text_column}"] = preprocessor.fit_transform(
161
+ data[text_column]
162
+ ).values
163
 
164
  inputs = encode(data[f"preprocessed_{text_column}"], data[label_column])
165
  session.posdf, session.negdf = wordifier(**inputs)
 
178
  col1, col2, col3 = st.beta_columns([2, 3, 3])
179
 
180
  with col1:
181
+ label = st.selectbox(
182
+ "Select label", data[label_column].unique().tolist()
183
+ )
184
  # # with col2:
185
  # thres = st.slider(
186
  # "Select threshold",
 
194
 
195
  with col2:
196
  st.subheader(f"Words __positively__ identifying label `{label}`")
197
+ st.write(
198
+ session.posdf[session.posdf[label_column] == label].sort_values(
199
+ "score", ascending=False
200
+ )
201
+ )
202
  download_button(session.posdf, "positive_data")
203
  if show_plots:
204
+ st.altair_chart(
205
+ plot_score(session.posdf, label_column, label),
206
+ use_container_width=True,
207
+ )
208
 
209
  with col3:
210
  st.subheader(f"Words __negatively__ identifying label `{label}`")
211
+ st.write(
212
+ session.negdf[session.negdf[label_column] == label].sort_values(
213
+ "score", ascending=False
214
+ )
215
+ )
216
  download_button(session.negdf, "negative_data")
217
  if show_plots:
218
+ st.altair_chart(
219
+ plot_score(session.negdf, label_column, label),
220
+ use_container_width=True,
221
+ )
src/plotting.py CHANGED
@@ -22,7 +22,12 @@ def plot_labels_prop(data: pd.DataFrame, label_column: str):
22
 
23
  return
24
 
25
- source = data[label_column].value_counts().reset_index().rename(columns={"index": "Labels", label_column: "Counts"})
 
 
 
 
 
26
  source["Props"] = source["Counts"] / source["Counts"].sum()
27
  source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"
28
 
@@ -35,7 +40,9 @@ def plot_labels_prop(data: pd.DataFrame, label_column: str):
35
  )
36
  )
37
 
38
- text = bars.mark_text(align="center", baseline="middle", dy=15).encode(text="Proportions:O")
 
 
39
 
40
  return (bars + text).properties(height=300)
41
 
@@ -47,7 +54,9 @@ def plot_nchars(data: pd.DataFrame, text_column: str):
47
  alt.Chart(source)
48
  .mark_bar()
49
  .encode(
50
- alt.X(f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")),
 
 
51
  alt.Y("count()", axis=alt.Axis(title="")),
52
  )
53
  )
@@ -57,7 +66,11 @@ def plot_nchars(data: pd.DataFrame, text_column: str):
57
 
58
  def plot_score(data: pd.DataFrame, label_col: str, label: str):
59
 
60
- source = data.loc[data[label_col] == label].sort_values("score", ascending=False).head(100)
 
 
 
 
61
 
62
  plot = (
63
  alt.Chart(source)
 
22
 
23
  return
24
 
25
+ source = (
26
+ data[label_column]
27
+ .value_counts()
28
+ .reset_index()
29
+ .rename(columns={"index": "Labels", label_column: "Counts"})
30
+ )
31
  source["Props"] = source["Counts"] / source["Counts"].sum()
32
  source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"
33
 
 
40
  )
41
  )
42
 
43
+ text = bars.mark_text(align="center", baseline="middle", dy=15).encode(
44
+ text="Proportions:O"
45
+ )
46
 
47
  return (bars + text).properties(height=300)
48
 
 
54
  alt.Chart(source)
55
  .mark_bar()
56
  .encode(
57
+ alt.X(
58
+ f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")
59
+ ),
60
  alt.Y("count()", axis=alt.Axis(title="")),
61
  )
62
  )
 
66
 
67
  def plot_score(data: pd.DataFrame, label_col: str, label: str):
68
 
69
+ source = (
70
+ data.loc[data[label_col] == label]
71
+ .sort_values("score", ascending=False)
72
+ .head(100)
73
+ )
74
 
75
  plot = (
76
  alt.Chart(source)
src/preprocessing.py CHANGED
@@ -121,7 +121,9 @@ class TextPreprocessor:
121
 
122
  def lemmatizer(doc: spacy.tokens.doc.Doc) -> str:
123
  """Lemmatizes spacy Doc and removes stopwords"""
124
- return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-" and not t.is_stop])
 
 
125
 
126
  else:
127
 
 
121
 
122
  def lemmatizer(doc: spacy.tokens.doc.Doc) -> str:
123
  """Lemmatizes spacy Doc and removes stopwords"""
124
+ return " ".join(
125
+ [t.lemma_ for t in doc if t.lemma_ != "-PRON-" and not t.is_stop]
126
+ )
127
 
128
  else:
129
 
src/session_state.py CHANGED
@@ -100,13 +100,17 @@ def get(**kwargs):
100
  (not hasattr(s, "_main_dg") and s.enqueue == ctx.enqueue)
101
  or
102
  # Streamlit >= 0.65.2
103
- (not hasattr(s, "_main_dg") and s._uploaded_file_mgr == ctx.uploaded_file_mgr)
 
 
 
104
  ):
105
  this_session = s
106
 
107
  if this_session is None:
108
  raise RuntimeError(
109
- "Oh noes. Couldn't get your Streamlit Session object. " "Are you doing something fancy with threads?"
 
110
  )
111
 
112
  # Got the session object! Now let's attach some state into it.
 
100
  (not hasattr(s, "_main_dg") and s.enqueue == ctx.enqueue)
101
  or
102
  # Streamlit >= 0.65.2
103
+ (
104
+ not hasattr(s, "_main_dg")
105
+ and s._uploaded_file_mgr == ctx.uploaded_file_mgr
106
+ )
107
  ):
108
  this_session = s
109
 
110
  if this_session is None:
111
  raise RuntimeError(
112
+ "Oh noes. Couldn't get your Streamlit Session object. "
113
+ "Are you doing something fancy with threads?"
114
  )
115
 
116
  # Got the session object! Now let's attach some state into it.
src/utils.py CHANGED
@@ -55,7 +55,12 @@ def plot_labels_prop(data: pd.DataFrame, label_column: str):
55
 
56
  return
57
 
58
- source = data[label_column].value_counts().reset_index().rename(columns={"index": "Labels", label_column: "Counts"})
 
 
 
 
 
59
  source["Props"] = source["Counts"] / source["Counts"].sum()
60
  source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"
61
 
@@ -68,7 +73,9 @@ def plot_labels_prop(data: pd.DataFrame, label_column: str):
68
  )
69
  )
70
 
71
- text = bars.mark_text(align="center", baseline="middle", dy=15).encode(text="Proportions:O")
 
 
72
 
73
  return (bars + text).properties(height=300)
74
 
@@ -80,7 +87,9 @@ def plot_nchars(data: pd.DataFrame, text_column: str):
80
  alt.Chart(source)
81
  .mark_bar()
82
  .encode(
83
- alt.X(f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")),
 
 
84
  alt.Y("count()", axis=alt.Axis(title="")),
85
  )
86
  )
@@ -90,7 +99,11 @@ def plot_nchars(data: pd.DataFrame, text_column: str):
90
 
91
  def plot_score(data: pd.DataFrame, label_col: str, label: str):
92
 
93
- source = data.loc[data[label_col] == label].sort_values("score", ascending=False).head(100)
 
 
 
 
94
 
95
  plot = (
96
  alt.Chart(source)
 
55
 
56
  return
57
 
58
+ source = (
59
+ data[label_column]
60
+ .value_counts()
61
+ .reset_index()
62
+ .rename(columns={"index": "Labels", label_column: "Counts"})
63
+ )
64
  source["Props"] = source["Counts"] / source["Counts"].sum()
65
  source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"
66
 
 
73
  )
74
  )
75
 
76
+ text = bars.mark_text(align="center", baseline="middle", dy=15).encode(
77
+ text="Proportions:O"
78
+ )
79
 
80
  return (bars + text).properties(height=300)
81
 
 
87
  alt.Chart(source)
88
  .mark_bar()
89
  .encode(
90
+ alt.X(
91
+ f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")
92
+ ),
93
  alt.Y("count()", axis=alt.Axis(title="")),
94
  )
95
  )
 
99
 
100
  def plot_score(data: pd.DataFrame, label_col: str, label: str):
101
 
102
+ source = (
103
+ data.loc[data[label_col] == label]
104
+ .sort_values("score", ascending=False)
105
+ .head(100)
106
+ )
107
 
108
  plot = (
109
  alt.Chart(source)
src/wordifier.py CHANGED
@@ -43,7 +43,9 @@ def wordifier(X, y, X_names: List[str], y_names: List[str], configs=ModelConfigs
43
  # run randomized regression
44
  clf = LogisticRegression(
45
  penalty="l1",
46
- C=configs.PENALTIES.value[np.random.randint(len(configs.PENALTIES.value))],
 
 
47
  solver="liblinear",
48
  multi_class="auto",
49
  max_iter=500,
@@ -51,7 +53,9 @@ def wordifier(X, y, X_names: List[str], y_names: List[str], configs=ModelConfigs
51
  )
52
 
53
  # sample indices to subsample matrix
54
- selection = resample(np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size)
 
 
55
 
56
  # fit
57
  try:
@@ -74,14 +78,28 @@ def wordifier(X, y, X_names: List[str], y_names: List[str], configs=ModelConfigs
74
  neg_scores = neg_scores / configs.NUM_ITERS.value
75
 
76
  # get only active features
77
- pos_positions = np.where(pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0)
78
- neg_positions = np.where(neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0)
 
 
 
 
79
 
80
  # prepare DataFrame
81
- pos = [(X_names[i], pos_scores[c, i], y_names[c]) for c, i in zip(*pos_positions.nonzero())]
82
- neg = [(X_names[i], neg_scores[c, i], y_names[c]) for c, i in zip(*neg_positions.nonzero())]
83
-
84
- posdf = pd.DataFrame(pos, columns="word score label".split()).sort_values(["label", "score"], ascending=False)
85
- negdf = pd.DataFrame(neg, columns="word score label".split()).sort_values(["label", "score"], ascending=False)
 
 
 
 
 
 
 
 
 
 
86
 
87
  return posdf, negdf
 
43
  # run randomized regression
44
  clf = LogisticRegression(
45
  penalty="l1",
46
+ C=configs.PENALTIES.value[
47
+ np.random.randint(len(configs.PENALTIES.value))
48
+ ],
49
  solver="liblinear",
50
  multi_class="auto",
51
  max_iter=500,
 
53
  )
54
 
55
  # sample indices to subsample matrix
56
+ selection = resample(
57
+ np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size
58
+ )
59
 
60
  # fit
61
  try:
 
78
  neg_scores = neg_scores / configs.NUM_ITERS.value
79
 
80
  # get only active features
81
+ pos_positions = np.where(
82
+ pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0
83
+ )
84
+ neg_positions = np.where(
85
+ neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0
86
+ )
87
 
88
  # prepare DataFrame
89
+ pos = [
90
+ (X_names[i], pos_scores[c, i], y_names[c])
91
+ for c, i in zip(*pos_positions.nonzero())
92
+ ]
93
+ neg = [
94
+ (X_names[i], neg_scores[c, i], y_names[c])
95
+ for c, i in zip(*neg_positions.nonzero())
96
+ ]
97
+
98
+ posdf = pd.DataFrame(pos, columns="word score label".split()).sort_values(
99
+ ["label", "score"], ascending=False
100
+ )
101
+ negdf = pd.DataFrame(neg, columns="word score label".split()).sort_values(
102
+ ["label", "score"], ascending=False
103
+ )
104
 
105
  return posdf, negdf