Spaces:
Build error
Build error
Pietro Lesci
committed on
Commit
•
ab15c62
1
Parent(s):
c700823
reformat code
Browse files
- src/pages/home.py +14 -4
- src/preprocessing.py +14 -6
src/pages/home.py
CHANGED
@@ -45,7 +45,9 @@ def write(session, uploaded_file):
|
|
45 |
)
|
46 |
with col2:
|
47 |
cols_options = [""] + data.columns.tolist()
|
48 |
-
label_column = st.selectbox(
|
|
|
|
|
49 |
with st.beta_expander("Description"):
|
50 |
st.markdown("Select the column containing the labels.")
|
51 |
|
@@ -60,7 +62,9 @@ def write(session, uploaded_file):
|
|
60 |
st.markdown("Select the column containing the texts.")
|
61 |
|
62 |
if text_column:
|
63 |
-
st.altair_chart(
|
|
|
|
|
64 |
|
65 |
# ==== 2.1 CREATE UI FOR ADVANCED OPTIONS ==== #
|
66 |
with st.beta_expander("Advanced options"):
|
@@ -151,7 +155,11 @@ def write(session, uploaded_file):
|
|
151 |
sample_data[f"preprocessed_{text_column}"] = preprocessing_pipeline(
|
152 |
sample_data[text_column]
|
153 |
).values
|
154 |
-
st.table(
|
|
|
|
|
|
|
|
|
155 |
|
156 |
# ==== 4. RUN ==== #
|
157 |
run_button = st.button("Wordify!")
|
@@ -183,7 +191,9 @@ def write(session, uploaded_file):
|
|
183 |
col1, col2, col3 = st.beta_columns([2, 3, 3])
|
184 |
|
185 |
with col1:
|
186 |
-
label = st.selectbox(
|
|
|
|
|
187 |
# # with col2:
|
188 |
# thres = st.slider(
|
189 |
# "Select threshold",
|
|
|
45 |
)
|
46 |
with col2:
|
47 |
cols_options = [""] + data.columns.tolist()
|
48 |
+
label_column = st.selectbox(
|
49 |
+
"Select label column name", cols_options, index=0
|
50 |
+
)
|
51 |
with st.beta_expander("Description"):
|
52 |
st.markdown("Select the column containing the labels.")
|
53 |
|
|
|
62 |
st.markdown("Select the column containing the texts.")
|
63 |
|
64 |
if text_column:
|
65 |
+
st.altair_chart(
|
66 |
+
plot_nchars(data, text_column), use_container_width=True
|
67 |
+
)
|
68 |
|
69 |
# ==== 2.1 CREATE UI FOR ADVANCED OPTIONS ==== #
|
70 |
with st.beta_expander("Advanced options"):
|
|
|
155 |
sample_data[f"preprocessed_{text_column}"] = preprocessing_pipeline(
|
156 |
sample_data[text_column]
|
157 |
).values
|
158 |
+
st.table(
|
159 |
+
sample_data.loc[
|
160 |
+
:, [label_column, text_column, f"preprocessed_{text_column}"]
|
161 |
+
]
|
162 |
+
)
|
163 |
|
164 |
# ==== 4. RUN ==== #
|
165 |
run_button = st.button("Wordify!")
|
|
|
191 |
col1, col2, col3 = st.beta_columns([2, 3, 3])
|
192 |
|
193 |
with col1:
|
194 |
+
label = st.selectbox(
|
195 |
+
"Select label", data[label_column].unique().tolist()
|
196 |
+
)
|
197 |
# # with col2:
|
198 |
# thres = st.slider(
|
199 |
# "Select threshold",
|
src/preprocessing.py
CHANGED
@@ -91,7 +91,9 @@ def normalize_repeating_words(t):
|
|
91 |
class Lemmatizer:
|
92 |
"""Creates lemmatizer based on spacy"""
|
93 |
|
94 |
-
def __init__(
|
|
|
|
|
95 |
self.language = language
|
96 |
self.nlp = spacy.load(
|
97 |
Languages[language].value, exclude=["parser", "ner", "pos", "tok2vec"]
|
@@ -99,12 +101,16 @@ class Lemmatizer:
|
|
99 |
self._lemmatizer_fn = self._get_lemmatization_fn(remove_stop, lemmatization)
|
100 |
self.lemmatization = lemmatization
|
101 |
|
102 |
-
def _get_lemmatization_fn(
|
|
|
|
|
103 |
"""Return the correct spacy Doc-level lemmatizer"""
|
104 |
if remove_stop and lemmatization:
|
105 |
|
106 |
def lemmatizer_fn(doc: spacy.tokens.doc.Doc) -> str:
|
107 |
-
return " ".join(
|
|
|
|
|
108 |
|
109 |
elif remove_stop and not lemmatization:
|
110 |
|
@@ -136,7 +142,9 @@ class Lemmatizer:
|
|
136 |
|
137 |
|
138 |
class PreprocessingPipeline:
|
139 |
-
def __init__(
|
|
|
|
|
140 |
|
141 |
# build pipeline
|
142 |
self.pre_pipeline, self.lemmatizer, self.post_pipeline = self.make_pipeline(
|
@@ -146,10 +154,10 @@ class PreprocessingPipeline:
|
|
146 |
def __call__(self, series: Series) -> Series:
|
147 |
with st.spinner("Pre-lemmatization cleaning"):
|
148 |
res = series.progress_map(self.pre_pipeline)
|
149 |
-
|
150 |
with st.spinner("Lemmatizing"):
|
151 |
res = self.lemmatizer(series)
|
152 |
-
|
153 |
with st.spinner("Post-lemmatization cleaning"):
|
154 |
res = series.progress_map(self.post_pipeline)
|
155 |
|
|
|
91 |
class Lemmatizer:
|
92 |
"""Creates lemmatizer based on spacy"""
|
93 |
|
94 |
+
def __init__(
|
95 |
+
self, language: str, remove_stop: bool = True, lemmatization: bool = True
|
96 |
+
) -> None:
|
97 |
self.language = language
|
98 |
self.nlp = spacy.load(
|
99 |
Languages[language].value, exclude=["parser", "ner", "pos", "tok2vec"]
|
|
|
101 |
self._lemmatizer_fn = self._get_lemmatization_fn(remove_stop, lemmatization)
|
102 |
self.lemmatization = lemmatization
|
103 |
|
104 |
+
def _get_lemmatization_fn(
|
105 |
+
self, remove_stop: bool, lemmatization: bool
|
106 |
+
) -> Optional[Callable]:
|
107 |
"""Return the correct spacy Doc-level lemmatizer"""
|
108 |
if remove_stop and lemmatization:
|
109 |
|
110 |
def lemmatizer_fn(doc: spacy.tokens.doc.Doc) -> str:
|
111 |
+
return " ".join(
|
112 |
+
[t.lemma_ for t in doc if t.lemma_ != "-PRON-" and not t.is_stop]
|
113 |
+
)
|
114 |
|
115 |
elif remove_stop and not lemmatization:
|
116 |
|
|
|
142 |
|
143 |
|
144 |
class PreprocessingPipeline:
|
145 |
+
def __init__(
|
146 |
+
self, pre_steps: List[str], lemmatizer: Lemmatizer, post_steps: List[str]
|
147 |
+
):
|
148 |
|
149 |
# build pipeline
|
150 |
self.pre_pipeline, self.lemmatizer, self.post_pipeline = self.make_pipeline(
|
|
|
154 |
def __call__(self, series: Series) -> Series:
|
155 |
with st.spinner("Pre-lemmatization cleaning"):
|
156 |
res = series.progress_map(self.pre_pipeline)
|
157 |
+
|
158 |
with st.spinner("Lemmatizing"):
|
159 |
res = self.lemmatizer(series)
|
160 |
+
|
161 |
with st.spinner("Post-lemmatization cleaning"):
|
162 |
res = series.progress_map(self.post_pipeline)
|
163 |
|