Spaces:
Build error
Build error
Pietro Lesci
committed on
Commit
•
dc4ad9e
1
Parent(s):
13c6837
formatting
Browse files
- Makefile +9 -1
- src/components.py +46 -14
- src/preprocessing.py +0 -26
- src/utils.py +0 -80
Makefile
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
# Docker image build info
|
2 |
PROJECT:=wordify
|
3 |
BUILD_TAG?=v2.0
|
|
|
4 |
|
5 |
########################################################
|
6 |
## Local development
|
@@ -21,4 +22,11 @@ run:
|
|
21 |
docker run -d --name $(PROJECT)-${BUILD_TAG}-container -it --rm -p 4321:8501 $(PROJECT):${BUILD_TAG}
|
22 |
|
23 |
stop:
|
24 |
-
docker stop $(PROJECT)-${BUILD_TAG}-container
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
# Docker image build info
|
2 |
PROJECT:=wordify
|
3 |
BUILD_TAG?=v2.0
|
4 |
+
sources = src
|
5 |
|
6 |
########################################################
|
7 |
## Local development
|
|
|
22 |
docker run -d --name $(PROJECT)-${BUILD_TAG}-container -it --rm -p 4321:8501 $(PROJECT):${BUILD_TAG}
|
23 |
|
24 |
stop:
|
25 |
+
docker stop $(PROJECT)-${BUILD_TAG}-container
|
26 |
+
|
27 |
+
format:
|
28 |
+
isort $(sources)
|
29 |
+
black $(sources)
|
30 |
+
|
31 |
+
lint:
|
32 |
+
flake8 $(sources)
|
src/components.py
CHANGED
@@ -65,12 +65,16 @@ def form(df):
|
|
65 |
pre_steps = st.multiselect(
|
66 |
"Select pre-lemmatization processing steps (ordered)",
|
67 |
options=steps_options,
|
68 |
-
default=[
|
|
|
|
|
69 |
format_func=lambda x: x.replace("_", " ").title(),
|
70 |
help="Select the processing steps to apply before the text is lemmatized",
|
71 |
)
|
72 |
|
73 |
-
lammatization_options = list(
|
|
|
|
|
74 |
lemmatization_step = st.selectbox(
|
75 |
"Select lemmatization",
|
76 |
options=lammatization_options,
|
@@ -81,7 +85,10 @@ def form(df):
|
|
81 |
post_steps = st.multiselect(
|
82 |
"Select post-lemmatization processing steps (ordered)",
|
83 |
options=steps_options,
|
84 |
-
default=[
|
|
|
|
|
|
|
85 |
format_func=lambda x: x.replace("_", " ").title(),
|
86 |
help="Select the processing steps to apply after the text is lemmatized",
|
87 |
)
|
@@ -93,21 +100,31 @@ def form(df):
|
|
93 |
start_time = time.time()
|
94 |
|
95 |
# warnings about inputs
|
96 |
-
language_specific_warnings(
|
|
|
|
|
97 |
|
98 |
# preprocess
|
99 |
if not disable_preprocessing:
|
100 |
with st.spinner("Step 1/4: Preprocessing text"):
|
101 |
-
pipe = PreprocessingPipeline(
|
|
|
|
|
102 |
df = pipe.vaex_process(df, text_column)
|
103 |
else:
|
104 |
-
with st.spinner(
|
105 |
-
|
|
|
|
|
|
|
|
|
106 |
time.sleep(1.2)
|
107 |
|
108 |
# prepare input
|
109 |
with st.spinner("Step 2/4: Preparing inputs"):
|
110 |
-
input_dict = input_transform(
|
|
|
|
|
111 |
|
112 |
# wordify
|
113 |
with st.spinner("Step 3/4: Wordifying"):
|
@@ -217,7 +234,13 @@ def how_it_works():
|
|
217 |
"Wine light cherry",
|
218 |
"Chardonnay wine oak buttery",
|
219 |
],
|
220 |
-
"Label": [
|
|
|
|
|
|
|
|
|
|
|
|
|
221 |
}
|
222 |
)
|
223 |
|
@@ -268,7 +291,9 @@ def how_it_works():
|
|
268 |
vectors of coefficients reported in table 3 (indicators that are not present in a run are listed as 0 here):
|
269 |
"""
|
270 |
)
|
271 |
-
st.caption(
|
|
|
|
|
272 |
st.table(table3)
|
273 |
|
274 |
st.markdown(
|
@@ -278,7 +303,9 @@ def how_it_works():
|
|
278 |
that are positively and negatively correlated with the US wines.
|
279 |
"""
|
280 |
)
|
281 |
-
st.caption(
|
|
|
|
|
282 |
st.table(table4)
|
283 |
st.markdown(
|
284 |
"""
|
@@ -459,11 +486,15 @@ def analysis(outputs):
|
|
459 |
)
|
460 |
|
461 |
with st.expander("Vocabulary"):
|
462 |
-
st.markdown(
|
|
|
|
|
463 |
st.write(meta_data["vocabulary"])
|
464 |
|
465 |
with st.expander("Labels"):
|
466 |
-
st.markdown(
|
|
|
|
|
467 |
st.write(meta_data["labels"])
|
468 |
|
469 |
return subset_df
|
@@ -493,5 +524,6 @@ def language_specific_warnings(pre_steps, post_steps, lemmatization_step, langua
|
|
493 |
"Chinese",
|
494 |
):
|
495 |
st.info(
|
496 |
-
msg
|
|
|
497 |
)
|
|
|
65 |
pre_steps = st.multiselect(
|
66 |
"Select pre-lemmatization processing steps (ordered)",
|
67 |
options=steps_options,
|
68 |
+
default=[
|
69 |
+
steps_options[i] for i in PreprocessingConfigs.DEFAULT_PRE.value
|
70 |
+
],
|
71 |
format_func=lambda x: x.replace("_", " ").title(),
|
72 |
help="Select the processing steps to apply before the text is lemmatized",
|
73 |
)
|
74 |
|
75 |
+
lammatization_options = list(
|
76 |
+
PreprocessingPipeline.lemmatization_component().keys()
|
77 |
+
)
|
78 |
lemmatization_step = st.selectbox(
|
79 |
"Select lemmatization",
|
80 |
options=lammatization_options,
|
|
|
85 |
post_steps = st.multiselect(
|
86 |
"Select post-lemmatization processing steps (ordered)",
|
87 |
options=steps_options,
|
88 |
+
default=[
|
89 |
+
steps_options[i]
|
90 |
+
for i in PreprocessingConfigs.DEFAULT_POST.value
|
91 |
+
],
|
92 |
format_func=lambda x: x.replace("_", " ").title(),
|
93 |
help="Select the processing steps to apply after the text is lemmatized",
|
94 |
)
|
|
|
100 |
start_time = time.time()
|
101 |
|
102 |
# warnings about inputs
|
103 |
+
language_specific_warnings(
|
104 |
+
pre_steps, post_steps, lemmatization_step, language
|
105 |
+
)
|
106 |
|
107 |
# preprocess
|
108 |
if not disable_preprocessing:
|
109 |
with st.spinner("Step 1/4: Preprocessing text"):
|
110 |
+
pipe = PreprocessingPipeline(
|
111 |
+
language, pre_steps, lemmatization_step, post_steps
|
112 |
+
)
|
113 |
df = pipe.vaex_process(df, text_column)
|
114 |
else:
|
115 |
+
with st.spinner(
|
116 |
+
"Step 1/4: Preprocessing has been disabled - doing nothing"
|
117 |
+
):
|
118 |
+
df = df.rename(
|
119 |
+
columns={text_column: ColumnNames.PROCESSED_TEXT.value}
|
120 |
+
)
|
121 |
time.sleep(1.2)
|
122 |
|
123 |
# prepare input
|
124 |
with st.spinner("Step 2/4: Preparing inputs"):
|
125 |
+
input_dict = input_transform(
|
126 |
+
df[ColumnNames.PROCESSED_TEXT.value], df[label_column]
|
127 |
+
)
|
128 |
|
129 |
# wordify
|
130 |
with st.spinner("Step 3/4: Wordifying"):
|
|
|
234 |
"Wine light cherry",
|
235 |
"Chardonnay wine oak buttery",
|
236 |
],
|
237 |
+
"Label": [
|
238 |
+
"Italy",
|
239 |
+
"United States",
|
240 |
+
"United States",
|
241 |
+
"Italy",
|
242 |
+
"United States",
|
243 |
+
],
|
244 |
}
|
245 |
)
|
246 |
|
|
|
291 |
vectors of coefficients reported in table 3 (indicators that are not present in a run are listed as 0 here):
|
292 |
"""
|
293 |
)
|
294 |
+
st.caption(
|
295 |
+
"Table 3: Coefficients for frequency of indicators in each of the four runs for US wines."
|
296 |
+
)
|
297 |
st.table(table3)
|
298 |
|
299 |
st.markdown(
|
|
|
303 |
that are positively and negatively correlated with the US wines.
|
304 |
"""
|
305 |
)
|
306 |
+
st.caption(
|
307 |
+
"Table 4: Final set of indicators that are positively versus negatively correlated with US wines."
|
308 |
+
)
|
309 |
st.table(table4)
|
310 |
st.markdown(
|
311 |
"""
|
|
|
486 |
)
|
487 |
|
488 |
with st.expander("Vocabulary"):
|
489 |
+
st.markdown(
|
490 |
+
"The table below shows all candidate n-grams that Wordify considered"
|
491 |
+
)
|
492 |
st.write(meta_data["vocabulary"])
|
493 |
|
494 |
with st.expander("Labels"):
|
495 |
+
st.markdown(
|
496 |
+
"The table below summarizes the labels that your file contained"
|
497 |
+
)
|
498 |
st.write(meta_data["labels"])
|
499 |
|
500 |
return subset_df
|
|
|
524 |
"Chinese",
|
525 |
):
|
526 |
st.info(
|
527 |
+
msg
|
528 |
+
+ " However we will still remove stopwords since you selected `Spacy lemmatizer (remove stopwords)`."
|
529 |
)
|
src/preprocessing.py
CHANGED
@@ -1,5 +1,3 @@
|
|
1 |
-
import multiprocessing as mp
|
2 |
-
import os
|
3 |
import re
|
4 |
import string
|
5 |
from collections import OrderedDict
|
@@ -8,7 +6,6 @@ from typing import Callable, List, Optional, Union
|
|
8 |
import spacy
|
9 |
import vaex
|
10 |
from pandas.core.frame import DataFrame
|
11 |
-
from pandas.core.series import Series
|
12 |
from textacy.preprocessing import make_pipeline, normalize, remove, replace
|
13 |
|
14 |
from .configs import Languages
|
@@ -119,29 +116,6 @@ class PreprocessingPipeline:
|
|
119 |
|
120 |
return df
|
121 |
|
122 |
-
# def __call__(self, series: Series) -> Series:
|
123 |
-
# if self.pre:
|
124 |
-
# series = series.map(self.pre)
|
125 |
-
|
126 |
-
# if self.lemma:
|
127 |
-
# total_steps = len(series) // 100
|
128 |
-
# res = []
|
129 |
-
# pbar = st.progress(0)
|
130 |
-
# for i, doc in enumerate(
|
131 |
-
# self.nlp.pipe(series, batch_size=500, n_process=os.cpu_count())
|
132 |
-
# ):
|
133 |
-
# res.append(self.lemma(doc))
|
134 |
-
|
135 |
-
# if i % total_steps == 0:
|
136 |
-
# pbar.progress(1)
|
137 |
-
|
138 |
-
# series = pd.Series(res)
|
139 |
-
|
140 |
-
# if self.post:
|
141 |
-
# series = series.map(self.post)
|
142 |
-
|
143 |
-
# return series
|
144 |
-
|
145 |
@classmethod
|
146 |
def make_pipe_component(cls, steps: Optional[List[str]], language: str) -> Callable:
|
147 |
if not steps:
|
|
|
|
|
|
|
1 |
import re
|
2 |
import string
|
3 |
from collections import OrderedDict
|
|
|
6 |
import spacy
|
7 |
import vaex
|
8 |
from pandas.core.frame import DataFrame
|
|
|
9 |
from textacy.preprocessing import make_pipeline, normalize, remove, replace
|
10 |
|
11 |
from .configs import Languages
|
|
|
116 |
|
117 |
return df
|
118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
@classmethod
|
120 |
def make_pipe_component(cls, steps: Optional[List[str]], language: str) -> Callable:
|
121 |
if not steps:
|
src/utils.py
CHANGED
@@ -7,8 +7,6 @@ from PIL import Image
|
|
7 |
|
8 |
from .configs import ColumnNames, SupportedFiles
|
9 |
|
10 |
-
# import altair as alt
|
11 |
-
|
12 |
|
13 |
def get_col_indices(cols: List) -> Tuple[int, int]:
|
14 |
"""Ugly but works"""
|
@@ -52,81 +50,3 @@ def download_button(dataframe: DataFrame, name: str) -> None:
|
|
52 |
b64 = base64.b64encode(csv.encode()).decode()
|
53 |
href = f'<a href="data:file/csv;base64,{b64}" download="{name}.csv">Download</a>'
|
54 |
st.write(href, unsafe_allow_html=True)
|
55 |
-
|
56 |
-
|
57 |
-
# def plot_labels_prop(data: DataFrame, label_column: str):
|
58 |
-
|
59 |
-
# unique_value_limit = 100
|
60 |
-
|
61 |
-
# if data[label_column].nunique() > unique_value_limit:
|
62 |
-
|
63 |
-
# st.warning(
|
64 |
-
# f"""
|
65 |
-
# The column you selected has more than {unique_value_limit}.
|
66 |
-
# Are you sure it's the right column? If it is, please note that
|
67 |
-
# this will impact __Wordify__ performance.
|
68 |
-
# """
|
69 |
-
# )
|
70 |
-
|
71 |
-
# return
|
72 |
-
|
73 |
-
# source = (
|
74 |
-
# data[label_column]
|
75 |
-
# .value_counts()
|
76 |
-
# .reset_index()
|
77 |
-
# .rename(columns={"index": "Labels", label_column: "Counts"})
|
78 |
-
# )
|
79 |
-
# source["Props"] = source["Counts"] / source["Counts"].sum()
|
80 |
-
# source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"
|
81 |
-
|
82 |
-
# bars = (
|
83 |
-
# alt.Chart(source)
|
84 |
-
# .mark_bar()
|
85 |
-
# .encode(
|
86 |
-
# x=alt.X("Labels:O", sort="-y"),
|
87 |
-
# y="Counts:Q",
|
88 |
-
# )
|
89 |
-
# )
|
90 |
-
|
91 |
-
# text = bars.mark_text(align="center", baseline="middle", dy=15).encode(
|
92 |
-
# text="Proportions:O"
|
93 |
-
# )
|
94 |
-
|
95 |
-
# return (bars + text).properties(height=300)
|
96 |
-
|
97 |
-
|
98 |
-
# def plot_nchars(data: DataFrame, text_column: str):
|
99 |
-
# source = data[text_column].str.len().to_frame()
|
100 |
-
|
101 |
-
# plot = (
|
102 |
-
# alt.Chart(source)
|
103 |
-
# .mark_bar()
|
104 |
-
# .encode(
|
105 |
-
# alt.X(
|
106 |
-
# f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")
|
107 |
-
# ),
|
108 |
-
# alt.Y("count()", axis=alt.Axis(title="")),
|
109 |
-
# )
|
110 |
-
# )
|
111 |
-
|
112 |
-
# return plot.properties(height=300)
|
113 |
-
|
114 |
-
|
115 |
-
# def plot_score(data: DataFrame, label_col: str, label: str):
|
116 |
-
|
117 |
-
# source = (
|
118 |
-
# data.loc[data[label_col] == label]
|
119 |
-
# .sort_values("score", ascending=False)
|
120 |
-
# .head(100)
|
121 |
-
# )
|
122 |
-
|
123 |
-
# plot = (
|
124 |
-
# alt.Chart(source)
|
125 |
-
# .mark_bar()
|
126 |
-
# .encode(
|
127 |
-
# y=alt.Y("word:O", sort="-x"),
|
128 |
-
# x="score:Q",
|
129 |
-
# )
|
130 |
-
# )
|
131 |
-
|
132 |
-
# return plot.properties(height=max(30 * source.shape[0], 50))
|
|
|
7 |
|
8 |
from .configs import ColumnNames, SupportedFiles
|
9 |
|
|
|
|
|
10 |
|
11 |
def get_col_indices(cols: List) -> Tuple[int, int]:
|
12 |
"""Ugly but works"""
|
|
|
50 |
b64 = base64.b64encode(csv.encode()).decode()
|
51 |
href = f'<a href="data:file/csv;base64,{b64}" download="{name}.csv">Download</a>'
|
52 |
st.write(href, unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|