Commit b3ecaa7
Pietro Lesci committed
Parent(s): b482a79

add support for chinese
Files changed:
- .streamlit/config.toml +1 -1
- app.py +3 -2
- data/test_chinese.xlsx +0 -0
- requirements.txt +3 -0
- src/components.py +52 -5
- src/configs.py +4 -3
- src/preprocessing.py +54 -14
- src/utils.py +4 -3
- tests/notebook.ipynb +66 -191
.streamlit/config.toml CHANGED
@@ -1,7 +1,7 @@
 [server]
 # Max size, in megabytes, for files uploaded with the file_uploader.
 # Default: 200
-maxUploadSize =
+maxUploadSize = 10
 
 [browser]
 gatherUsageStats = false
app.py CHANGED
@@ -1,6 +1,7 @@
 import streamlit as st
 
-from src.components import faq, footer, form, presentation
+from src.components import analysis, docs, faq, footer, form, presentation
+from src.configs import SupportedFiles
 from src.utils import convert_df, get_logo, read_file
 
 # app configs
@@ -25,7 +26,7 @@ st.title("Wordify")
 # file uploader
 uploaded_fl = st.sidebar.file_uploader(
     label="Choose a file",
-    type=[
+    type=[i.name for i in SupportedFiles],
     accept_multiple_files=False,
     help="""
     Supported formats:
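Since SupportedFiles enumerates readers under members named after file extensions (see the src/configs.py diff below), `type=[i.name for i in SupportedFiles]` keeps the uploader's whitelist in sync with the available readers automatically. With the members this commit defines, it evaluates to:

    >>> [i.name for i in SupportedFiles]
    ['xlsx', 'tsv', 'csv', 'parquet']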
data/test_chinese.xlsx ADDED
Binary file (580 kB).
requirements.txt CHANGED
@@ -37,3 +37,6 @@ https://github.com/explosion/spacy-models/releases/download/ro_core_news_sm-3.2.
 https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.2.0/ru_core_news_sm-3.2.0.tar.gz#egg=ru_core_news_sm
 # multi-language
 https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.2.0/xx_ent_wiki_sm-3.2.0.tar.gz#egg=xx_ent_wiki_sm
+# chinese
+https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.2.0/zh_core_web_sm-3.2.0.tar.gz#egg=zh_core_web_sm
+
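The pinned wheel makes the Chinese model pip-installable like any other dependency. A minimal sanity check after installation (not part of the commit; zh_core_web_sm bundles a word segmenter, so Chinese text comes out as word tokens rather than single characters):

    import spacy

    # load the model the same way src/preprocessing.py does
    nlp = spacy.load("zh_core_web_sm", disable=["parser", "ner"])
    print([t.text for t in nlp("指纹识别不好,太慢")])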
src/components.py CHANGED
@@ -1,11 +1,12 @@
-import streamlit as st
 import time
+
 import pandas as pd
+import streamlit as st
 
-from src.configs import Languages, PreprocessingConfigs, SupportedFiles
+from src.configs import ColumnNames, Languages, PreprocessingConfigs, SupportedFiles
 from src.preprocessing import PreprocessingPipeline
-from src.wordifier import input_transform, output_transform, wordifier
 from src.utils import get_col_indices
+from src.wordifier import input_transform, output_transform, wordifier
 
 
 def docs():
@@ -78,7 +79,7 @@ def form(df):
         "Select lemmatization",
         options=lammatization_options,
         index=PreprocessingConfigs.DEFAULT_LEMMA.value,
-        help="Select lemmatization procedure",
+        help="Select lemmatization procedure. This is automatically disabled when the selected language is Chinese or MultiLanguage.",
     )
 
     post_steps = st.multiselect(
@@ -98,6 +99,11 @@ def form(df):
 
         start_time = time.time()
 
+        # warnings about inputs
+        language_specific_warnings(
+            pre_steps, post_steps, lemmatization_step, language
+        )
+
         # preprocess
         if not disable_preprocessing:
             with st.spinner("Step 1/4: Preprocessing text"):
@@ -109,7 +115,10 @@
             with st.spinner(
                 "Step 1/4: Preprocessing has been disabled - doing nothing"
             ):
-
+                df = df.rename(
+                    columns={text_column: ColumnNames.PROCESSED_TEXT.value}
+                )
+                time.sleep(1.2)
 
     # prepare input
     with st.spinner("Step 2/4: Preparing inputs"):
@@ -260,6 +269,15 @@ def presentation():
     you provide a file following this naming convention, Wordify will automatically select the
     correct columns. However, if you wish to use a different nomenclature, you will be asked to
     provide the column names in the interactive UI.
+
+    - Maintain a stable connection with the Wordify page until you download the data. If you refresh the page,
+    a new Wordify session is created and your progress is lost.
+
+    - Wordify performance depends on the length of the individual texts in your file. The longer the texts, the higher
+    the chance that Wordify considers many n-grams. More n-grams means more data to analyse in each run.
+    We tailored Wordify performance for files of approximately 5'000 lines or 50k n-grams. In such cases we expect a runtime
+    between 90 seconds and 10 minutes. If your file is big, try to apply a stricter preprocessing of the text in the `Advanced Options` section.
+    If this is not enough, please do feel free to reach out to us directly so we can help.
     """
     )
@@ -377,3 +395,32 @@ def analysis(outputs):
     st.write(meta_data["labels"])
 
     return subset_df
+
+
+# warning for Chinese and MultiLanguage
+def language_specific_warnings(pre_steps, post_steps, lemmatization_step, language):
+
+    if language in ("MultiLanguage", "Chinese") and (
+        "remove_non_words" in pre_steps or "remove_non_words" in post_steps
+    ):
+        msg = """
+        NOTE: for Chinese and MultiLanguage we automatically substitute `remove_non_words` with
+        `remove_numbers` and `remove_punctuation` to avoid wrong results.
+        """
+        st.info(msg)
+
+    msg = "NOTE: for Chinese and MultiLanguage we turn off lemmatization automatically."
+    if lemmatization_step == "Spacy lemmatizer (keep stopwords)" and language in (
+        "MultiLanguage",
+        "Chinese",
+    ):
+        st.info(msg)
+
+    elif lemmatization_step == "Spacy lemmatizer (remove stopwords)" and language in (
+        "MultiLanguage",
+        "Chinese",
+    ):
+        st.info(
+            msg
+            + " However we will still remove stopwords since you selected `Spacy lemmatizer (remove stopwords)`."
+        )
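language_specific_warnings only surfaces notices in the UI; the behaviour it describes is implemented in src/preprocessing.py below. A hypothetical call (inside a running Streamlit app) illustrating the case where both notices fire:

    language_specific_warnings(
        pre_steps=["remove_non_words"],
        post_steps=[],
        lemmatization_step="Spacy lemmatizer (keep stopwords)",
        language="Chinese",
    )
    # -> st.info about the remove_non_words substitution,
    #    then st.info that lemmatization is turned off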
src/configs.py CHANGED
@@ -25,7 +25,7 @@ class InputTransformConfigs(Enum):
 
 
 class PreprocessingConfigs(Enum):
-    DEFAULT_PRE = [1, 14, 2, 3, 4,
+    DEFAULT_PRE = [1, 14, 2, 3, 4, 5, 23, 22, 21, 24]
     DEFAULT_LEMMA = 1
     DEFAULT_POST = [0, 17, 15, 19, 23, 22, 21, 24]
 
@@ -39,7 +39,6 @@ class Languages(Enum):
     Dutch = "nl_core_news_sm"
     Portuguese = "pt_core_news_sm"
     French = "fr_core_news_sm"
-    # Chinese = "zh_core_news_sm"
     Danish = "da_core_news_sm"
     # Japanese = "ja_core_news_sm"
     Lithuanian = "lt_core_news_sm"
@@ -48,9 +47,11 @@ class Languages(Enum):
     Romanian = "ro_core_news_sm"
     Russian = "ru_core_news_sm"
     MultiLanguage = "xx_ent_wiki_sm"
+    Chinese = "zh_core_web_sm"
 
 
 class SupportedFiles(Enum):
     xlsx = (lambda x: pd.read_excel(x, dtype=str),)
-
+    tsv = (lambda x: pd.read_csv(x, dtype=str, sep="\t"),)
+    csv = (lambda x: pd.read_csv(x, dtype=str, sep=","),)
     parquet = (lambda x: pd.read_parquet(x),)
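Because each SupportedFiles member wraps its reader in a one-element tuple, dispatching on an uploaded file's extension is a plain enum lookup. A minimal sketch of the pattern (the project's actual helper is read_file in src/utils.py, whose body this diff does not show; the function name here is hypothetical):

    from src.configs import SupportedFiles

    def read_any(uploaded_file):  # hypothetical helper name
        ext = uploaded_file.name.split(".")[-1]   # e.g. "tsv"
        read_fn = SupportedFiles[ext].value[0]    # unpack the 1-tuple
        return read_fn(uploaded_file)             # -> pandas DataFrame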
src/preprocessing.py CHANGED
@@ -3,11 +3,9 @@ import os
 import re
 import string
 from collections import OrderedDict
-from typing import Callable, List, Optional
+from typing import Callable, List, Optional, Union
 
-import pandas as pd
 import spacy
-import streamlit as st
 import vaex
 from pandas.core.frame import DataFrame
 from pandas.core.series import Series
@@ -99,14 +97,10 @@ class PreprocessingPipeline:
         self.lemmatization_step = lemmatization_step
         self.post_steps = post_steps
 
-        self.
-
-
-        )
-        self.pre = self.make_pipe_component(self.pre_steps)
-        self.post = self.make_pipe_component(self.post_steps)
-        self.lemma = self.lemmatization_component().get(self.lemmatization_step)
+        self.pre = self.make_pipe_component(self.pre_steps, self.language)
+        self.post = self.make_pipe_component(self.post_steps, self.language)
+        self.nlp = self.make_nlp(self.lemmatization_step, self.language)
+        self.lemma = self.make_lemma(self.lemmatization_step, self.language)
 
     # def apply_multiproc(fn, series):
     #     with mp.Pool(mp.cpu_count()) as pool:
@@ -148,13 +142,59 @@ class PreprocessingPipeline:
 
     #     return series
 
-
+    @classmethod
+    def make_pipe_component(cls, steps: Optional[List[str]], language: str) -> Callable:
         if not steps:
             return identity
-
+
+        elif language in ("MultiLanguage", "Chinese") and "remove_non_words" in steps:
+            idx = steps.index("remove_non_words")
+            steps = (
+                steps[:idx]
+                + ["remove_numbers", "remove_punctuation"]
+                + steps[idx + 1 :]
+            )
+
+        components = [cls.pipeline_components()[step] for step in steps]
 
         return make_pipeline(*components)
 
+    @staticmethod
+    def make_nlp(
+        lemmatization_step: Optional[str], language: str
+    ) -> Union[spacy.language.Language, Callable]:
+        if (
+            lemmatization_step is None
+            or lemmatization_step == "Disable lemmatizer"
+            or (
+                lemmatization_step == "Spacy lemmatizer (keep stopwords)"
+                and language in ("MultiLanguage", "Chinese")
+            )
+        ):
+            return identity
+        return spacy.load(Languages[language].value, disable=["parser", "ner"])
+
+    @classmethod
+    def make_lemma(cls, lemmatization_step: Optional[str], language: str) -> Callable:
+
+        if (
+            lemmatization_step is None
+            or lemmatization_step == "Disable lemmatizer"
+            or (
+                lemmatization_step == "Spacy lemmatizer (keep stopwords)"
+                and language in ("MultiLanguage", "Chinese")
+            )
+        ):
+            return identity
+
+        elif (
+            lemmatization_step == "Spacy lemmatizer (remove stopwords)"
+            and language in ("MultiLanguage", "Chinese")
+        ):
+            return cls.lemmatization_component().get("Remove stopwords")
+
+        return cls.lemmatization_component().get(lemmatization_step)
+
     @staticmethod
     def pipeline_components() -> "OrderedDict[str, Callable]":
         """Returns available cleaning steps in order"""
@@ -193,7 +233,7 @@
         return OrderedDict(
             [
                 ("Spacy lemmatizer (keep stopwords)", lemmatize_keep_stopwords),
-                ("Spacy lemmatizer (
+                ("Spacy lemmatizer (remove stopwords)", lemmatize_remove_stopwords),
                 ("Disable lemmatizer", identity),
                 ("Remove stopwords", remove_stopwords),
             ]
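The key change for Chinese and MultiLanguage is the in-place rewrite of the step list in make_pipe_component: remove_non_words (which the notebook below shows also strips digits, and presumably any non-Latin characters, hence the "wrong results" the warning mentions) is swapped for remove_numbers plus remove_punctuation. The rewrite in isolation:

    steps = ["normalize_quotation_marks", "remove_non_words", "lowercase"]
    idx = steps.index("remove_non_words")
    steps = steps[:idx] + ["remove_numbers", "remove_punctuation"] + steps[idx + 1 :]
    # ['normalize_quotation_marks', 'remove_numbers', 'remove_punctuation', 'lowercase']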
src/utils.py CHANGED
@@ -1,12 +1,13 @@
 import base64
 from typing import List, Tuple
-
+
 import streamlit as st
+from pandas.core.frame import DataFrame
 from PIL import Image
 
-
+from .configs import ColumnNames, SupportedFiles
 
-
+# import altair as alt
 
 
 def get_col_indices(cols: List) -> Tuple[int, int]:
tests/notebook.ipynb CHANGED
@@ -21,7 +21,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df = pd.read_csv(\"../data/test_en.csv\")"
+    "# df = pd.read_csv(\"../data/test_en.csv\")\n",
+    "df = pd.read_excel(\"../data/test_chinese.xlsx\")"
    ]
   },
   {
@@ -36,10 +37,10 @@
     " \"normalize_bullet_points\",\n",
     " \"normalize_hyphenated_words\",\n",
     " \"normalize_quotation_marks\",\n",
-    " \"
+    " \"normalize_whitespaces\",\n",
     " \"normalize_repeating_words\",\n",
     " \"normalize_repeating_chars\",\n",
-    " \"
+    " \"normalize_useless_spaces\",\n",
     " # \"replace_currency_symbols\",\n",
     " # \"replace_emails\",\n",
     " # \"replace_emojis\",\n",
@@ -60,7 +61,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -74,8 +75,8 @@
     " # \"replace_emojis\",\n",
     " # \"replace_phone_numbers\",\n",
     " # \"replace_numbers\",\n",
-    " \"remove_html_tags\",\n",
-    " \"remove_accents\",\n",
+    " # \"remove_html_tags\",\n",
+    " # \"remove_accents\",\n",
     " # \"remove_brackets\",\n",
     " \"remove_non_words\",\n",
     " # \"remove_numbers\",\n",
@@ -89,13 +90,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
     "pipe = PreprocessingPipeline(\n",
-    "    language=\"
-    "    lemmatization_step=\"Spacy lemmatizer (
+    "    language=\"Chinese\",\n",
+    "    lemmatization_step=\"Spacy lemmatizer (keep stopwords)\", # \"Disable lemmatizer\",\n",
     "    pre_steps=pre_steps,\n",
     "    post_steps=post_steps,\n",
     ")"
@@ -103,218 +104,125 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "
+       "'全金属 指纹识别 垃圾 买手机 不行 指纹识别 不好 太慢 好多 失败 电池 哥哥 一部 华为 mate7 手机 旅游 丢掉 我哥 算是 二手 二手手机 用个 两天 毛 手机 只能 大半天 玩 手机游戏 最多 看个 新闻 微信 不行 急 手机 买手机 谈谈 通话 想 问 一句 手机 通话 保证 畅通 手机 意义 一部 MP4 区别 第一次 通话 五分钟 声音 说 女朋友 手机 朋友 父母 打电话 情况 毛呢 所有人 手机 利用 全金属 吸引 眼球 做工 体验 不好 电池 耐用 通话 易 无声 加油 拿出 诚意'"
       ]
      },
-     "execution_count":
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "
+    "df.text[0]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "'
+       "[same 全金属 ... 诚意 string as in the previous cell: pre leaves the already-segmented text unchanged]"
       ]
      },
-     "execution_count":
+     "execution_count": 14,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "pipe.
+    "pipe.pre(df.text[0])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "'
+       "[same 全金属 ... 诚意 string again: nlp and lemma are identity for Chinese with the keep-stopwords lemmatizer]"
       ]
      },
-     "execution_count":
+     "execution_count": 15,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "pipe.
+    "pipe.lemma(pipe.nlp(pipe.pre(df.text[0])))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 16,
    "metadata": {},
-   "outputs": [
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'全金属 指纹识别 垃圾 买手机 不行 指纹识别 不好 太慢 好多 失败 电池 哥哥 一部 华为 mate 手机 旅游 丢掉 我哥 算是 二手 二手手机 用个 两天 毛 手机 只能 大半天 玩 手机游戏 最多 看个 新闻 微信 不行 急 手机 买手机 谈谈 通话 想 问 一句 手机 通话 保证 畅通 手机 意义 一部 mp 区别 第一次 通话 五分钟 声音 说 女朋友 手机 朋友 父母 打电话 情况 毛呢 所有人 手机 利用 全金属 吸引 眼球 做工 体验 不好 电池 耐用 通话 易 无声 加油 拿出 诚意'"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "
+    "pipe.post(pipe.lemma(pipe.nlp(pipe.pre(df.text[0]))))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
     {
      "data": {
-      [removed: "text/html" and "text/plain" previews of the old 5000-row x 3-column English DataFrame (label, text, processed_text)]
       "text/plain": [
+       "Compose(<function strip at 0x7ff4894750e0>, <function normalize_useless_spaces at 0x7ff48946eef0>, <function normalize_repeating_chars at 0x7ff48946ef80>, <function normalize_repeating_words at 0x7ff4871a7170>, <function punctuation at 0x7ff48946e4d0>, <function remove_numbers at 0x7ff4894754d0>, <function lowercase at 0x7ff489475050>)"
       ]
      },
-     "execution_count":
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
+   "source": [
+    "pipe.post"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "odf = pipe.vaex_process(df, \"text\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "odf"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -325,40 +233,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[1, 14, 2, 3, 4, 21, 23, 22, 5, 24]"
-      ]
-     },
-     "execution_count": 16,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "default_pre_steps_idx"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[0, 17, 15, 19, 23, 22, 21, 24]"
-      ]
-     },
-     "execution_count": 17,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "default_post_steps_idx"
    ]
@@ -383,7 +269,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -392,7 +278,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -401,20 +287,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'Mimmo '"
-      ]
-     },
-     "execution_count": 28,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "_re_non_words.sub(\" \", \"Mimmo23\")"
    ]