Spaces:
Build error
Build error
Pietro Lesci
committed on
Commit
•
4fe22cb
1
Parent(s):
e330a04
fix bug with no-ops
Browse files
- src/preprocessing.py +30 -20
src/preprocessing.py
CHANGED
@@ -75,6 +75,10 @@ def lemmatize_keep_stopwords(doc: spacy.tokens.doc.Doc) -> str:
|
|
75 |
return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-"])
|
76 |
|
77 |
|
|
|
|
|
|
|
|
|
78 |
# fmt: on
|
79 |
class PreprocessingPipeline:
|
80 |
def __init__(
|
@@ -90,8 +94,14 @@ class PreprocessingPipeline:
|
|
90 |
self.post_steps = post_steps
|
91 |
|
92 |
self.nlp = spacy.load(Languages[language].value, disable=["parser", "ner"])
|
93 |
-
self.pre =
|
94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
self.lemma = self.lemmatization_component()[self.lemmatization_step]
|
96 |
|
97 |
# def apply_multiproc(fn, series):
|
@@ -111,28 +121,28 @@ class PreprocessingPipeline:
|
|
111 |
|
112 |
return df
|
113 |
|
114 |
-
def __call__(self, series: Series) -> Series:
|
115 |
-
|
116 |
-
|
117 |
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
|
127 |
-
|
128 |
-
|
129 |
|
130 |
-
|
131 |
|
132 |
-
|
133 |
-
|
134 |
|
135 |
-
|
136 |
|
137 |
def make_pre_post_component(self, steps: Optional[List[str]]) -> Optional[Callable]:
|
138 |
if not steps:
|
@@ -179,7 +189,7 @@ class PreprocessingPipeline:
|
|
179 |
[
|
180 |
("Spacy lemmatizer (keep stopwords)", lemmatize_keep_stopwords),
|
181 |
("Spacy lemmatizer (no stopwords)", lemmatize_remove_stopwords),
|
182 |
-
("Disable lemmatizer",
|
183 |
("Remove stopwords", remove_stopwords),
|
184 |
]
|
185 |
)
|
|
|
75 |
return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-"])
|
76 |
|
77 |
|
78 |
+
def identity(t):
|
79 |
+
return t
|
80 |
+
|
81 |
+
|
82 |
# fmt: on
|
83 |
class PreprocessingPipeline:
|
84 |
def __init__(
|
|
|
94 |
self.post_steps = post_steps
|
95 |
|
96 |
self.nlp = spacy.load(Languages[language].value, disable=["parser", "ner"])
|
97 |
+
self.pre = (
|
98 |
+
self.make_pre_post_component(self.pre_steps) if self.pre_steps else identity
|
99 |
+
)
|
100 |
+
self.post = (
|
101 |
+
self.make_pre_post_component(self.post_steps)
|
102 |
+
if self.post_steps
|
103 |
+
else identity
|
104 |
+
)
|
105 |
self.lemma = self.lemmatization_component()[self.lemmatization_step]
|
106 |
|
107 |
# def apply_multiproc(fn, series):
|
|
|
121 |
|
122 |
return df
|
123 |
|
124 |
+
# def __call__(self, series: Series) -> Series:
|
125 |
+
# if self.pre:
|
126 |
+
# series = series.map(self.pre)
|
127 |
|
128 |
+
# if self.lemma:
|
129 |
+
# total_steps = len(series) // 100
|
130 |
+
# res = []
|
131 |
+
# pbar = st.progress(0)
|
132 |
+
# for i, doc in enumerate(
|
133 |
+
# self.nlp.pipe(series, batch_size=500, n_process=os.cpu_count())
|
134 |
+
# ):
|
135 |
+
# res.append(self.lemma(doc))
|
136 |
|
137 |
+
# if i % total_steps == 0:
|
138 |
+
# pbar.progress(1)
|
139 |
|
140 |
+
# series = pd.Series(res)
|
141 |
|
142 |
+
# if self.post:
|
143 |
+
# series = series.map(self.post)
|
144 |
|
145 |
+
# return series
|
146 |
|
147 |
def make_pre_post_component(self, steps: Optional[List[str]]) -> Optional[Callable]:
|
148 |
if not steps:
|
|
|
189 |
[
|
190 |
("Spacy lemmatizer (keep stopwords)", lemmatize_keep_stopwords),
|
191 |
("Spacy lemmatizer (no stopwords)", lemmatize_remove_stopwords),
|
192 |
+
("Disable lemmatizer", identity),
|
193 |
("Remove stopwords", remove_stopwords),
|
194 |
]
|
195 |
)
|