Pietro Lesci committed on
Commit
4fe22cb
1 Parent(s): e330a04

fix bug with no-ops

Browse files
Files changed (1) hide show
  1. src/preprocessing.py +30 -20
src/preprocessing.py CHANGED
@@ -75,6 +75,10 @@ def lemmatize_keep_stopwords(doc: spacy.tokens.doc.Doc) -> str:
75
  return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-"])
76
 
77
 
 
 
 
 
78
  # fmt: on
79
  class PreprocessingPipeline:
80
  def __init__(
@@ -90,8 +94,14 @@ class PreprocessingPipeline:
90
  self.post_steps = post_steps
91
 
92
  self.nlp = spacy.load(Languages[language].value, disable=["parser", "ner"])
93
- self.pre = self.make_pre_post_component(self.pre_steps)
94
- self.post = self.make_pre_post_component(self.post_steps)
 
 
 
 
 
 
95
  self.lemma = self.lemmatization_component()[self.lemmatization_step]
96
 
97
  # def apply_multiproc(fn, series):
@@ -111,28 +121,28 @@ class PreprocessingPipeline:
111
 
112
  return df
113
 
114
- def __call__(self, series: Series) -> Series:
115
- if self.pre:
116
- series = series.map(self.pre)
117
 
118
- if self.lemma:
119
- total_steps = len(series) // 100
120
- res = []
121
- pbar = st.progress(0)
122
- for i, doc in enumerate(
123
- self.nlp.pipe(series, batch_size=500, n_process=os.cpu_count())
124
- ):
125
- res.append(self.lemma(doc))
126
 
127
- if i % total_steps == 0:
128
- pbar.progress(1)
129
 
130
- series = pd.Series(res)
131
 
132
- if self.post:
133
- series = series.map(self.post)
134
 
135
- return series
136
 
137
  def make_pre_post_component(self, steps: Optional[List[str]]) -> Optional[Callable]:
138
  if not steps:
@@ -179,7 +189,7 @@ class PreprocessingPipeline:
179
  [
180
  ("Spacy lemmatizer (keep stopwords)", lemmatize_keep_stopwords),
181
  ("Spacy lemmatizer (no stopwords)", lemmatize_remove_stopwords),
182
- ("Disable lemmatizer", None),
183
  ("Remove stopwords", remove_stopwords),
184
  ]
185
  )
 
75
  return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-"])
76
 
77
 
78
+ def identity(t):
79
+ return t
80
+
81
+
82
  # fmt: on
83
  class PreprocessingPipeline:
84
  def __init__(
 
94
  self.post_steps = post_steps
95
 
96
  self.nlp = spacy.load(Languages[language].value, disable=["parser", "ner"])
97
+ self.pre = (
98
+ self.make_pre_post_component(self.pre_steps) if self.pre_steps else identity
99
+ )
100
+ self.post = (
101
+ self.make_pre_post_component(self.post_steps)
102
+ if self.post_steps
103
+ else identity
104
+ )
105
  self.lemma = self.lemmatization_component()[self.lemmatization_step]
106
 
107
  # def apply_multiproc(fn, series):
 
121
 
122
  return df
123
 
124
+ # def __call__(self, series: Series) -> Series:
125
+ # if self.pre:
126
+ # series = series.map(self.pre)
127
 
128
+ # if self.lemma:
129
+ # total_steps = len(series) // 100
130
+ # res = []
131
+ # pbar = st.progress(0)
132
+ # for i, doc in enumerate(
133
+ # self.nlp.pipe(series, batch_size=500, n_process=os.cpu_count())
134
+ # ):
135
+ # res.append(self.lemma(doc))
136
 
137
+ # if i % total_steps == 0:
138
+ # pbar.progress(1)
139
 
140
+ # series = pd.Series(res)
141
 
142
+ # if self.post:
143
+ # series = series.map(self.post)
144
 
145
+ # return series
146
 
147
  def make_pre_post_component(self, steps: Optional[List[str]]) -> Optional[Callable]:
148
  if not steps:
 
189
  [
190
  ("Spacy lemmatizer (keep stopwords)", lemmatize_keep_stopwords),
191
  ("Spacy lemmatizer (no stopwords)", lemmatize_remove_stopwords),
192
+ ("Disable lemmatizer", identity),
193
  ("Remove stopwords", remove_stopwords),
194
  ]
195
  )