Pietro Lesci committed on
Commit
c718eb8
1 Parent(s): e8a4a19
Files changed (3) hide show
  1. src/configs.py +1 -1
  2. src/pages/home.py +7 -1
  3. src/preprocessing.py +1 -1
src/configs.py CHANGED
@@ -33,4 +33,4 @@ class Languages(Enum):
33
  class SupportedFiles(Enum):
34
  xlsx = (lambda x: pd.read_excel(x, dtype=str),)
35
  csv = (lambda x: pd.read_csv(x, dtype=str),)
36
- parquet = (lambda x: pd.read_parquet(x, dtype=str),)
 
33
  class SupportedFiles(Enum):
34
  xlsx = (lambda x: pd.read_excel(x, dtype=str),)
35
  csv = (lambda x: pd.read_csv(x, dtype=str),)
36
+ parquet = (lambda x: pd.read_parquet(x),)
src/pages/home.py CHANGED
@@ -108,7 +108,7 @@ def write(session, uploaded_file):
108
  pre_steps = pre_steps_elem.multiselect(
109
  "Select pre-lemmatization preprocessing steps (ordered)",
110
  options=steps_options,
111
- default=steps_options[1:],
112
  format_func=lambda x: x.replace("_", " ").title(),
113
  key=session.run_id,
114
  )
@@ -146,6 +146,8 @@ def write(session, uploaded_file):
146
  post_steps=post_steps,
147
  )
148
 
 
 
149
  # ==== 3. PROVIDE FEEDBACK ON OPTIONS ==== #
150
  if show_sample and not (label_column and text_column):
151
  st.warning("Please select `label` and `text` columns")
@@ -155,6 +157,8 @@ def write(session, uploaded_file):
155
  sample_data[f"preprocessed_{text_column}"] = preprocessing_pipeline(
156
  sample_data[text_column]
157
  ).values
 
 
158
  st.table(
159
  sample_data.loc[
160
  :, [label_column, text_column, f"preprocessed_{text_column}"]
@@ -174,6 +178,8 @@ def write(session, uploaded_file):
174
  data[text_column]
175
  ).values
176
 
 
 
177
  inputs = encode(data[f"preprocessed_{text_column}"], data[label_column])
178
  session.posdf, session.negdf = wordifier(**inputs)
179
  st.success("Wordified!")
 
108
  pre_steps = pre_steps_elem.multiselect(
109
  "Select pre-lemmatization preprocessing steps (ordered)",
110
  options=steps_options,
111
+ default=steps_options,
112
  format_func=lambda x: x.replace("_", " ").title(),
113
  key=session.run_id,
114
  )
 
146
  post_steps=post_steps,
147
  )
148
 
149
+ print(preprocessing_pipeline.pre_steps)
150
+
151
  # ==== 3. PROVIDE FEEDBACK ON OPTIONS ==== #
152
  if show_sample and not (label_column and text_column):
153
  st.warning("Please select `label` and `text` columns")
 
157
  sample_data[f"preprocessed_{text_column}"] = preprocessing_pipeline(
158
  sample_data[text_column]
159
  ).values
160
+
161
+ print(sample_data)
162
  st.table(
163
  sample_data.loc[
164
  :, [label_column, text_column, f"preprocessed_{text_column}"]
 
178
  data[text_column]
179
  ).values
180
 
181
+ print(data.head())
182
+
183
  inputs = encode(data[f"preprocessed_{text_column}"], data[label_column])
184
  session.posdf, session.negdf = wordifier(**inputs)
185
  st.success("Wordified!")
src/preprocessing.py CHANGED
@@ -115,7 +115,7 @@ class Lemmatizer:
115
  elif remove_stop and not lemmatization:
116
 
117
  def lemmatizer_fn(doc: spacy.tokens.doc.Doc) -> str:
118
- return " ".join([t for t in doc if not t.is_stop])
119
 
120
  elif lemmatization and not remove_stop:
121
 
 
115
  elif remove_stop and not lemmatization:
116
 
117
  def lemmatizer_fn(doc: spacy.tokens.doc.Doc) -> str:
118
+ return " ".join([t.text for t in doc if not t.is_stop])
119
 
120
  elif lemmatization and not remove_stop:
121