import os

from utils.preprocessing import processingpipeline


def get_paragraphs(file_path_input):
    # Declare params
    SPLIT_BY = 'word'
    # usually models have a max length of 384/512
    SPLIT_LENGTH = 100
    # too much overlap can lead to repetitive text,
    # but as a rule of thumb we keep ~20% of SPLIT_LENGTH
    SPLIT_OVERLAP = 10
    # the text is cleaned to remove HTML and other unwanted artifacts;
    # set to True if you also need to strip punctuation such as ,.; etc.
    # (useful for non-Transformer-based models)
    REMOVE_PUNC = False
    # this param is used only for split_by = 'word'
    RESPECT_SENTENCE_BOUNDARY = True

    # derive the converter inputs from the function argument
    # (the original snippet referenced undefined file_path/file_name variables)
    file_path = file_path_input
    file_name = os.path.basename(file_path_input)

    # initialize the preprocessing pipeline and pass params to the Preprocessor,
    # either inline or via the variables declared above
    prep_pipeline = processingpipeline()
    output_pre = prep_pipeline.run(
        file_paths=file_path_input,
        params={"FileConverter": {"file_path": file_path,
                                  "file_name": file_name},
                "UdfPreProcessor": {"remove_punc": REMOVE_PUNC,
                                    "split_by": SPLIT_BY,
                                    "split_length": SPLIT_LENGTH,
                                    "split_overlap": SPLIT_OVERLAP,
                                    "split_respect_sentence_boundary": RESPECT_SENTENCE_BOUNDARY}})

    # output_pre is a dict; the paragraph list is stored under 'paraList'
    par_list = output_pre['paraList']
    # print(par_list)
    return par_list
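

# --- usage sketch (illustrative, not part of the original file) ---
# A minimal example of calling get_paragraphs, assuming the pipeline accepts a
# single file path as used above; 'sample_report.pdf' is a placeholder, not a
# file shipped with the repo.
if __name__ == "__main__":
    paragraphs = get_paragraphs("sample_report.pdf")
    print(f"Extracted {len(paragraphs)} paragraphs")
    for par in paragraphs[:3]:
        # preview the first few paragraphs, truncated for readability
        print(par[:200])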