ppsingh committed on
Commit bbfda0b · 1 Parent(s): 05828e0

adding preprocessing

utils/__pycache__/vulnerability_classifier.cpython-310.pyc CHANGED
Binary files a/utils/__pycache__/vulnerability_classifier.cpython-310.pyc and b/utils/__pycache__/vulnerability_classifier.cpython-310.pyc differ
 
utils/preprocessing.py ADDED
@@ -0,0 +1,307 @@
+ from haystack.nodes.base import BaseComponent
+ from haystack.schema import Document
+ from haystack.nodes import ImageToTextConverter, PDFToTextConverter
+ from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
+ from pdf2image import convert_from_path
+ from typing import Callable, Dict, List, Optional, Text, Tuple, Union
+ from typing_extensions import Literal
+ import pandas as pd
+ import logging
+ import re
+ import string
+ from haystack.pipelines import Pipeline
+ import streamlit as st
+
+ @st.cache_data
+ def useOCR(file_path: str) -> Text:
+     """
+     Converts image PDFs into text using farm-haystack[ocr].
+
+     Params
+     ----------
+     file_path: file path of the uploaded file, returned by the add_upload
+     function in uploadAndExample.py
+
+     Returns the text of the file as a string.
+     """
+     # the pdf file first needs to be converted into image files,
+     # one image per page
+     images = convert_from_path(pdf_path=file_path)
+     list_ = []
+     # save the image files in the cache, then read them one by one
+     # and pass each to the OCR converter
+     for i, pdf in enumerate(images):
+         # save each page of the pdf as an image
+         pdf.save(f'PDF/image_converted_{i+1}.png', 'PNG')
+         list_.append(f'PDF/image_converted_{i+1}.png')
+
+     converter = ImageToTextConverter(remove_numeric_tables=True,
+                                      valid_languages=["eng"])
+     # placeholder to collect the text from each page
+     placeholder = []
+     for file in list_:
+         document = converter.convert(
+             file_path=file, meta=None,
+         )[0]
+
+         text = document.content
+         placeholder.append(text)
+     # join the text from each page with the page separator
+     text = '\x0c'.join(placeholder)
+     return text
+
+
+ class FileConverter(BaseComponent):
+     """
+     Wrapper class to convert an uploaded document into text by calling the
+     appropriate converter class; internally falls back to OCR in case of an
+     image-only pdf. We cannot use the FileClassifier from haystack as it
+     doesn't have any label/output class for images.
+     1. https://haystack.deepset.ai/pipeline_nodes/custom-nodes
+     2. https://docs.haystack.deepset.ai/docs/file_converters
+     3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/file_converter
+     4. https://docs.haystack.deepset.ai/reference/file-converters-api
+     """
+
+     outgoing_edges = 1
+
+     def run(self, file_name: str, file_path: str, encoding: Optional[str] = None,
+             id_hash_keys: Optional[List[str]] = None,
+             ) -> Tuple[dict, str]:
+         """ This method is required to invoke the component in
+         the pipeline implementation.
+
+         Params
+         ----------
+         file_name: name of the file
+         file_path: file path of the uploaded file, returned by the add_upload
+         function in uploadAndExample.py
+
+         See the links provided in the class docstring for the other params.
+
+         Return
+         ---------
+         output: dictionary, with the key as identifier and the value being
+         whatever we need to return. In this case it is the list of Haystack
+         Documents.
+
+         output_1: as there is only one outgoing edge, we pass the string 'output_1'
+         """
+         try:
+             if file_name.endswith('.pdf'):
+                 converter = PDFToTextConverter(remove_numeric_tables=True)
+             elif file_name.endswith('.txt'):
+                 converter = TextConverter(remove_numeric_tables=True)
+             elif file_name.endswith('.docx'):
+                 converter = DocxToTextConverter()
+             else:
+                 raise ValueError(f'unsupported file type: {file_name}')
+         except Exception as e:
+             logging.error(e)
+             return
+
+         documents = []
+
+         document = converter.convert(
+             file_path=file_path, meta=None,
+             encoding=encoding, id_hash_keys=id_hash_keys
+         )[0]
+
+         text = document.content
+
+         # in case of a scanned/image-only PDF the content might contain only
+         # the page separator (\f or \x0c). We check whether that is the case
+         # and use OCR to get the text.
+         filtered = re.sub(r'\x0c', '', text)
+
+         if filtered == "":
+             logging.info("Using OCR")
+             text = useOCR(file_path)
+
+         documents.append(Document(content=text,
+                                   meta={"name": file_name},
+                                   id_hash_keys=id_hash_keys))
+
+         logging.info('file conversion successful')
+         output = {'documents': documents}
+         return output, 'output_1'
+
+     def run_batch(self):
+         """
+         We don't have a requirement to process multiple files in one go,
+         therefore nothing happens here; however, to use the custom node we
+         need to have this method in the class.
+         """
+         return
+
+
+ def basic(s: str, remove_punc: bool = False):
+     """
+     Performs basic cleaning of text.
+
+     Params
+     ----------
+     s: string to be processed
+     remove_punc: whether to remove all punctuation, including ',' and '.'
+
+     Returns: processed string; see comments in the source code for more info
+     """
+     # Remove URLs
+     s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
+     s = re.sub(r"http\S+", " ", s)
+
+     # Remove new line characters
+     s = re.sub(r'\n', ' ', s)
+
+     # Remove punctuation
+     if remove_punc:
+         translator = str.maketrans(' ', ' ', string.punctuation)
+         s = s.translate(translator)
+     # Remove distracting single quotes and dotted patterns
+     s = re.sub(r"\'", " ", s)
+     s = s.replace("..", "")
+
+     return s.strip()
+
+
+ def paraLengthCheck(paraList, max_len=100):
+     """
+     There are cases where the preprocessor cannot respect the word limit when
+     using the respect-sentence-boundary flag, due to missing sentence
+     boundaries. Therefore we run one more round of splitting here for those
+     paragraphs.
+
+     Params
+     ---------------
+     paraList : list of paragraphs/text
+     max_len : max length to be respected by sentences which bypassed the
+     preprocessor strategy
+     """
+     new_para_list = []
+     for passage in paraList:
+         # check if the para exceeds the word limit
+         if len(passage.content.split()) > max_len:
+             # we might need a few iterations: for example, if para = 512 words
+             # we need 5 slices to reduce the para to the size limit of 100
+             iterations = int(len(passage.content.split())/max_len)
+             for i in range(iterations):
+                 temp = " ".join(passage.content.split()[max_len*i:max_len*(i+1)])
+                 new_para_list.append((temp, passage.meta['page']))
+             # append whatever words are left over after the last full slice
+             temp = " ".join(passage.content.split()[max_len*iterations:])
+             if temp:
+                 new_para_list.append((temp, passage.meta['page']))
+         else:
+             # paragraphs which don't need any splitting
+             new_para_list.append((passage.content, passage.meta['page']))
+
+     logging.info("New paragraphs length {}".format(len(new_para_list)))
+     return new_para_list
+
+ class UdfPreProcessor(BaseComponent):
+     """
+     Class to preprocess the document returned by FileConverter. It will check
+     the splitting strategy, split the document by word or sentence, and then
+     synthetically create the paragraphs.
+     1. https://docs.haystack.deepset.ai/docs/preprocessor
+     2. https://docs.haystack.deepset.ai/reference/preprocessor-api
+     3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/preprocessor
+     """
+     outgoing_edges = 1
+
+     def run(self, documents: List[Document], remove_punc: bool = False,
+             apply_clean: bool = True,
+             split_by: Literal["sentence", "word"] = 'sentence',
+             split_length: int = 2, split_respect_sentence_boundary: bool = False,
+             split_overlap: int = 0):
+
+         """ This method is required to invoke the component in
+         the pipeline implementation.
+
+         Params
+         ----------
+         documents: documents from the output dictionary returned by FileConverter
+         remove_punc: whether to remove all punctuation, including ',' and '.'
+         apply_clean: whether to apply the basic cleaning from basic() to each
+         paragraph after splitting.
+         split_by: document splitting strategy, either 'word' or 'sentence'
+         split_length: when synthetically creating the paragraphs from the
+         document, it defines the length of a paragraph.
+         split_respect_sentence_boundary: used with the 'word' strategy for
+         splitting the text.
+         split_overlap: number of words or sentences that overlap when creating
+         the paragraphs. This is done because one sentence or 'some words' make
+         sense when read together with their neighbours. Therefore the overlap
+         is used.
+
+         Return
+         ---------
+         output: dictionary, with the key as identifier and the value being
+         whatever we need to return. In this case the output will contain 4
+         objects: the paragraph texts as a list, the Haystack documents, a
+         dataframe and one raw text string.
+
+         output_1: as there is only one outgoing edge, we pass the string 'output_1'
+         """
+
+         # sentence boundaries are only relevant for the 'word' strategy
+         if split_by == 'sentence':
+             split_respect_sentence_boundary = False
+
+         preprocessor = PreProcessor(
+             clean_empty_lines=True,
+             clean_whitespace=True,
+             clean_header_footer=True,
+             split_by=split_by,
+             split_length=split_length,
+             split_respect_sentence_boundary=split_respect_sentence_boundary,
+             split_overlap=split_overlap,
+
+             # will add the page number only in case of PDF, not for text/docx files.
+             add_page_number=True
+         )
+
+         for i in documents:
+             docs_processed = preprocessor.process([i])
+             # basic cleaning of each paragraph after the preprocessor has run
+             if apply_clean:
+                 for item in docs_processed:
+                     item.content = basic(item.content, remove_punc=remove_punc)
+
+         df = pd.DataFrame(docs_processed)
+         all_text = " ".join(df.content.to_list())
+         para_list = df.content.to_list()
+         logging.info('document split into {} paragraphs'.format(len(para_list)))
+         output = {'documents': docs_processed,
+                   'dataframe': df,
+                   'text': all_text,
+                   'paraList': para_list
+                   }
+         return output, "output_1"
+
+     def run_batch(self):
+         """
+         We don't have a requirement to process multiple files in one go,
+         therefore nothing happens here; however, to use the custom node we
+         need to have this method in the class.
+         """
+         return
+
+
+ def processingpipeline():
+     """
+     Returns the preprocessing pipeline, using FileConverter and UdfPreProcessor
+     from utils.preprocessing.
+     """
+     preprocessing_pipeline = Pipeline()
+     file_converter = FileConverter()
+     custom_preprocessor = UdfPreProcessor()
+
+     preprocessing_pipeline.add_node(component=file_converter,
+                                     name="FileConverter", inputs=["File"])
+     preprocessing_pipeline.add_node(component=custom_preprocessor,
+                                     name="UdfPreProcessor", inputs=["FileConverter"])
+
+     return preprocessing_pipeline
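
A minimal usage sketch for the module added above, assuming haystack 1.x semantics where per-node parameters are passed via a params dict keyed by node name; sample.pdf and the chosen split settings are placeholders, not values from this commit:

    from utils.preprocessing import processingpipeline

    pipeline = processingpipeline()
    result = pipeline.run(
        # node names match those registered in processingpipeline()
        params={"FileConverter": {"file_path": "sample.pdf",   # placeholder path
                                  "file_name": "sample.pdf"},
                "UdfPreProcessor": {"remove_punc": False,
                                    "split_by": "word",
                                    "split_length": 100,
                                    "split_respect_sentence_boundary": True}},
    )
    paragraphs = result["paraList"]   # list of paragraph strings
    df = result["dataframe"]          # same paragraphs as a pandas DataFrame

The returned dictionary comes straight from UdfPreProcessor.run, so the 'documents', 'dataframe', 'text' and 'paraList' keys are available to the caller.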