Spaces:
Runtime error
Runtime error
Update scripts/process.py
Browse files
- scripts/process.py +10 -19
scripts/process.py
CHANGED
@@ -58,39 +58,29 @@ def load_document(
|
|
58 |
id_hash_keys=id_hash_keys))
|
59 |
|
60 |
return documents
|
61 |
-
|
62 |
-
def preprocessing(document
|
63 |
-
split_by: Literal["sentence", "word"] = 'sentence',
|
64 |
-
split_length:int = 3):
|
65 |
-
|
66 |
"""
|
67 |
-
takes in haystack document object and splits it into
|
68 |
Returns cleaned list of haystack document objects. One paragraph per object. Also returns pandas df and
|
69 |
list that contains all text joined together.
|
70 |
"""
|
71 |
-
|
72 |
-
split_respect_sentence_boundary = False
|
73 |
-
split_overlap=0
|
74 |
-
else:
|
75 |
-
split_respect_sentence_boundary = True
|
76 |
-
split_overlap= 20
|
77 |
-
|
78 |
preprocessor = PreProcessor(
|
79 |
clean_empty_lines=True,
|
80 |
clean_whitespace=True,
|
81 |
clean_header_footer=True,
|
82 |
-
split_by=
|
83 |
-
split_length=
|
84 |
-
split_respect_sentence_boundary=
|
85 |
-
split_overlap=
|
86 |
)
|
87 |
for i in document:
|
88 |
docs_processed = preprocessor.process([i])
|
89 |
for item in docs_processed:
|
90 |
item.content = basic(item.content)
|
91 |
|
92 |
-
|
93 |
-
# logger.info("document has been splitted to {}".format(len(docs_processed)))
|
94 |
|
95 |
# create dataframe of text and list of all text
|
96 |
#df = pd.DataFrame(docs_processed)
|
@@ -98,5 +88,6 @@ def load_document(
|
|
98 |
#par_list = df.content.to_list()
|
99 |
|
100 |
return docs_processed #, df, all_text, par_list
|
|
|
101 |
|
102 |
|
|
|
58 |
id_hash_keys=id_hash_keys))
|
59 |
|
60 |
return documents
|
61 |
+
|
62 |
+
def preprocessing(document):
|
|
|
|
|
|
|
63 |
"""
|
64 |
+
Takes in a haystack document object, splits it into paragraphs, and applies simple cleaning.
|
65 |
Returns cleaned list of haystack document objects. One paragraph per object. Also returns pandas df and
|
66 |
list that contains all text joined together.
|
67 |
"""
|
68 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
preprocessor = PreProcessor(
|
70 |
clean_empty_lines=True,
|
71 |
clean_whitespace=True,
|
72 |
clean_header_footer=True,
|
73 |
+
split_by="sentence",
|
74 |
+
split_length=3,
|
75 |
+
split_respect_sentence_boundary=False,
|
76 |
+
split_overlap=1
|
77 |
)
|
78 |
for i in document:
|
79 |
docs_processed = preprocessor.process([i])
|
80 |
for item in docs_processed:
|
81 |
item.content = basic(item.content)
|
82 |
|
83 |
+
st.write("your document has been splitted to", len(docs_processed), "paragraphs")
|
|
|
84 |
|
85 |
# create dataframe of text and list of all text
|
86 |
#df = pd.DataFrame(docs_processed)
|
|
|
88 |
#par_list = df.content.to_list()
|
89 |
|
90 |
return docs_processed #, df, all_text, par_list
|
91 |
+
|
92 |
|
93 |
|