Spaces: Running on CPU Upgrade
Delete auditqa/doc_process.py
auditqa/doc_process.py +0 -57
auditqa/doc_process.py
DELETED
@@ -1,57 +0,0 @@
-import glob
-import os
-#from langchain_text_splitters import MarkdownHeaderTextSplitter
-#from langchain_community.document_loaders import UnstructuredMarkdownLoader
-from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
-from transformers import AutoTokenizer
-from langchain_community.document_loaders import PyMuPDFLoader
-path_to_data = "./data/"
-
-def process_markdown():
-    headers_to_split_on = [
-        ("#", "Header 1"),
-        ("##", "Header 2"),
-        ("###", "Header 3"),
-        ("####", "Header 4"),
-        ("#####", "Header 5")
-    ]
-    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
-
-    files = glob.glob(path_to_data+"*.md")
-    print(files)
-    docs = []
-    for file in files:
-        try:
-            with open(file) as f:
-                docs.append(f.read())
-        except Exception as e:
-            print("Exception: ", e)
-    docs_processed = [markdown_splitter.split_text(doc) for doc in docs]
-    print(len(docs_processed))
-    print(docs_processed[0])
-
-def process_pdf():
-    files = glob.glob(path_to_data+"*.pdf")
-    docs = []
-    for file in files:
-        try:
-            docs.append(PyMuPDFLoader(file).load())
-        except Exception as e:
-            print("Exception: ", e)
-
-
-    chunk_size = 256
-    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
-        AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),
-        chunk_size=chunk_size,
-        chunk_overlap=int(chunk_size / 10),
-        add_start_index=True,
-        strip_whitespace=True,
-        separators=["\n\n", "\n", ".", " ", ""],
-    )
-    docs_processed = [text_splitter.split_documents(doc) for doc in docs]
-    docs_processed = [item for sublist in docs_processed for item in sublist]
-    print("length of text chunks:",len(docs_processed))
-    return docs_processed
-
-
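For reference, the deleted process_markdown() called MarkdownHeaderTextSplitter even though the matching import was left commented out, so invoking it would raise a NameError. Below is a minimal sketch (not part of this commit) of that header-based splitting step, assuming the langchain_text_splitters package referenced in the commented import is installed and that markdown files live under the same ./data/ directory:

# Illustrative sketch only; restores the import the deleted code relied on.
import glob

from langchain_text_splitters import MarkdownHeaderTextSplitter

path_to_data = "./data/"

headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

for file in glob.glob(path_to_data + "*.md"):
    with open(file) as f:
        # split_text returns one Document per header-delimited section
        sections = markdown_splitter.split_text(f.read())
    print(file, len(sections))

The PDF path in process_pdf() did not have this problem: PyMuPDFLoader and RecursiveCharacterTextSplitter were both imported, and chunking was token-aware via the BAAI/bge-small-en-v1.5 tokenizer with a 256-token chunk size and 10% overlap.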