ppsingh committed on
Commit
e5e876b
1 Parent(s): c91091d

Delete auditqa/doc_process.py

Browse files
Files changed (1) hide show
  1. auditqa/doc_process.py +0 -57
auditqa/doc_process.py DELETED
@@ -1,57 +0,0 @@
1
- import glob
2
- import os
3
- #from langchain_text_splitters import MarkdownHeaderTextSplitter
4
- #from langchain_community.document_loaders import UnstructuredMarkdownLoader
5
- from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
6
- from transformers import AutoTokenizer
7
- from langchain_community.document_loaders import PyMuPDFLoader
8
- path_to_data = "./data/"
9
-
10
- def process_markdown():
11
- headers_to_split_on = [
12
- ("#", "Header 1"),
13
- ("##", "Header 2"),
14
- ("###", "Header 3"),
15
- ("####", "Header 4"),
16
- ("#####", "Header 5")
17
- ]
18
- markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
19
-
20
- files = glob.glob(path_to_data+"*.md")
21
- print(files)
22
- docs = []
23
- for file in files:
24
- try:
25
- with open(file) as f:
26
- docs.append(f.read())
27
- except Exception as e:
28
- print("Exception: ", e)
29
- docs_processed = [markdown_splitter.split_text(doc) for doc in docs]
30
- print(len(docs_processed))
31
- print(docs_processed[0])
32
-
33
- def process_pdf():
34
- files = glob.glob(path_to_data+"*.pdf")
35
- docs = []
36
- for file in files:
37
- try:
38
- docs.append(PyMuPDFLoader(file).load())
39
- except Exception as e:
40
- print("Exception: ", e)
41
-
42
-
43
- chunk_size = 256
44
- text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
45
- AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),
46
- chunk_size=chunk_size,
47
- chunk_overlap=int(chunk_size / 10),
48
- add_start_index=True,
49
- strip_whitespace=True,
50
- separators=["\n\n", "\n", ".", " ", ""],
51
- )
52
- docs_processed = [text_splitter.split_documents(doc) for doc in docs]
53
- docs_processed = [item for sublist in docs_processed for item in sublist]
54
- print("length of text chunks:",len(docs_processed))
55
- return docs_processed
56
-
57
-