Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Update auditqa/doc_process.py
Browse files
- auditqa/doc_process.py +2 -2
auditqa/doc_process.py
CHANGED
@@ -27,11 +27,11 @@ def process_pdf():
|
|
27 |
# text splitter based on the tokenizer of a model of your choosing
|
28 |
# to make texts fit exactly a transformer's context window size
|
29 |
# langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
|
30 |
-
chunk_size =
|
31 |
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
|
32 |
AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),
|
33 |
chunk_size=chunk_size,
|
34 |
-
chunk_overlap=
|
35 |
add_start_index=True,
|
36 |
strip_whitespace=True,
|
37 |
separators=["\n\n", "\n"],
|
|
|
27 |
# text splitter based on the tokenizer of a model of your choosing
|
28 |
# to make texts fit exactly a transformer's context window size
|
29 |
# langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
|
30 |
+
chunk_size = 512
|
31 |
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
|
32 |
AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),
|
33 |
chunk_size=chunk_size,
|
34 |
+
chunk_overlap=10,
|
35 |
add_start_index=True,
|
36 |
strip_whitespace=True,
|
37 |
separators=["\n\n", "\n"],
|