zliang committed on
Commit cc38132
1 Parent(s): beca6a7

Update app.py

Files changed (1)
app.py +5 -3
app.py CHANGED
@@ -17,6 +17,7 @@ from langchain_core.output_parsers import StrOutputParser
 from langchain_community.document_loaders import PyMuPDFLoader
 from langchain_openai import OpenAIEmbeddings
 from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_text_splitters import SpacyTextSplitter
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_openai import ChatOpenAI
 import re
@@ -73,8 +74,8 @@ def summarize_pdf(pdf_file_path, num_clusters=10):
     docs = loader.load()
     full_text = "\n".join(doc.page_content for doc in docs)
     cleaned_full_text = clean_text(remove_references(full_text))
-
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0, separators=["\n\n", "\n", ".", " "])
+    text_splitter = SpacyTextSplitter(chunk_size=500)
+    #text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0, separators=["\n\n", "\n", ".", " "])
     split_contents = text_splitter.split_text(cleaned_full_text)
     embeddings = embeddings_model.embed_documents(split_contents)
 
@@ -103,8 +104,9 @@ def qa_pdf(pdf_file_path, query, num_clusters=5, similarity_threshold=0.6):
     docs = loader.load()
     full_text = "\n".join(doc.page_content for doc in docs)
     cleaned_full_text = clean_text(remove_references(full_text))
+    text_splitter = SpacyTextSplitter(chunk_size=500)
 
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=0, separators=["\n\n", "\n", ".", " "])
+    #text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=0, separators=["\n\n", "\n", ".", " "])
     split_contents = text_splitter.split_text(cleaned_full_text)
     embeddings = embeddings_model.embed_documents(split_contents)
 
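For context, below is a minimal, self-contained sketch of the chunk-and-embed path this commit changes. The helper name chunk_and_embed is hypothetical, the app's clean_text/remove_references cleanup and the later clustering/retrieval steps are omitted, and it assumes spaCy with its en_core_web_sm model is installed (pip install spacy; python -m spacy download en_core_web_sm) and that OPENAI_API_KEY is set for the embeddings call.

# Sketch only: mirrors the new chunking path, not the full app.py pipeline.
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import SpacyTextSplitter

def chunk_and_embed(pdf_file_path: str, chunk_size: int = 500):
    # Load the PDF and join page texts, as summarize_pdf/qa_pdf do.
    docs = PyMuPDFLoader(pdf_file_path).load()
    full_text = "\n".join(doc.page_content for doc in docs)

    # SpacyTextSplitter segments the text into sentences with spaCy and packs
    # whole sentences into chunks of roughly `chunk_size` characters.
    text_splitter = SpacyTextSplitter(chunk_size=chunk_size)
    split_contents = text_splitter.split_text(full_text)

    # Embed each chunk, as the app does before clustering / retrieval.
    embeddings_model = OpenAIEmbeddings()
    embeddings = embeddings_model.embed_documents(split_contents)
    return split_contents, embeddings

Unlike RecursiveCharacterTextSplitter, which falls back through the listed separators ("\n\n", "\n", ".", " "), SpacyTextSplitter ends chunks on sentence boundaries, which is presumably the motivation for the swap; note that qa_pdf's chunk size also drops from 800 to 500 with this change.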