import os

import pandas as pd
from langchain.chains import RetrievalQA
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.docstore.document import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI


def split_with_source(text, source):
    """Split raw text into chunks and tag each chunk with its source file."""
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=400,
        chunk_overlap=0,
        length_function=len,
        add_start_index=True,
    )
    documents = splitter.create_documents([text])
    for doc in documents:
        doc.metadata["source"] = source
    return documents


def get_document_from_raw_text_each_line():
    """Build one Document per line of every file in ./raw_data."""
    documents = []
    raw_dir = os.path.join(os.getcwd(), "raw_data")
    for file_name in os.listdir(raw_dir):
        with open(os.path.join(raw_dir, file_name), "r", encoding="utf-8") as file:
            for line in file:
                # Strip surrounding whitespace and the trailing newline.
                line = line.strip()
                documents.append(Document(page_content=line, metadata={"source": file_name}))
    return documents


def count_files_in_folder(folder_path):
    # Check that the folder path exists before listing it.
    if not os.path.isdir(folder_path):
        print("Invalid path.")
        return None
    # os.listdir() returns every file and subfolder inside the folder.
    files = os.listdir(folder_path)
    # Count the entries in the listing.
    return len(files)


def get_document_from_raw_text():
    """Load every file in ./raw_data, lightly preprocess it, and split it into chunks."""
    documents = []
    raw_dir = os.path.join(os.getcwd(), "raw_data")
    for file_name in os.listdir(raw_dir):
        with open(os.path.join(raw_dir, file_name), "r", encoding="utf-8") as file:
            # Preprocess the text: collapse blank lines before splitting.
            content = file.read().replace("\n\n", "\n")
            documents += split_with_source(content, file_name)
    return documents


def get_document_from_table():
    """Load the 'data' column of every CSV in ./table_data as one Document per row."""
    documents = []
    table_dir = os.path.join(os.getcwd(), "table_data")
    for file_name in os.listdir(table_dir):
        data = pd.read_csv(os.path.join(table_dir, file_name))
        for _, row in data.iterrows():
            documents.append(Document(page_content=row["data"], metadata={"source": file_name}))
    return documents


def load_the_embedding_retrieve(
    is_ready=False,
    k=3,
    model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
):
    """Return a dense Chroma retriever, reusing a persisted index or building one fresh."""
    embeddings = HuggingFaceEmbeddings(model_name=model)
    if is_ready:
        # Reuse an index previously persisted to ./Data.
        retriever = Chroma(
            persist_directory=os.path.join(os.getcwd(), "Data"),
            embedding_function=embeddings,
        ).as_retriever(search_kwargs={"k": k})
    else:
        documents = get_document_from_raw_text() + get_document_from_table()
        retriever = Chroma.from_documents(documents, embeddings).as_retriever(
            search_kwargs={"k": k}
        )
    return retriever


def load_the_bm25_retrieve(k=3):
    """Return a lexical BM25 retriever over the same corpus."""
    documents = get_document_from_raw_text() + get_document_from_table()
    bm25_retriever = BM25Retriever.from_documents(documents)
    bm25_retriever.k = k
    return bm25_retriever


def load_the_parent_document_retrieve(model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
    """Return a ParentDocumentRetriever: small child chunks are indexed for search,
    but the larger parent chunks they belong to are what gets returned."""
    embeddings = HuggingFaceEmbeddings(model_name=model)
    vectorstore = Chroma(collection_name="split_parents", embedding_function=embeddings)
    store = InMemoryStore()
    parent_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1200,
        chunk_overlap=0,
        length_function=len,
        add_start_index=True,
    )
    child_splitter = RecursiveCharacterTextSplitter(
        chunk_size=400,
        chunk_overlap=0,
        length_function=len,
        add_start_index=True,
    )
    retriever = ParentDocumentRetriever(
        vectorstore=vectorstore,
        docstore=store,
        child_splitter=child_splitter,
        parent_splitter=parent_splitter,
    )
    retriever.add_documents(get_document_from_raw_text())
    return retriever


def get_qachain(llm_name="gpt-3.5-turbo-0125", chain_type="stuff", retriever=None, return_source_documents=True):
    """Wire a ChatOpenAI model and a retriever into a RetrievalQA chain."""
    llm = ChatOpenAI(temperature=0, model_name=llm_name)
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type=chain_type,
        retriever=retriever,
        return_source_documents=return_source_documents,
    )


def summarize_messages(demo_ephemeral_chat_history, llm):
    """Collapse the last human/AI exchange into a single summary message."""
    stored_messages = demo_ephemeral_chat_history.messages
    # Guard against a short history *before* indexing into it.
    if len(stored_messages) < 2:
        return False
    human_chat = stored_messages[0].content
    ai_chat = stored_messages[1].content
    summarization_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", os.environ["SUMARY_MESSAGE_PROMPT"]),
            (
                "human",
                """
                History:
                Human: {human}
                AI: {AI}

                Output:
                """,
            ),
        ]
    )
    summarization_chain = summarization_prompt | llm
    summary_message = summarization_chain.invoke({"AI": ai_chat, "human": human_chat})
    demo_ephemeral_chat_history.clear()
    demo_ephemeral_chat_history.add_message(summary_message)
    return demo_ephemeral_chat_history


def get_question_from_summarize(summary, question, llm):
    """Rewrite a follow-up question into a standalone one, given the conversation summary."""
    new_qa_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", os.environ["NEW_QUESTION_PROMPT"]),
            (
                "human",
                """
                Summary: {summary}
                Question: {question}

                Output:
                """,
            ),
        ]
    )
    new_qa_chain = new_qa_prompt | llm
    return new_qa_chain.invoke({"summary": summary, "question": question}).content


def get_final_answer(question, context, prompt, llm):
    """Answer the question from the retrieved context under the given system prompt."""
    qa_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", prompt),
            (
                "human",
                """
                Context: {context}
                Question: {question}

                Output:
                """,
            ),
        ]
    )
    answer_chain = qa_prompt | llm
    answer = answer_chain.invoke({"question": question, "context": context})
    return answer.content


def process_llm_response(llm_response):
    """Print a RetrievalQA result together with the source file of each supporting document."""
    print(llm_response["result"])
    print("\n\nSources:")
    for source in llm_response["source_documents"]:
        print(source.metadata["source"])
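
# --- Usage sketch (illustrative; not part of the original module) ---
# A minimal end-to-end wiring of the pieces above, assuming OPENAI_API_KEY is
# set, ./raw_data and ./table_data exist, and the rank_bm25 package is
# installed. The query string is a hypothetical placeholder.
if __name__ == "__main__":
    # BM25 keeps the example self-contained (no embedding model download).
    retriever = load_the_bm25_retrieve(k=3)
    qa_chain = get_qachain(retriever=retriever)
    # RetrievalQA expects the input under the "query" key; with
    # return_source_documents=True the output carries "result" and
    # "source_documents", which process_llm_response() prints.
    response = qa_chain.invoke({"query": "What is this corpus about?"})
    process_llm_response(response)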