import os
import pickle

from dotenv import load_dotenv
from langchain_community.document_loaders import TextLoader
from langchain_groq import ChatGroq
from langchain_openai import OpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore
from langchain.text_splitter import (
    MarkdownHeaderTextSplitter,
    RecursiveCharacterTextSplitter,
)

load_dotenv()

# Groq-hosted Llama 3 70B model, used later to answer questions over the index.
llm = ChatGroq(
    model_name="llama3-70b-8192",
    temperature=0.1,
    api_key=os.getenv('llm_api_1'),
)


def load_and_chunk_data(data_path):
    """Load every .txt file under data_path (recursively), split by Markdown
    headers first, then into fixed-size character chunks."""
    docs = []
    # Load all .txt files from the specified folder and its subfolders.
    for root, _, files in os.walk(data_path):
        for filename in files:
            if filename.endswith('.txt'):
                file_path = os.path.join(root, filename)
                loader = TextLoader(file_path, encoding='utf-8')
                docs.extend(loader.load())

    # First pass: split on Markdown headers so each chunk carries its section
    # titles as metadata (Header_1/2/3). strip_headers=False keeps the header
    # line inside the chunk text as well.
    headers_to_split_on = [
        ("#", "Header_1"),
        ("##", "Header_2"),
        ("###", "Header_3"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on,
        strip_headers=False,
    )

    # Second pass: cap each chunk at 512 characters with no overlap.
    chunk_size = 512
    chunk_overlap = 0
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    chunked_docs = []
    for doc in docs:
        md_header_splits = markdown_splitter.split_text(doc.page_content)
        chunked_docs.extend(text_splitter.split_documents(md_header_splits))
    return chunked_docs


data_path = '/home/azureuser/data/gioithieuhocvien'
chunked_data = load_and_chunk_data(data_path)

# Save the chunked documents with pickle so later runs can skip the
# load-and-split step.
with open('gioithieuhocvien_filter.pkl', 'wb') as f:
    pickle.dump(chunked_data, f)

# OpenAI embeddings (text-embedding-3-small) are used to vectorise the chunks.
os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_KEY')
embedding = OpenAIEmbeddings(model='text-embedding-3-small')

# Embed the chunks and index them into a local Qdrant instance.
url = "http://localhost:6333"
qdrant = QdrantVectorStore.from_documents(
    chunked_data,
    embedding,
    url=url,
    collection_name="gioithieuhocvien_filter",
)
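
# --- Usage sketch (not in the original script) ---
# The source imports RetrievalQA but never wires it up; this is a minimal
# sketch of how the freshly built collection could be queried with the
# ChatGroq llm defined above. The top-k value and the example question are
# illustrative assumptions, not values taken from the source.
from langchain.chains import RetrievalQA

retriever = qdrant.as_retriever(search_kwargs={"k": 4})  # assumed top-4 chunks
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # stuff the retrieved chunks directly into the prompt
    retriever=retriever,
    return_source_documents=True,
)

result = qa_chain.invoke({"query": "Gioi thieu chung ve hoc vien?"})  # hypothetical query
print(result["result"])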
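
# --- Reuse sketch (assumption, not in the original script) ---
# On later runs the chunks can be reloaded from the pickle written above, or,
# if the collection already exists in Qdrant, the store can be attached
# without re-embedding anything. from_existing_collection is provided by
# langchain_qdrant; check that your installed version exposes it.
qdrant = QdrantVectorStore.from_existing_collection(
    embedding=embedding,
    collection_name="gioithieuhocvien_filter",
    url=url,
)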