DiamondYin committed on
Commit
f71f3ee
1 Parent(s): b185613

Update app_utils.py

Browse files
Files changed (1) hide show
  1. app_utils.py +14 -4
app_utils.py CHANGED
@@ -6,6 +6,7 @@ import boto3 # AWS Polly
6
  from pydub import AudioSegment # AudioSegment is a class in the pydub module that can be used to manipulate audio files.
7
  from pydub.playback import play # play is a function in the pydub.playback module that can be used to play audio files.
8
  import logging
 
9
 
10
  from langchain import OpenAI
11
  from langchain.chains import RetrievalQA # RetrievalQA is a class in the langchain.chains module that can be used to build a retrieval-based question answering system.
@@ -38,11 +39,20 @@ def initialize_knowledge_base():
38
  loader = DirectoryLoader('profiles', glob='**/*.txt') #文件夹加载器 profiles文件夹下的所有txt文件
39
  docs = loader.load()
40
 
41
- char_text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
42
- doc_texts = char_text_splitter.split_documents(docs)
43
 
44
- openAI_embeddings = OpenAIEmbeddings()
45
- vStore = Chroma.from_documents(doc_texts, openAI_embeddings)
 
 
 
 
 
 
 
 
 
46
 
47
  conv_model = RetrievalQA.from_chain_type(
48
  llm=OpenAI(model_name="gpt-3.5-turbo-16k"),
 
6
  from pydub import AudioSegment # AudioSegment is a class in the pydub module that can be used to manipulate audio files.
7
  from pydub.playback import play # play is a function in the pydub.playback module that can be used to play audio files.
8
  import logging
9
+ import numpy as np
10
 
11
  from langchain import OpenAI
12
  from langchain.chains import RetrievalQA # RetrievalQA is a class in the langchain.chains module that can be used to build a retrieval-based question answering system.
 
39
  loader = DirectoryLoader('profiles', glob='**/*.txt') #文件夹加载器 profiles文件夹下的所有txt文件
40
  docs = loader.load()
41
 
42
+ char_text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) #文本分割器 chunk_size=1000, chunk_overlap=0
43
+ doc_texts = char_text_splitter.split_documents(docs) #文档分割器,作用是将文档分割成小块
44
 
45
+ # Embed each chunk of text
46
+ embeddings = []
47
+ embedder = OpenAIEmbeddings()
48
+ for doc in doc_texts:
49
+ embedding = embedder.embed(doc)
50
+ embeddings.append(embedding)
51
+
52
+ vStore = np.concatenate(embeddings, axis=0)
53
+
54
+ #openAI_embeddings = OpenAIEmbeddings()
55
+ #vStore = Chroma.from_documents(doc_texts, openAI_embeddings) #Chroma是一个类,用于存储向量,from_documents是一个方法,用于从文档中创建向量存储器,openAI_embeddings是一个类,用于将文本转换为向量
56
 
57
  conv_model = RetrievalQA.from_chain_type(
58
  llm=OpenAI(model_name="gpt-3.5-turbo-16k"),