singhjagpreet committed on
Commit
10330bc
1 Parent(s): 5e20c77

loading file into chat

Browse files
Files changed (5) hide show
  1. app.py +24 -2
  2. requirements.txt +3 -1
  3. src/config.py +5 -0
  4. src/model.py +24 -0
  5. src/utils.py +31 -2
app.py CHANGED
@@ -1,13 +1,17 @@
1
  import os
 
2
 
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
4
  from langchain.embeddings.openai import OpenAIEmbeddings
5
  import chainlit as cl
 
 
 
 
 
6
 
7
 
8
 
9
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
10
- embeddings = OpenAIEmbeddings()
11
 
12
  welcome_message = """ Upload your file here"""
13
 
@@ -25,3 +29,21 @@ async def start():
25
  file = files[0]
26
  msg = cl.Message(content=f"Processing `{type(files)}` {file.name}....")
27
  await msg.send()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ import logging
3
 
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
  from langchain.embeddings.openai import OpenAIEmbeddings
6
  import chainlit as cl
7
+ from src.utils import get_docSearch
8
+ from src.model import load_chain
9
+
10
+
11
+
12
 
13
 
14
 
 
 
15
 
16
  welcome_message = """ Upload your file here"""
17
 
 
29
  file = files[0]
30
  msg = cl.Message(content=f"Processing `{type(files)}` {file.name}....")
31
  await msg.send()
32
+
33
+ docsearch = get_docSearch(file)
34
+
35
+
36
+ chain = load_chain(docsearch)
37
+
38
+ logging.info(f"Model loaded successfully")
39
+
40
+
41
+ ## let the user know when system is ready
42
+
43
+ msg.content = f"{file.name} processed. You begin asking questions"
44
+
45
+ await msg.update()
46
+
47
+ cl.user_session.set("chain", chain)
48
+
49
+
requirements.txt CHANGED
@@ -1,4 +1,6 @@
1
  langchain
2
  openai
3
  python-dotenv
4
- chainlit
 
 
 
1
  langchain
2
  openai
3
  python-dotenv
4
+ chainlit
5
+ chromadb
6
+ tiktoken
src/config.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
class Config:
    """Static settings read when the chat model and retrieval chain are built."""

    # Sampling temperature handed to the chat model.
    temperature = 0

    # Whether the chat model is created with streaming enabled.
    streaming = True

    # Value forwarded as ``chain_type`` when the retrieval chain is built.
    chain_type = "stuff"

    # Token limit forwarded when the retrieval chain is built.
    max_token_limit = 4098
src/model.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.chains import RetrievalQAWithSourcesChain
2
+ from langchain.chat_models import ChatOpenAI
3
+ import logging
4
+
5
+
6
+
7
+ from src.config import Config
8
+
9
+
10
+
11
+
12
+
13
def load_model():
    """Create the chat model from the settings held in ``Config``.

    Returns:
        A ``ChatOpenAI`` instance built with ``Config.temperature`` and
        ``Config.streaming``.
    """
    return ChatOpenAI(
        temperature=Config.temperature,
        streaming=Config.streaming,
    )
17
+
18
+
19
def load_chain(docsearch):
    """Build a question-answering-with-sources chain over ``docsearch``.

    Args:
        docsearch: Vector store whose ``as_retriever()`` supplies the
            documents the chain answers from.

    Returns:
        A ``RetrievalQAWithSourcesChain`` wired to the model from
        ``load_model()`` and the retriever derived from ``docsearch``.
    """
    model = load_model()
    # ``max_tokens_limit`` is a field of the chain rather than a retriever
    # option, and the chain only trims retrieved documents to that limit when
    # ``reduce_k_below_max_tokens`` is enabled. Passing it to
    # ``as_retriever()`` left the configured limit without any effect.
    return RetrievalQAWithSourcesChain.from_chain_type(
        model,
        chain_type=Config.chain_type,
        retriever=docsearch.as_retriever(),
        reduce_k_below_max_tokens=True,
        max_tokens_limit=Config.max_token_limit,
    )
src/utils.py CHANGED
@@ -1,8 +1,37 @@
1
  from chainlit.types import AskFileResponse
2
  from langchain.document_loaders import TextLoader
 
 
 
 
 
 
 
3
 
4
  def process_file(file: AskFileResponse):
5
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  def get_docSearch(file: AskFileResponse):
8
- pass
 
 
 
 
 
 
 
1
  from chainlit.types import AskFileResponse
2
  from langchain.document_loaders import TextLoader
3
+ from langchain.document_loaders import PyPDFDirectoryLoader
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from langchain.vectorstores import Chroma
6
+ from langchain.embeddings import OpenAIEmbeddings
7
+
8
# Module-level splitter and embedding objects used by the functions below.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
embeddings = OpenAIEmbeddings()
10
 
def process_file(file: AskFileResponse):
    """Load an uploaded file and split it into source-tagged chunks.

    Args:
        file: Upload exposing ``type`` (its MIME type) and ``content`` (the
            raw bytes to load).

    Returns:
        The chunks produced by the module-level ``text_splitter``; each
        chunk's ``metadata["source"]`` is set to ``source_<index>``.

    Raises:
        ValueError: If ``file.type`` is neither ``text/plain`` nor
            ``application/pdf``.
    """
    import tempfile

    if file.type == "text/plain":
        loader_cls = TextLoader
    elif file.type == "application/pdf":
        # PyPDFDirectoryLoader scans a directory for PDF files; a single
        # uploaded file needs the single-file loader instead.
        from langchain.document_loaders import PyPDFLoader

        loader_cls = PyPDFLoader
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(f"Unsupported file type: {file.type!r}")

    # Bind the handle to its own name so the ``tempfile`` module is not shadowed.
    with tempfile.NamedTemporaryFile() as tmp:
        tmp.write(file.content)
        # The loader reopens the file by name, so buffered bytes must reach
        # the disk before it reads.
        tmp.flush()
        documents = loader_cls(tmp.name).load()

    docs = text_splitter.split_documents(documents)
    for i, doc in enumerate(docs):
        doc.metadata["source"] = f"source_{i}"
    return docs
29
 
def get_docSearch(file: AskFileResponse):
    """Index an uploaded file in a Chroma vector store.

    Args:
        file: Upload passed straight through to ``process_file``.

    Returns:
        The ``Chroma`` store built from the processed chunks and the
        module-level ``embeddings``.
    """
    chunks = process_file(file)
    return Chroma.from_documents(chunks, embeddings)