ramortegui committed on
Commit
a5e07db
·
unverified ·
1 Parent(s): 4e93c8d

Add vector data

Browse files
Files changed (2) hide show
  1. app.py +61 -0
  2. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain import HuggingFacePipeline
2
+ from langchain.chains import RetrievalQA
3
+ from langchain.document_loaders import BSHTMLLoader, DirectoryLoader
4
+ from langchain.embeddings import HuggingFaceEmbeddings
5
+ from transformers import AutoTokenizer
6
+ from langchain.text_splitter import CharacterTextSplitter
7
+ from langchain.vectorstores import Chroma
8
+ import os
9
+ import shutil
10
+ import subprocess
11
+ from glob import glob
12
+
13
+ # "!git clone ..." is IPython/notebook syntax and a SyntaxError in app.py; skip the clone if the checkout already exists
14
+ if not os.path.isdir("./shakespeare"):
15
+     subprocess.run(["git", "clone", "https://github.com/TheMITTech/shakespeare"], check=True)
16
+
17
+ # Without recursive=True, "**" behaves like "*": this matches HTML files exactly one directory below ./shakespeare
18
+ files = glob("./shakespeare/**/*.html")
19
+
20
+ destination_folder = './data/'
21
+ os.makedirs(destination_folder, exist_ok=True)  # exist_ok: do not raise if ./data is already there
22
+ # Flatten the matched HTML files into ./data/, keeping only each file's base name
23
+ for html_file in files:
24
+     shutil.move(html_file, destination_folder + os.path.basename(html_file))
25
+ bshtml_dir_loader = DirectoryLoader('./data/', loader_cls=BSHTMLLoader)
26
+ # Load the files under ./data/ using BSHTMLLoader for each one
27
+ data = bshtml_dir_loader.load()
28
+ # Tokenizer from the same checkpoint ("bigscience/bloomz-1b7") that the LLM further down is created from
29
+ bloomz_tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-1b7")
30
+ # Split on "\n" with chunk_size=100 and no overlap; NOTE(review): length is presumably measured in tokenizer tokens - confirm
31
+ text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(bloomz_tokenizer,
32
+ chunk_size=100,
33
+ chunk_overlap=0,
34
+ separator="\n")
35
+
36
+ documents = text_splitter.split_documents(data)
37
+ # No model name is passed, so whatever default HuggingFaceEmbeddings ships with is used
38
+ embeddings = HuggingFaceEmbeddings()
39
+
40
+
41
+ # Build a Chroma vector store from the chunks, with "vector_db" as its persist directory
42
+ persist_directory = "vector_db"
43
+
44
+ vectordb = Chroma.from_documents(documents=documents, embedding=embeddings,
45
+ persist_directory=persist_directory)
46
+ # Text-generation pipeline for bigscience/bloomz-1b7, created from the model id
47
+ llm = HuggingFacePipeline.from_model_id(
48
+ model_id="bigscience/bloomz-1b7",
49
+ task="text-generation",
50
+ model_kwargs={"temperature" : 0, "max_length" : 500})
51
+ # Expose the vector store as a retriever for the QA chain below
52
+ doc_retriever = vectordb.as_retriever()
53
+
54
+ # NOTE(review): chain_type="stuff" presumably puts all retrieved chunks into a single prompt - confirm in LangChain docs
55
+ shakespeare_qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=doc_retriever)
56
+
57
+ def query(query):  # Gradio callback: takes the question text, returns the chain's answer
58
+     return shakespeare_qa.run(query)  # the result was previously discarded, so the callback returned None
59
+ import gradio as gr  # `gr` is used below but was never imported (NameError at startup)
60
+ iface = gr.Interface(fn=query, inputs="text", outputs="text")
61
+ iface.launch()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ langchain
2
+ beautifulsoup4
3
+ transformers
4
+ huggingface-hub
5
+ sentence_transformers
6
+ chromadb
7
+