nkasmanoff committed on
Commit
4748526
1 Parent(s): 7f7e9cb

Create vectorize_dataset.py

Browse files
Files changed (1) hide show
  1. vectorize_dataset.py +29 -0
vectorize_dataset.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ from helpers import clean_up_tags
3
+ from langchain.text_splitter import CharacterTextSplitter
4
+ from langchain.vectorstores import Chroma
5
+ from langchain.document_loaders import DataFrameLoader
6
+
7
+
8
def load_descriptions_data():
    """Load the huggingface-datasets metadata and build a description dataframe.

    Pulls the 'nkasmanoff/huggingface-datasets' dataset, cleans the tag
    strings, drops rows with a missing description, and combines description
    and cleaned tags into a single 'description_full' text column.

    Returns:
        A pandas DataFrame with two columns: 'id' and 'description_full'.
    """
    dataset = load_dataset('nkasmanoff/huggingface-datasets')
    df = dataset['train'].to_pandas()

    # Normalize the raw tag strings into searchable text.
    df['tags_cleaned'] = df['tags'].apply(clean_up_tags)

    # Rows without a description carry no useful signal for embedding.
    df = df.dropna(subset=['description'])

    # NOTE: fillna('') here is a no-op after the dropna above; kept for parity.
    df['description_full'] = df['description'].fillna('') + ' ' + df['tags_cleaned']

    # A lone separator means both description and tags were empty — drop it.
    df = df[df['description_full'] != ' ']

    return df[['id', 'description_full']]
18
+
19
+
20
def create_db(hf_df, embeddings):
    """Build a Chroma vector store from the descriptions dataframe.

    Args:
        hf_df: DataFrame with a 'description_full' text column (and metadata
            columns such as 'id') to index.
        embeddings: A langchain embeddings object used to vectorize the text.

    Returns:
        A Chroma vector store indexing the (chunked) description documents.
    """
    # Turn each dataframe row into a langchain Document.
    docs = DataFrameLoader(hf_df, page_content_column="description_full").load()

    # Chunk long descriptions so each piece fits the embedding model.
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    chunks = splitter.split_documents(docs)

    # Embed the chunks and index them in Chroma for similarity search.
    return Chroma.from_documents(chunks, embeddings)