nkasmanoff committed on
Commit
06758b6
1 Parent(s): 0657cdd

Create dataset_recommender.py

Browse files
Files changed (1) hide show
  1. dataset_recommender.py +36 -0
dataset_recommender.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.chains import RetrievalQA
2
+ from langchain.llms import OpenAI
3
+ from langchain.embeddings import OpenAIEmbeddings
4
+ from vectorize_dataset import load_descriptions_data, create_db
5
+ from helpers import clean_up_tags, get_dataset_metadata
6
+
7
+
8
+
9
class DatasetRecommender:
    """Recommend Hugging Face datasets from a text query or a reference dataset.

    On construction the recommender loads the dataset descriptions, builds a
    vector index over them with the given embeddings backend, and wires that
    index into a retrieval question-answering chain driven by the given LLM.
    """

    def __init__(self, llm_backbone=None, embeddings_backbone=None):
        """Load the description data and build the index and the QA chain.

        Args:
            llm_backbone: LLM passed to the retrieval QA chain. When ``None``
                (the default) a fresh ``OpenAI()`` instance is created.
            embeddings_backbone: Embeddings object handed to ``create_db``.
                When ``None`` (the default) a fresh ``OpenAIEmbeddings()``
                instance is created.
        """
        # Default backends are created here rather than in the signature:
        # default argument values are evaluated once, when the class body is
        # executed, which would construct the clients at import time and
        # share a single instance between every recommender.
        self.llm_backbone = OpenAI() if llm_backbone is None else llm_backbone
        self.embeddings_backbone = (
            OpenAIEmbeddings() if embeddings_backbone is None else embeddings_backbone
        )
        self.hf_df = load_descriptions_data()
        self.db = create_db(self.hf_df, self.embeddings_backbone)
        self.datasets_url_base = "https://huggingface.co/datasets/"
        # expose this index in a retriever interface (two nearest matches)
        self.retriever = self.db.as_retriever(
            search_type="similarity", search_kwargs={"k": 2}
        )
        # create a chain to answer questions; the source documents are kept
        # so that the matching datasets can be linked in the response
        self.qa = RetrievalQA.from_chain_type(
            llm=self.llm_backbone,
            chain_type="stuff",
            retriever=self.retriever,
            return_source_documents=True,
        )

    def _dataset_url(self, dataset_id):
        """Return the dataset page URL for *dataset_id*."""
        return f"{self.datasets_url_base}{dataset_id}"

    def recommend_based_on_text(self, query):
        """Answer a free-text request and link the datasets used as sources.

        Args:
            query: The text passed to the QA chain under the ``"query"`` key.

        Returns:
            A dict with ``'message'`` (the chain's ``'result'`` text) and
            ``'datasets'`` (one URL per returned source document, built from
            each document's ``metadata['id']``).
        """
        result = self.qa({"query": query})
        linked_datasets = [
            self._dataset_url(doc.metadata['id'])
            for doc in result['source_documents']
        ]
        return {'message': result['result'], 'datasets': linked_datasets}

    def get_similar_datasets(self, query_url):
        """Find datasets whose indexed text is similar to a given dataset.

        Args:
            query_url: Identifier passed to ``get_dataset_metadata``; it is
                also used to filter the queried dataset out of the results.

        Returns:
            ``{'datasets': [...]}`` with one URL per similar dataset, or
            ``{'error': ...}`` when the metadata carries no description.
        """
        retrieved_metadata = get_dataset_metadata(query_url)
        description = retrieved_metadata.get('description')
        if description is None:
            return {'error': 'no description found for this dataset.'}
        # Tags are optional: only append them when the metadata provides
        # them, instead of failing with a KeyError.
        if 'tags' in retrieved_metadata:
            description = description + clean_up_tags(retrieved_metadata['tags'])
        similar_documents = self.db.similarity_search(description)
        # NOTE(review): this is a substring test, so an id that happens to be
        # contained in query_url (e.g. a prefix of the queried dataset's id)
        # is dropped as well - confirm the expected format of query_url
        # before tightening this to an exact comparison.
        similar_datasets = [
            self._dataset_url(doc.metadata['id'])
            for doc in similar_documents
            if doc.metadata['id'] not in query_url
        ]
        return {'datasets': similar_datasets}