update
Browse files- rag.py +69 -7
- raw_data/faq.xlsx +0 -0
rag.py
CHANGED
|
@@ -27,6 +27,68 @@ vectorstore3, retriever3 = process_data(data3, child_text_splitter, embedding, "
|
|
| 27 |
|
| 28 |
##############################################################################
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
ANYSCALE_API_BASE = "credential-1711634141163"
|
| 31 |
ANYSCALE_API_KEY = "esecret_chitz7splr5ut6vfvqpn72itd3"
|
| 32 |
ANYSCALE_MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
|
|
@@ -159,13 +221,13 @@ rag_chain_with_source3 = RunnableParallel(
|
|
| 159 |
|
| 160 |
############################################################################################
|
| 161 |
|
| 162 |
-
from flashtext import KeywordProcessor
|
| 163 |
-
keyword_processor = KeywordProcessor()
|
| 164 |
-
# keyword_processor.add_keyword(<unclean name>, <standardised name>)
|
| 165 |
-
keyword_processor.add_keyword('thạc sĩ')
|
| 166 |
-
keyword_processor.add_keyword('học viên')
|
| 167 |
-
keyword_processor.add_keyword('nghiên cứu sinh')
|
| 168 |
-
keyword_processor.add_keyword('tiến sĩ')
|
| 169 |
|
| 170 |
################################################################################
|
| 171 |
|
|
|
|
| 27 |
|
| 28 |
##############################################################################
|
| 29 |
|
| 30 |
+
from flashtext import KeywordProcessor
|
| 31 |
+
keyword_processor = KeywordProcessor()
|
| 32 |
+
# keyword_processor.add_keyword(<unclean name>, <standardised name>)
|
| 33 |
+
keyword_processor.add_keyword('thạc sĩ')
|
| 34 |
+
keyword_processor.add_keyword('học viên')
|
| 35 |
+
keyword_processor.add_keyword('nghiên cứu sinh')
|
| 36 |
+
keyword_processor.add_keyword('tiến sĩ')
|
| 37 |
+
|
| 38 |
+
################################################################################
|
| 39 |
+
|
| 40 |
+
import pandas as pd
|
| 41 |
+
|
| 42 |
+
faq = "raw_data/faq.xlsx"
|
| 43 |
+
df = pd.read_excel(faq)
|
| 44 |
+
questions = df["question"].tolist()
|
| 45 |
+
answers = df["answer"].tolist()
|
| 46 |
+
|
| 47 |
+
faq_thsi_q = []
|
| 48 |
+
faq_thsi_a = []
|
| 49 |
+
faq_tsi_q = []
|
| 50 |
+
faq_tsi_a = []
|
| 51 |
+
|
| 52 |
+
# Split the FAQ pairs into two buckets according to the keywords detected in
# each question. The "thsi" bucket takes questions mentioning 'thạc sĩ' or
# 'học viên'; the "tsi" bucket takes those mentioning 'nghiên cứu sinh' or
# 'tiến sĩ' (presumably master's-level vs. doctoral-level topics — confirm).
# A question matching both groups lands only in the "thsi" bucket because of
# the if/elif ordering; a question matching neither is left out of both.
for question, answer in zip(questions, answers):
    keywords_found = keyword_processor.extract_keywords(question)
    if 'thạc sĩ' in keywords_found or 'học viên' in keywords_found:
        faq_thsi_q.append(question)
        faq_thsi_a.append(answer)
    elif 'nghiên cứu sinh' in keywords_found or 'tiến sĩ' in keywords_found:
        faq_tsi_q.append(question)
        faq_tsi_a.append(answer)
|
| 61 |
+
|
| 62 |
+
import uuid
|
| 63 |
+
from langchain_core.documents import Document
|
| 64 |
+
|
| 65 |
+
def add_faq(retriever, vectorstore, questions, answers):
    """Index FAQ questions and store their answers under shared document ids.

    Every question is wrapped in a ``Document`` whose ``doc_id`` metadata holds
    a freshly generated UUID and is added to ``retriever.vectorstore``. The
    answer at the same position is wrapped in a ``Document`` and written to
    ``retriever.docstore`` under that same UUID.

    Args:
        retriever: Object exposing ``vectorstore.add_documents`` and
            ``docstore.mset``.
        vectorstore: Not used by this function (documents are added through
            ``retriever.vectorstore``); kept so existing call sites keep
            working.
        questions: FAQ question texts.
        answers: FAQ answer texts, positionally aligned with ``questions``.

    Raises:
        ValueError: If ``questions`` and ``answers`` differ in length.
    """
    # Ids are shared by position, so a length mismatch would either raise an
    # IndexError midway or silently store answers no question points to.
    if len(questions) != len(answers):
        raise ValueError(
            f"questions and answers must have the same length "
            f"(got {len(questions)} and {len(answers)})"
        )

    # Nothing to index; skip the store calls entirely for an empty bucket.
    if not questions:
        return

    id_key = "doc_id"
    doc_ids = [str(uuid.uuid4()) for _ in answers]

    question_docs = [
        Document(page_content=text, metadata={id_key: doc_id})
        for doc_id, text in zip(doc_ids, questions)
    ]
    answer_docs = [Document(page_content=text) for text in answers]

    retriever.vectorstore.add_documents(question_docs)
    retriever.docstore.mset(list(zip(doc_ids, answer_docs)))
|
| 80 |
+
|
| 81 |
+
# Add FAQ to vectorstore
|
| 82 |
+
|
| 83 |
+
add_faq(retriever2, vectorstore2, faq_thsi_q, faq_thsi_a)
|
| 84 |
+
|
| 85 |
+
add_faq(retriever3, vectorstore3, faq_tsi_q, faq_tsi_a)
|
| 86 |
+
|
| 87 |
+
add_faq(retriever1, vectorstore1, questions, answers)
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
##################################################################################
|
| 91 |
+
|
| 92 |
ANYSCALE_API_BASE = "credential-1711634141163"
|
| 93 |
ANYSCALE_API_KEY = "esecret_chitz7splr5ut6vfvqpn72itd3"
|
| 94 |
ANYSCALE_MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
|
|
|
|
| 221 |
|
| 222 |
############################################################################################
|
| 223 |
|
| 224 |
+
# from flashtext import KeywordProcessor
|
| 225 |
+
# keyword_processor = KeywordProcessor()
|
| 226 |
+
# # keyword_processor.add_keyword(<unclean name>, <standardised name>)
|
| 227 |
+
# keyword_processor.add_keyword('thạc sĩ')
|
| 228 |
+
# keyword_processor.add_keyword('học viên')
|
| 229 |
+
# keyword_processor.add_keyword('nghiên cứu sinh')
|
| 230 |
+
# keyword_processor.add_keyword('tiến sĩ')
|
| 231 |
|
| 232 |
################################################################################
|
| 233 |
|
raw_data/faq.xlsx
ADDED
|
Binary file (22.2 kB). View file
|
|
|