Switching to ChromaDB, adding data
app.py  +76 -57
knowledge-plain.txt  +0 -0

app.py CHANGED
@@ -3,16 +3,16 @@ from haystack.utils import Secret
 from haystack.components.builders.prompt_builder import PromptBuilder
 from haystack.components.routers import ConditionalRouter
 from haystack import Pipeline
-from haystack.components.writers import DocumentWriter
-from haystack.components.embedders import SentenceTransformersTextEmbedder
-from haystack.components.preprocessors import DocumentSplitter
-from haystack.components.converters.txt import TextFileToDocument
-from haystack.components.preprocessors import DocumentCleaner
+# from haystack.components.writers import DocumentWriter
+from haystack.components.embedders import SentenceTransformersTextEmbedder #, SentenceTransformersDocumentEmbedder
+# from haystack.components.preprocessors import DocumentSplitter
+# from haystack.components.converters.txt import TextFileToDocument
+# from haystack.components.preprocessors import DocumentCleaner
 from haystack_integrations.document_stores.chroma import ChromaDocumentStore
 from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever
 
-from haystack.document_stores.in_memory import InMemoryDocumentStore
-from haystack.components.retrievers import InMemoryEmbeddingRetriever
+# from haystack.document_stores.in_memory import InMemoryDocumentStore
+# from haystack.components.retrievers import InMemoryEmbeddingRetriever
 
 import gradio as gr
 
@@ -23,44 +23,48 @@ embedding_model = "Alibaba-NLP/gte-multilingual-base"
 ####### Indexing #######
 ########################
 
+# Skipped: now using Chroma
+
 # In memory version for now
-document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")
+# document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")
 
-converter = TextFileToDocument()
+# converter = TextFileToDocument()
 
-cleaner = DocumentCleaner()
+# cleaner = DocumentCleaner()
 
-splitter = DocumentSplitter(split_by="word", split_length=200, split_overlap=100)
+# splitter = DocumentSplitter(split_by="word", split_length=200, split_overlap=100)
 
-embedder = SentenceTransformersDocumentEmbedder(model=embedding_model,
-                                                trust_remote_code=True)
+# embedder = SentenceTransformersDocumentEmbedder(model=embedding_model,
+#                                                 trust_remote_code=True)
 
-writer = DocumentWriter(document_store=document_store)
+# writer = DocumentWriter(document_store=document_store)
 
-indexing = Pipeline()
+# indexing = Pipeline()
 
-indexing.add_component("converter", converter)
-indexing.add_component("cleaner", cleaner)
-indexing.add_component("splitter", splitter)
-indexing.add_component("embedder", embedder)
-indexing.add_component("writer", writer)
+# indexing.add_component("converter", converter)
+# indexing.add_component("cleaner", cleaner)
+# indexing.add_component("splitter", splitter)
+# indexing.add_component("embedder", embedder)
+# indexing.add_component("writer", writer)
 
-indexing.connect("converter", "cleaner")
-indexing.connect("cleaner", "splitter")
-indexing.connect("splitter", "embedder")
-indexing.connect("embedder", "writer")
+# indexing.connect("converter", "cleaner")
+# indexing.connect("cleaner", "splitter")
+# indexing.connect("splitter", "embedder")
+# indexing.connect("embedder", "writer")
 
-indexing.run({"sources": ["knowledge-plain.txt"]})
+# indexing.run({"sources": ["knowledge-plain.txt"]})
 
 # Chroma version (no support for overlaps in documents)
 # document_store = ChromaDocumentStore(persist_path="vstore_4012")
 
+document_store = ChromaDocumentStore(
+    persist_path="vstore_4012"
+)
+
 ##################################
 ####### Answering pipeline #######
 ##################################
-
 no_answer_message = (
     "I'm not allowed to answer this question. Please ask something related to "
     "APIs access in accordance DSA’s transparency and data-sharing provisions. "
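Note: with the in-memory pipeline fully commented out, nothing in app.py populates `vstore_4012` anymore; the store is expected to ship pre-built. The commit does not show that indexing step, but a one-off script along these lines would produce it. This is a sketch reusing the commented-out components above; `split_overlap=0` is an assumption driven by the "no support for overlaps" comment.

```python
# Hypothetical one-off indexing script (not part of this commit):
# builds the persisted Chroma store that app.py opens at startup.
from haystack import Pipeline
from haystack.components.converters.txt import TextFileToDocument
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack_integrations.document_stores.chroma import ChromaDocumentStore

embedding_model = "Alibaba-NLP/gte-multilingual-base"
document_store = ChromaDocumentStore(persist_path="vstore_4012")

indexing = Pipeline()
indexing.add_component("converter", TextFileToDocument())
indexing.add_component("cleaner", DocumentCleaner())
# split_overlap=0 because Chroma (per the comment above) rejects overlapping chunks
indexing.add_component("splitter", DocumentSplitter(split_by="word", split_length=200, split_overlap=0))
indexing.add_component("embedder", SentenceTransformersDocumentEmbedder(model=embedding_model,
                                                                        trust_remote_code=True))
indexing.add_component("writer", DocumentWriter(document_store=document_store))

indexing.connect("converter", "cleaner")
indexing.connect("cleaner", "splitter")
indexing.connect("splitter", "embedder")
indexing.connect("embedder", "writer")

indexing.run({"converter": {"sources": ["knowledge-plain.txt"]}})
```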
@@ -72,11 +76,13 @@ Classify whether this user is asking for something related to social media APIs,
 the Digital Services Act (DSA), or any topic related to online platforms’ compliance
 with legal and data-sharing frameworks.
 
-Relevant topics include
-
+Relevant topics include:
+- Social media API access
+- Data transparency
+- Compliance with DSA provisions
+- Online platform regulations
 
 Here is their message:
-
 {{query}}
 
 Here are the two previous messages. ONLY refer to these if the above message refers previous ones.
@@ -86,7 +92,18 @@ Here are the two previous messages. ONLY refer to these if the above message ref
 
 {% endfor %}
 
-"""
+Instructions:
+- Respond with “YES” if the query pertains to any of the relevant topics listed above and not mixed with off-topic content.
+- Respond with “NO” if the query is off-topic and does not relate to the topics listed above.
+
+Examples:
+- Query: "How does the DSA affect API usage?"
+- Response: "YES"
+
+- Query: "How to make a pancake with APIs?"
+- Response: "NO"
+
+"""
 
 routes = [
     {
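The body of `routes` (old lines 92-103, new lines 109-120) is unchanged and therefore elided by the diff. For orientation, a ConditionalRouter configuration consistent with the YES/NO instructions above and with the `no_answer` branch read in `chat()` could look like this. This is a hypothetical reconstruction, not the committed code.

```python
# Hypothetical reconstruction of the elided routes list: forward the query
# when the classifier answers YES, otherwise emit the refusal message.
routes = [
    {
        "condition": "{{ 'YES' in replies[0] }}",
        "output": "{{ query }}",
        "output_name": "query",
        "output_type": str,
    },
    {
        "condition": "{{ 'YES' not in replies[0] }}",
        "output": no_answer_message,
        "output_name": "no_answer",
        "output_type": str,
    },
]
```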
@@ -103,48 +120,46 @@ routes = [
 }
 ]
 
-query_prompt_template = """
+query_prompt_template = """
+Conversation history:
 {{conv_history}}
 
 Here is what the user has requested:
-
 {{query}}
 
-
+Instructions:
+- Craft a concise, short informative answer to the user's request using the information provided below.
+- Synthesize the key points into a seamless response that appears as your own expert knowledge.
+- Avoid direct quotes or explicit references to the documents.
+- You are directly answering the user's query.
 
+Relevant Information:
 {% for document in documents %}
-
-
+- {{ document.content }}
 {% endfor %}
 
-Do not mention the documents in your answer, present it as your own knowledge.
 """
 
-llm = OpenAIGenerator(
-    api_key=Secret.from_env_var("OPENAI_API_KEY"),
-    model="gpt-4o-mini",
-    generation_kwargs = {"max_tokens": 8192}
-)
+def setup_generator(model_name, api_key_env_var="GROQ_API_KEY", max_tokens=8192):
+    return OpenAIGenerator(
+        api_key=Secret.from_env_var(api_key_env_var),
+        api_base_url="https://api.groq.com/openai/v1",
+        model=model_name,
+        generation_kwargs={"max_tokens": max_tokens}
+    )
+
+llm = setup_generator("llama3-8b-8192", max_tokens=30)
+llm2 = setup_generator("llama3-8b-8192")
 
 embedder = SentenceTransformersTextEmbedder(model=embedding_model, trust_remote_code=True)
-# Again: in memory for now
-retriever = InMemoryEmbeddingRetriever(document_store)
+retriever = ChromaEmbeddingRetriever(document_store)
 
+router = ConditionalRouter(routes=routes)
+prompt_builder = PromptBuilder(template=relevance_prompt_template)
 prompt_builder2 = PromptBuilder(template=query_prompt_template)
-llm2 = OpenAIGenerator(
-    api_key=Secret.from_env_var("OPENAI_API_KEY"),
-    model="gpt-4o-mini",
-    generation_kwargs = {"max_tokens": 8192}
-)
 
 answer_query = Pipeline()
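The assembly of `answer_query` (old lines 150-173) also falls outside the hunks. Based on the components defined above and the inputs passed in `chat()`, the wiring is presumably close to the following sketch; the component names and connections here are assumptions, not the committed code.

```python
# Hypothetical wiring: relevance check first, then embed, retrieve, and answer.
answer_query = Pipeline()
answer_query.add_component("prompt_builder", prompt_builder)    # relevance prompt
answer_query.add_component("llm", llm)                          # YES/NO classifier
answer_query.add_component("router", router)
answer_query.add_component("embedder", embedder)
answer_query.add_component("retriever", retriever)
answer_query.add_component("prompt_builder2", prompt_builder2)  # answer prompt
answer_query.add_component("llm2", llm2)

answer_query.connect("prompt_builder", "llm")
answer_query.connect("llm.replies", "router.replies")
answer_query.connect("router.query", "embedder.text")           # only taken on the YES branch
answer_query.connect("embedder.embedding", "retriever.query_embedding")
answer_query.connect("retriever.documents", "prompt_builder2.documents")
answer_query.connect("prompt_builder2", "llm2")
```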
@@ -174,10 +189,13 @@ def chat(message, history):
     """
     Chat function for Gradio. Uses the pipeline to produce next answer.
    """
-    conv_history = "\n\n".join([f"{message[
+    conv_history = "\n\n".join([f"{message['role']}: {message['content']}" for message in history[-2:]])
     user_history = [message for message in history if message["role"] == "user"]
-    results = answer_query.run({"user_history": user_history,
-                                "
+    results = answer_query.run({"user_history": user_history,
+                                "query": message,
+                                "conv_history": conv_history,
+                                "top_k": 3}, include_outputs_from=["retriever"])
+    print(results["retriever"]["documents"])
     if "llm2" in results:
         answer = results["llm2"]["replies"][0]
     elif "router" in results and "no_answer" in results["router"]:
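For reference, with `type="messages"` Gradio passes `history` as a list of OpenAI-style role/content dicts, so the slicing and filtering above operate on entries shaped like this (illustrative values):

```python
# Illustrative shape of the Gradio "messages" history consumed by chat():
history = [
    {"role": "user", "content": "What does the DSA say about researcher API access?"},
    {"role": "assistant", "content": "Very large platforms must provide vetted researchers ..."},
]
```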
@@ -187,4 +205,5 @@ def chat(message, history):
     return answer
 
 if __name__ == "__main__":
+    print("length of document store: ", document_store.count_documents())
     gr.ChatInterface(chat, type="messages").launch()
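Since the app now depends entirely on the persisted store (and `knowledge-plain.txt` is deleted below), a quick sanity check before launching is to open `vstore_4012` in a Python shell and confirm it holds documents. A sketch, not part of this commit:

```python
from haystack_integrations.document_stores.chroma import ChromaDocumentStore

store = ChromaDocumentStore(persist_path="vstore_4012")
print(store.count_documents())        # should be > 0 if indexing succeeded
print(store.filter_documents()[:1])   # peek at one stored chunk
```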
knowledge-plain.txt DELETED
The diff for this file is too large to render; see the raw diff.