EmileH committed on
Commit
5168e0b
·
1 Parent(s): 921a12d

Switching to ChromaDB, adding data

Browse files
Files changed (2) hide show
  1. app.py +76 -57
  2. knowledge-plain.txt +0 -0
app.py CHANGED
@@ -3,16 +3,16 @@ from haystack.utils import Secret
3
  from haystack.components.builders.prompt_builder import PromptBuilder
4
  from haystack.components.routers import ConditionalRouter
5
  from haystack import Pipeline
6
- from haystack.components.writers import DocumentWriter
7
- from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder
8
- from haystack.components.preprocessors import DocumentSplitter
9
- from haystack.components.converters.txt import TextFileToDocument
10
- from haystack.components.preprocessors import DocumentCleaner
11
  from haystack_integrations.document_stores.chroma import ChromaDocumentStore
12
  from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever
13
 
14
- from haystack.document_stores.in_memory import InMemoryDocumentStore
15
- from haystack.components.retrievers import InMemoryEmbeddingRetriever
16
 
17
  import gradio as gr
18
 
@@ -23,44 +23,48 @@ embedding_model = "Alibaba-NLP/gte-multilingual-base"
23
  ####### Indexing #######
24
  ########################
25
 
 
 
26
  # In memory version for now
27
- document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")
28
 
29
- converter = TextFileToDocument()
30
 
31
- cleaner = DocumentCleaner()
32
 
33
- splitter = DocumentSplitter(split_by="word", split_length=200, split_overlap=100)
34
 
35
- embedder = SentenceTransformersDocumentEmbedder(model=embedding_model,
36
- trust_remote_code=True)
37
 
38
- writer = DocumentWriter(document_store=document_store)
39
 
40
- indexing = Pipeline()
41
 
42
- indexing.add_component("converter", converter)
43
- indexing.add_component("cleaner", cleaner)
44
- indexing.add_component("splitter", splitter)
45
- indexing.add_component("embedder", embedder)
46
- indexing.add_component("writer", writer)
47
 
48
- indexing.connect("converter", "cleaner")
49
- indexing.connect("cleaner", "splitter")
50
- indexing.connect("splitter", "embedder")
51
- indexing.connect("embedder", "writer")
52
 
53
- indexing.run({"sources": ["knowledge-plain.txt"]})
54
 
55
 
56
  # Chroma version (no support for overlaps in documents)
57
  # document_store = ChromaDocumentStore(persist_path="vstore_4012")
58
 
 
 
 
59
 
60
  ##################################
61
  ####### Answering pipeline #######
62
  ##################################
63
-
64
  no_answer_message = (
65
  "I'm not allowed to answer this question. Please ask something related to "
66
  "APIs access in accordance DSA’s transparency and data-sharing provisions. "
@@ -72,11 +76,13 @@ Classify whether this user is asking for something related to social media APIs,
72
  the Digital Services Act (DSA), or any topic related to online platforms’ compliance
73
  with legal and data-sharing frameworks.
74
 
75
- Relevant topics include social media API access, data transparency, compliance
76
- with DSA provisions, and online platform regulations.
 
 
 
77
 
78
  Here is their message:
79
-
80
  {{query}}
81
 
82
  Here are the two previous messages. ONLY refer to these if the above message refers previous ones.
@@ -86,7 +92,18 @@ Here are the two previous messages. ONLY refer to these if the above message ref
86
 
87
  {% endfor %}
88
 
89
- If the request is related to these topics, respond “YES”. If it is off-topic (e.g., unrelated to APIs, the DSA, or legal frameworks), respond “NO”."""
 
 
 
 
 
 
 
 
 
 
 
90
 
91
  routes = [
92
  {
@@ -103,48 +120,46 @@ routes = [
103
  }
104
  ]
105
 
106
- query_prompt_template = """Conversation history:
 
107
  {{conv_history}}
108
 
109
  Here is what the user has requested:
110
-
111
  {{query}}
112
 
113
- Reply to the question with a short paragraph according to the following documents:
 
 
 
 
114
 
 
115
  {% for document in documents %}
116
- * {{document.content}}
117
-
118
  {% endfor %}
119
 
120
- Do not mention the documents in your answer, present it as your own knowledge.
121
  """
122
 
123
- prompt_builder = PromptBuilder(template=relevance_prompt_template)
 
 
 
 
 
 
124
 
125
- llm = OpenAIGenerator(
126
- api_key=Secret.from_env_var("OPENAI_API_KEY"),
127
- model="gpt-4o-mini",
128
- generation_kwargs = {"max_tokens": 8192}
129
- )
130
 
131
- router = ConditionalRouter(routes=routes)
132
-
133
- embedder = SentenceTransformersTextEmbedder(model=embedding_model, trust_remote_code=True)
134
 
135
- # Again: in memory for now
136
- retriever = InMemoryEmbeddingRetriever(document_store)
137
 
138
- # Chroma
139
- # retriever = ChromaEmbeddingRetriever(document_store=document_store)
140
 
 
 
141
  prompt_builder2 = PromptBuilder(template=query_prompt_template)
142
 
143
- llm2 = OpenAIGenerator(
144
- api_key=Secret.from_env_var("OPENAI_API_KEY"),
145
- model="gpt-4o-mini",
146
- generation_kwargs = {"max_tokens": 8192}
147
- )
148
 
149
  answer_query = Pipeline()
150
 
@@ -174,10 +189,13 @@ def chat(message, history):
174
  """
175
  Chat function for Gradio. Uses the pipeline to produce next answer.
176
  """
177
- conv_history = "\n\n".join([f"{message['role']}: {message['content']}" for message in history[-2:]])
178
  user_history = [message for message in history if message["role"] == "user"]
179
- results = answer_query.run({"user_history": user_history, "query": message,
180
- "conv_history": conv_history})
 
 
 
181
  if "llm2" in results:
182
  answer = results["llm2"]["replies"][0]
183
  elif "router" in results and "no_answer" in results["router"]:
@@ -187,4 +205,5 @@ def chat(message, history):
187
  return answer
188
 
189
  if __name__ == "__main__":
 
190
  gr.ChatInterface(chat, type="messages").launch()
 
3
  from haystack.components.builders.prompt_builder import PromptBuilder
4
  from haystack.components.routers import ConditionalRouter
5
  from haystack import Pipeline
6
+ # from haystack.components.writers import DocumentWriter
7
+ from haystack.components.embedders import SentenceTransformersTextEmbedder #, SentenceTransformersDocumentEmbedder
8
+ # from haystack.components.preprocessors import DocumentSplitter
9
+ # from haystack.components.converters.txt import TextFileToDocument
10
+ # from haystack.components.preprocessors import DocumentCleaner
11
  from haystack_integrations.document_stores.chroma import ChromaDocumentStore
12
  from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever
13
 
14
+ # from haystack.document_stores.in_memory import InMemoryDocumentStore
15
+ # from haystack.components.retrievers import InMemoryEmbeddingRetriever
16
 
17
  import gradio as gr
18
 
 
23
  ####### Indexing #######
24
  ########################
25
 
26
+ # Skipped: now using Chroma
27
+
28
  # In memory version for now
29
+ # document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")
30
 
31
+ # converter = TextFileToDocument()
32
 
33
+ # cleaner = DocumentCleaner()
34
 
35
+ # splitter = DocumentSplitter(split_by="word", split_length=200, split_overlap=100)
36
 
37
+ # embedder = SentenceTransformersDocumentEmbedder(model=embedding_model,
38
+ # trust_remote_code=True)
39
 
40
+ # writer = DocumentWriter(document_store=document_store)
41
 
42
+ # indexing = Pipeline()
43
 
44
+ # indexing.add_component("converter", converter)
45
+ # indexing.add_component("cleaner", cleaner)
46
+ # indexing.add_component("splitter", splitter)
47
+ # indexing.add_component("embedder", embedder)
48
+ # indexing.add_component("writer", writer)
49
 
50
+ # indexing.connect("converter", "cleaner")
51
+ # indexing.connect("cleaner", "splitter")
52
+ # indexing.connect("splitter", "embedder")
53
+ # indexing.connect("embedder", "writer")
54
 
55
+ # indexing.run({"sources": ["knowledge-plain.txt"]})
56
 
57
 
58
  # Chroma version (no support for overlaps in documents)
59
  # document_store = ChromaDocumentStore(persist_path="vstore_4012")
60
 
61
+ document_store = ChromaDocumentStore(
62
+ persist_path="vstore_4012"
63
+ )
64
 
65
  ##################################
66
  ####### Answering pipeline #######
67
  ##################################
 
68
  no_answer_message = (
69
  "I'm not allowed to answer this question. Please ask something related to "
70
  "APIs access in accordance DSA’s transparency and data-sharing provisions. "
 
76
  the Digital Services Act (DSA), or any topic related to online platforms’ compliance
77
  with legal and data-sharing frameworks.
78
 
79
+ Relevant topics include:
80
+ - Social media API access
81
+ - Data transparency
82
+ - Compliance with DSA provisions
83
+ - Online platform regulations
84
 
85
  Here is their message:
 
86
  {{query}}
87
 
88
  Here are the two previous messages. ONLY refer to these if the above message refers previous ones.
 
92
 
93
  {% endfor %}
94
 
95
+ Instructions:
96
+ - Respond with “YES” if the query pertains to any of the relevant topics listed above and not mixed with off-topic content.
97
+ - Respond with “NO” if the query is off-topic and does not relate to the topics listed above.
98
+
99
+ Examples:
100
+ - Query: "How does the DSA affect API usage?"
101
+ - Response: "YES"
102
+
103
+ - Query: "How to make a pancake with APIs?"
104
+ - Response: "NO"
105
+
106
+ """
107
 
108
  routes = [
109
  {
 
120
  }
121
  ]
122
 
123
+ query_prompt_template = """
124
+ Conversation history:
125
  {{conv_history}}
126
 
127
  Here is what the user has requested:
 
128
  {{query}}
129
 
130
+ Instructions:
131
+ - Craft a concise, short informative answer to the user's request using the information provided below.
132
+ - Synthesize the key points into a seamless response that appears as your own expert knowledge.
133
+ - Avoid direct quotes or explicit references to the documents.
134
+ - You are directly answering the user's query.
135
 
136
+ Relevant Information:
137
  {% for document in documents %}
138
+ - {{ document.content }}
 
139
  {% endfor %}
140
 
 
141
  """
142
 
143
+ def setup_generator(model_name, api_key_env_var="GROQ_API_KEY", max_tokens=8192):
144
+ return OpenAIGenerator(
145
+ api_key=Secret.from_env_var(api_key_env_var),
146
+ api_base_url="https://api.groq.com/openai/v1",
147
+ model=model_name,
148
+ generation_kwargs={"max_tokens": max_tokens}
149
+ )
150
 
 
 
 
 
 
151
 
152
+ llm = setup_generator("llama3-8b-8192", max_tokens=30)
153
+ llm2 = setup_generator("llama3-8b-8192")
 
154
 
 
 
155
 
156
+ embedder = SentenceTransformersTextEmbedder(model=embedding_model, trust_remote_code=True)
157
+ retriever = ChromaEmbeddingRetriever(document_store)
158
 
159
+ router = ConditionalRouter(routes=routes)
160
+ prompt_builder = PromptBuilder(template=relevance_prompt_template)
161
  prompt_builder2 = PromptBuilder(template=query_prompt_template)
162
 
 
 
 
 
 
163
 
164
  answer_query = Pipeline()
165
 
 
189
  """
190
  Chat function for Gradio. Uses the pipeline to produce next answer.
191
  """
192
+ conv_history = "\n\n".join([f"{message["role"]}: {message["content"]}" for message in history[-2:]])
193
  user_history = [message for message in history if message["role"] == "user"]
194
+ results = answer_query.run({"user_history": user_history,
195
+ "query": message,
196
+ "conv_history": conv_history,
197
+ "top_k":3}, include_outputs_from=["retriever"])
198
+ print(results["retriever"]["documents"])
199
  if "llm2" in results:
200
  answer = results["llm2"]["replies"][0]
201
  elif "router" in results and "no_answer" in results["router"]:
 
205
  return answer
206
 
207
  if __name__ == "__main__":
208
+ print("length of document store: ", document_store.count_documents())
209
  gr.ChatInterface(chat, type="messages").launch()
knowledge-plain.txt DELETED
The diff for this file is too large to render. See raw diff