dinhquangson committed • Commit 2b22ee8
Parent(s): a273ba0
Update app.py

app.py CHANGED
@@ -185,7 +185,6 @@ def search(prompt: str):
     from haystack.components.generators import OpenAIGenerator
     from haystack.utils import Secret
     from haystack.components.builders import PromptBuilder
-    from QueryMetadataExtractor import QueryMetadataExtractor

     start_time = time.time()

@@ -211,14 +210,12 @@ def search(prompt: str):
         model="meta-llama-3-8b-instruct",
         generation_kwargs = {"max_tokens": 512}
     )
-    metadata_extractor = QueryMetadataExtractor()

     querying = Pipeline()
     querying.add_component("sparse_text_embedder", FastembedSparseTextEmbedder(model="Qdrant/bm42-all-minilm-l6-v2-attentions"))
     querying.add_component("dense_text_embedder", FastembedTextEmbedder(
         model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", prefix="Đại diện cho câu này để tìm kiếm các đoạn văn có liên quan: ")
     )
-    querying.add_component(instance=metadata_extractor, name="metadata_extractor")
     querying.add_component("retriever", QdrantHybridRetriever(document_store=document_store))
     querying.add_component("document_joiner", DocumentJoiner())
     querying.add_component("ranker", TransformersSimilarityRanker(model="BAAI/bge-m3"))
@@ -227,18 +224,15 @@ def search(prompt: str):

     querying.connect("sparse_text_embedder.sparse_embedding", "retriever.query_sparse_embedding")
     querying.connect("dense_text_embedder.embedding", "retriever.query_embedding")
-    querying.connect("metadata_extractor.filters", "retriever.filters")
     querying.connect("retriever", "document_joiner")
     querying.connect("document_joiner", "ranker")
     querying.connect("ranker.documents", "prompt_builder.documents")
     querying.connect("prompt_builder", "llm")
     querying.debug=True
-    metadata_fields = {"publish_date", "publisher", "document_type"}
     results = querying.run(
     {
         "dense_text_embedder": {"text": prompt},
         "sparse_text_embedder": {"text": prompt},
-        "metadata_extractor": {"query": prompt, "metadata_fields": metadata_fields},
         "ranker": {"query": prompt},
         "prompt_builder": {"question": prompt}
     }
@@ -282,6 +276,15 @@ def truncate_text(text: str) -> str:
     else:
         return text[:3000]

+@app.post("/query2metadata/")
+async def extract_metadata_from_query(query: str):
+    from QueryMetadataExtractor import QueryMetadataExtractor
+
+    extractor = QueryMetadataExtractor()
+    metadata_fields = {"publisher", "publish_date", "document_type"}
+
+    result = extractor.run(query, metadata_fields)
+
 @app.post("/pdf2text/")
 async def convert_upload_file(file: UploadFile = File(...)):
     import pytesseract