Spaces:
Sleeping
Sleeping
File size: 2,664 Bytes
f92376d cb48f96 668d00f cb48f96 f92376d cb48f96 f92376d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
import json
from typing import Dict, List

from haystack import Pipeline, component
from haystack.components.builders import PromptBuilder
from haystack.components.generators import OpenAIGenerator
from haystack.utils import Secret
@component()
class QueryMetadataExtractor:
    """Haystack component that extracts metadata filters from a user query.

    The component wraps a small internal pipeline (``PromptBuilder`` ->
    ``OpenAIGenerator``). The generator is pointed at the OctoAI
    OpenAI-compatible endpoint and reads its API key from the
    ``OCTOAI_TOKEN`` environment variable. The LLM is asked to return a JSON
    object mapping each requested metadata field to the value found in the
    query; that object is converted into an ``AND`` filter over
    ``meta.<field>`` equality conditions.
    """

    def __init__(self):
        """Create the prompt template, the generator and the internal pipeline."""
        # Few-shot prompt; `query` and `metadata_fields` are filled in by the
        # PromptBuilder at run time.
        prompt = """
You are part of an information system that processes users queries.
Given a user query you extract information from it that matches a given list of metadata fields.
The information to be extracted from the query must match the semantics associated with the given metadata fields.
The information that you extracted from the query will then be used as filters to narrow down the search space
when querying an index.
Just include the value of the extracted metadata without including the name of the metadata field.
The extracted information in 'Extracted metadata' must be returned as a valid JSON structure.
###
Example 1:
Query: "What was the revenue of Nvidia in 2022?"
Metadata fields: {"company", "year"}
Extracted metadata fields: {"company": "nvidia", "year": 2022}
###
Example 2:
Query: "What were the most influential publications in 2023 regarding Alzheimer's disease?"
Metadata fields: {"disease", "year"}
Extracted metadata fields: {"disease": "Alzheimer", "year": 2023}
###
Example 3:
Query: "{{query}}"
Metadata fields: "{{metadata_fields}}"
Extracted metadata fields:
"""
        generator = OpenAIGenerator(
            api_key=Secret.from_env_var("OCTOAI_TOKEN"),
            api_base_url="https://text.octoai.run/v1",
            model="meta-llama-3-8b-instruct",
            generation_kwargs={"max_tokens": 512},
        )

        self.pipeline = Pipeline()
        self.pipeline.add_component(name="builder", instance=PromptBuilder(prompt))
        self.pipeline.add_component(name="llm", instance=generator)
        self.pipeline.connect("builder", "llm")

    @staticmethod
    def _parse_reply(reply: str) -> dict:
        """Parse the LLM reply into a JSON object.

        The reply is first parsed as-is. If that fails, the text between the
        first ``{`` and the last ``}`` is parsed instead, so that a reply
        which surrounds the JSON with extra text (an explanation, a Markdown
        code fence, ...) is still accepted.

        Raises:
            json.JSONDecodeError: if no parseable JSON can be found.
            ValueError: if the parsed JSON is not an object.
        """
        try:
            parsed = json.loads(reply)
        except json.JSONDecodeError:
            start = reply.find("{")
            end = reply.rfind("}")
            if start == -1 or end <= start:
                # Nothing that looks like a JSON object: surface the original error.
                raise
            parsed = json.loads(reply[start:end + 1])

        if not isinstance(parsed, dict):
            raise ValueError(
                f"Expected a JSON object of metadata fields, got {type(parsed).__name__}"
            )
        return parsed

    # NOTE(review): the declared socket type is Dict[str, str] although the
    # value returned below is a nested filter dict; it is left unchanged so
    # existing pipeline connections keep validating the same way.
    @component.output_types(filters=Dict[str, str])
    def run(self, query: str, metadata_fields: List[str]):
        """Extract metadata from ``query`` and return it as a filter expression.

        Args:
            query: The user query to analyse.
            metadata_fields: Names of the metadata fields to look for.

        Returns:
            ``{"filters": {"operator": "AND", "conditions": [...]}}`` where
            each condition is
            ``{"field": "meta.<key>", "operator": "==", "value": <value>}``
            for every key/value pair in the JSON object returned by the LLM.

        Raises:
            json.JSONDecodeError: if the LLM reply contains no parseable JSON.
            ValueError: if the LLM reply is valid JSON but not an object.
        """
        result = self.pipeline.run(
            {'builder': {'query': query, 'metadata_fields': metadata_fields}}
        )
        metadata = self._parse_reply(result['llm']['replies'][0])

        # this can be done with specific data structures and in a more sophisticated way
        conditions = [
            {"field": f"meta.{key}", "operator": "==", "value": value}
            for key, value in metadata.items()
        ]
        return {"filters": {"operator": "AND", "conditions": conditions}}
|