import json
from typing import Any, Dict, List

from haystack import Pipeline, component
from haystack.components.builders import PromptBuilder
from haystack.components.generators import OpenAIGenerator
from haystack.utils import Secret


@component()
class QueryMetadataExtractor:
    """Extract metadata values from a user query and turn them into filters.

    A few-shot prompt asks an LLM to return, as JSON, the values in the query
    that correspond to a caller-supplied list of metadata fields. Each
    extracted value becomes an equality condition on ``meta.<field>``, and all
    conditions are combined with ``AND``.
    """

    def __init__(self):
        """Build the internal prompt-builder -> LLM pipeline.

        The generator reads its API key from the ``OCTOAI_TOKEN`` environment
        variable and targets the OpenAI-compatible endpoint configured below.
        """
        prompt = """
        You are part of an information system that processes users queries.
        Given a user query you extract information from it that matches a given list of metadata fields.
        The information to be extracted from the query must match the semantics associated with the given metadata fields.
        The information that you extracted from the query will then be used as filters to narrow down the search space
        when querying an index.
        Just include the value of the extracted metadata without including the name of the metadata field.
        The extracted information in 'Extracted metadata' must be returned as a valid JSON structure.
        ###
        Example 1:
        Query: "What was the revenue of Nvidia in 2022?"
        Metadata fields: {"company", "year"}
        Extracted metadata fields: {"company": "nvidia", "year": 2022}
        ###
        Example 2:
        Query: "What were the most influential publications in 2023 regarding Alzheimer's disease?"
        Metadata fields: {"disease", "year"}
        Extracted metadata fields: {"disease": "Alzheimer", "year": 2023}
        ###
        Example 3:
        Query: "{{query}}"
        Metadata fields: "{{metadata_fields}}"
        Extracted metadata fields:
        """
        generator = OpenAIGenerator(
            api_key=Secret.from_env_var("OCTOAI_TOKEN"),
            api_base_url="https://text.octoai.run/v1",
            model="meta-llama-3-8b-instruct",
            generation_kwargs={"max_tokens": 512},
        )
        self.pipeline = Pipeline()
        self.pipeline.add_component(name="builder", instance=PromptBuilder(prompt))
        self.pipeline.add_component(name="llm", instance=generator)
        self.pipeline.connect("builder", "llm")

    @staticmethod
    def _parse_metadata(reply: str) -> Dict[str, Any]:
        """Parse the LLM reply into a dict, tolerating text around the JSON.

        Raises:
            ValueError: if no JSON object can be decoded from ``reply``
                (``json.JSONDecodeError`` is a ``ValueError`` subclass) or the
                decoded value is not a JSON object.
        """
        try:
            parsed = json.loads(reply)
        except json.JSONDecodeError:
            # Models often wrap the JSON in prose or code fences; retry on the
            # outermost {...} span before giving up.
            start, end = reply.find("{"), reply.rfind("}")
            if start == -1 or end <= start:
                raise
            parsed = json.loads(reply[start:end + 1])
        if not isinstance(parsed, dict):
            raise ValueError(
                f"Expected a JSON object with extracted metadata, got: {reply!r}"
            )
        return parsed

    # NOTE(review): the declared output type does not describe the nested
    # structure actually returned (a dict holding a list of conditions). It is
    # left unchanged because pipeline connections are type-checked against it.
    @component.output_types(filters=Dict[str, str])
    def run(self, query: str, metadata_fields: List[str]):
        """Extract metadata from ``query`` and return it as a filter expression.

        Args:
            query: The user query to analyse.
            metadata_fields: Names of the metadata fields to look for.

        Returns:
            ``{"filters": {"operator": "AND", "conditions": [...]}}`` where each
            condition is ``{"field": "meta.<name>", "operator": "==",
            "value": <extracted value>}``.

        Raises:
            ValueError: if the LLM returns no reply or the reply does not
                contain a JSON object.
        """
        result = self.pipeline.run(
            {"builder": {"query": query, "metadata_fields": metadata_fields}}
        )
        replies = result["llm"]["replies"]
        if not replies:
            raise ValueError("The LLM returned no reply to extract metadata from.")
        metadata = self._parse_metadata(replies[0])

        # This can be done with specific data structures and in a more
        # sophisticated way. Null values are skipped: they mean the query did
        # not mention that field, so they must not narrow the search.
        conditions = [
            {"field": f"meta.{key}", "operator": "==", "value": value}
            for key, value in metadata.items()
            if value is not None
        ]
        return {"filters": {"operator": "AND", "conditions": conditions}}