# Spaces: Sleeping  (web-page header residue captured with the code; not part of the program)
import json | |
from typing import Dict, List | |
from haystack import Pipeline, component | |
from haystack.components.builders import PromptBuilder | |
from haystack.components.generators import OpenAIGenerator | |
from haystack.utils import Secret | |
class QueryMetadataExtractor:
    """Extract metadata values from a user query with an LLM and turn them into filters.

    A two-step Haystack pipeline is built once in ``__init__``: a
    ``PromptBuilder`` renders a few-shot prompt containing the query and the
    requested metadata fields, and an ``OpenAIGenerator`` produces the reply.
    ``run`` parses that reply as JSON and converts each extracted value into an
    equality condition on ``meta.<field>``.
    """

    # NOTE(review): the file imports `component` from haystack, but no decorator
    # is applied to this class in the visible source -- confirm whether a
    # `@component` decorator is expected here.

    def __init__(self):
        """Build the prompt -> LLM pipeline used by :meth:`run`."""
        prompt = """
        You are part of an information system that processes users queries.
        Given a user query you extract information from it that matches a given list of metadata fields.
        The information to be extracted from the query must match the semantics associated with the given metadata fields.
        The information that you extracted from the query will then be used as filters to narrow down the search space
        when querying an index.
        Just include the value of the extracted metadata without including the name of the metadata field.
        The extracted information in 'Extracted metadata' must be returned as a valid JSON structure.
        ###
        Example 1:
        Query: "What was the revenue of Nvidia in 2022?"
        Metadata fields: {"company", "year"}
        Extracted metadata fields: {"company": "nvidia", "year": 2022}
        ###
        Example 2:
        Query: "What were the most influential publications in 2023 regarding Alzheimer's disease?"
        Metadata fields: {"disease", "year"}
        Extracted metadata fields: {"disease": "Alzheimer", "year": 2023}
        ###
        Example 3:
        Query: "{{query}}"
        Metadata fields: "{{metadata_fields}}"
        Extracted metadata fields:
        """
        # The API key is read from the OCTOAI_TOKEN environment variable.
        generator = OpenAIGenerator(
            api_key=Secret.from_env_var("OCTOAI_TOKEN"),
            api_base_url="https://text.octoai.run/v1",
            model="meta-llama-3-8b-instruct",
            generation_kwargs={"max_tokens": 512},
        )
        self.pipeline = Pipeline()
        self.pipeline.add_component(name="builder", instance=PromptBuilder(prompt))
        self.pipeline.add_component(name="llm", instance=generator)
        self.pipeline.connect("builder", "llm")

    @staticmethod
    def _parse_metadata(reply: str) -> dict:
        """Return the JSON object contained in an LLM reply.

        The reply is first parsed strictly. If that fails (for example because
        the model wrapped the JSON in a Markdown fence or added text around
        it), the first decodable JSON object inside the reply is used instead.

        Raises:
            json.JSONDecodeError: if no JSON object can be decoded at all.
            ValueError: if the reply is valid JSON but not a JSON object.
        """
        try:
            parsed = json.loads(reply)
        except json.JSONDecodeError:
            decoder = json.JSONDecoder()
            start = reply.find("{")
            while start != -1:
                try:
                    candidate, _ = decoder.raw_decode(reply, start)
                except json.JSONDecodeError:
                    start = reply.find("{", start + 1)
                else:
                    # Decoding started at "{", so the result is a JSON object.
                    return candidate
            # Nothing usable found: surface the original strict-parse error.
            raise
        if not isinstance(parsed, dict):
            raise ValueError(
                f"Expected a JSON object with extracted metadata, got {type(parsed).__name__}"
            )
        return parsed

    def run(self, query: str, metadata_fields: List[str]):
        """Extract metadata from ``query`` and return it as a filter structure.

        Args:
            query: The user query to analyse.
            metadata_fields: Names of the metadata fields to look for.

        Returns:
            ``{"filters": {"operator": "AND", "conditions": [...]}}`` where each
            condition is ``{"field": "meta.<key>", "operator": "==", "value": <value>}``.
            Fields the model reported as ``null`` are omitted.

        Raises:
            ValueError: if the LLM returned no reply, or its reply does not
                contain a JSON object (``json.JSONDecodeError`` is a subclass).
        """
        result = self.pipeline.run({'builder': {'query': query, 'metadata_fields': metadata_fields}})
        replies = result['llm']['replies']
        if not replies:
            raise ValueError("The LLM returned no reply to extract metadata from")
        metadata = self._parse_metadata(replies[0])
        # this can be done with specific data structures and in a more sophisticated way
        conditions = [
            {"field": f"meta.{key}", "operator": "==", "value": value}
            for key, value in metadata.items()
            # A null value means nothing was extracted for that field; an
            # equality filter against None would wrongly narrow the search.
            if value is not None
        ]
        return {"filters": {"operator": "AND", "conditions": conditions}}