#####################################################
### DOCUMENT PROCESSOR [Metadata Adders]
#####################################################
### Jonathan Wang

# ABOUT:
# This module is part of an app for chatting with PDFs.
# It contains the Metadata Adders:
# classes that add metadata fields to documents,
# often used for summaries or keywords.
#####################################################
### TODO Board:
# Seems like this overlaps well with the `metadata extractors` interface from llama_index.
# Those are TransformComponents which take a Sequence of Nodes as input and return a list of dicts as output
# (with the dicts storing the metadata for each node).
# We should add a wrapper which adds this metadata to nodes.
# We should also add a wrapper
# https://github.com/run-llama/llama_index/blob/be3bd619ec114d26cf328d12117c033762695b3f/llama-index-core/llama_index/core/extractors/interface.py#L21
# https://github.com/run-llama/llama_index/blob/be3bd619ec114d26cf328d12117c033762695b3f/llama-index-core/llama_index/core/extractors/metadata_extractors.py#L332

#####################################################
### PROGRAM SETTINGS

#####################################################
### PROGRAM IMPORTS
from __future__ import annotations

import logging
import re
from abc import abstractmethod
from typing import Any, List, Optional, Sequence, TypeVar

from llama_index.core.bridge.pydantic import Field, PrivateAttr
from llama_index.core.schema import BaseNode, TransformComponent

# Own modules

#####################################################
### CONSTANTS
# ah how beautiful the regex
# handy visualizer and checker: https://www.debuggex.com/, https://www.regexpr.com/
logger = logging.getLogger(__name__)

GenericNode = TypeVar("GenericNode", bound=BaseNode)

# Placeholder date pattern: matches simple numeric dates such as 01/31/2024 or 2024-01-31.
# Swap in a stricter pattern if you need broader date coverage.
DATE_REGEX = re.compile(
    r"\b(?:\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\d{4}[/-]\d{1,2}[/-]\d{1,2})\b"
)


#####################################################
### CLASSES
class MetadataAdder(TransformComponent):
    """Base class for transformations which add a metadata field to nodes.

    Args:
        metadata_name: The name of the metadata field to add to the document.
    """

    metadata_name: str = Field(
        default="metadata",
        description="The name of the metadata field to add to the document.",
    )

    def __init__(
        self,
        metadata_name: str = "metadata",
        # num_workers: int = DEFAULT_NUM_WORKERS,
        **kwargs: Any,
    ) -> None:
        """Init params."""
        super().__init__(**kwargs)
        self.metadata_name = metadata_name
        # self.num_workers = num_workers

    @classmethod
    def class_name(cls) -> str:
        return "MetadataAdder"

    @abstractmethod
    def get_node_metadata(self, node: BaseNode) -> str | None:
        """Given a node, get the metadata for the node."""

    def add_node_metadata(self, node: GenericNode, metadata_value: Any | None) -> GenericNode:
        """Given a node and the metadata, add the metadata to the node's `metadata_name` field."""
        if metadata_value is None:
            return node
        node.metadata[self.metadata_name] = metadata_value
        return node

    def process_nodes(self, nodes: list[GenericNode]) -> list[GenericNode]:
        """Process the list of nodes. This gets called by __call__.

        Args:
            nodes (List[GenericNode]): The nodes to process.

        Returns:
            List[GenericNode]: The processed nodes, with metadata field `metadata_name` added.
        """
        output_nodes = []
        for node in nodes:
            node_metadata = self.get_node_metadata(node)
            node_with_metadata = self.add_node_metadata(node, node_metadata)
            output_nodes.append(node_with_metadata)
        return output_nodes

    def __call__(self, nodes: Sequence[BaseNode], **kwargs: Any) -> list[BaseNode]:
        """Add the `metadata_name` metadata field to each node."""
        return self.process_nodes(list(nodes))
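
# --- Illustrative example (hypothetical; not used by the pipeline) ---
# A minimal sketch of the MetadataAdder contract: a subclass only needs to
# implement `get_node_metadata`. The class below simply records a word count.
class WordCountMetadataAdder(MetadataAdder):
    """Example subclass: adds the node's word count as a metadata field."""

    def __init__(self, metadata_name: str = "word_count", **kwargs: Any) -> None:
        super().__init__(metadata_name=metadata_name, **kwargs)

    @classmethod
    def class_name(cls) -> str:
        return "WordCountMetadataAdder"

    def get_node_metadata(self, node: BaseNode) -> str | None:
        # Nodes without text (e.g. images) get no metadata.
        text = getattr(node, "text", None)
        if text is None:
            return None
        return str(len(text.split()))
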
""" _regex_pattern: re.Pattern = PrivateAttr() _boolean_mode: bool = PrivateAttr() # num_workers: int = Field( # default=DEFAULT_NUM_WORKERS, # description="The number of workers to use for parallel processing. By default, use all available cores minus one.", # ) def __init__( self, regex_pattern: re.Pattern | str = DATE_REGEX, metadata_name: str = "regex_metadata", boolean_mode: bool = False, # num_workers: int = DEFAULT_NUM_WORKERS, **kwargs: Any, ) -> None: """Init params.""" if (isinstance(regex_pattern, str)): regex_pattern = re.compile(regex_pattern) # self.num_workers = num_workers super().__init__(metadata_name=metadata_name, **kwargs) # ah yes i love oop :) self._regex_pattern=regex_pattern self._boolean_mode=boolean_mode @classmethod def class_name(cls) -> str: return "RegexMetadataAdder" def get_node_metadata(self, node: BaseNode) -> str | None: """Given a node with text, return the regex match if it exists. Args: node (BaseNode): The base node to extract from. Returns: Optional[str]: The regex match if it exists. If not, return None. """ if (getattr(node, "text", None) is None): return None if (self._boolean_mode): return str(self._regex_pattern.match(node.text) is not None) else: return str(self._regex_pattern.findall(node.text)) # NOTE: we are saving these as a string'd list since this is easier class ModelMetadataAdder(MetadataAdder): """Adds metadata to nodes based on a language model.""" prompt_template: str = Field( description="The prompt to use to generate the metadata. Defaults to DEFAULT_SUMMARY_TEMPLATE.", ) def __init__( self, metadata_name: str, prompt_template: str | None = None, **kwargs: Any ) -> None: """Init params.""" super().__init__(metadata_name=metadata_name, prompt_template=prompt_template, **kwargs) @classmethod def class_name(cls) -> str: return "ModelMetadataAdder" @abstractmethod def get_node_metadata(self, node: BaseNode) -> str | None: """Given a node, get the metadata for the node. Args: node (BaseNode): The node to add metadata to. Returns: Optional[str]: The metadata if it exists. If not, return None. """ class UnstructuredPDFPostProcessor(TransformComponent): """Handles postprocessing of PDF which was read in using UnstructuredIO.""" ### NOTE: okay technically we could have done this in the IngestionPipeline abstraction. Maybe we integrate in the future? # This component doesn't play nice with multi-processing due to having non-async LLMs. # _embed_model: Optional[BaseEmbedding] = PrivateAttr() _metadata_adders: list[MetadataAdder] = PrivateAttr() def __init__( self, # embed_model: Optional[BaseEmbedding] = None, metadata_adders: list[MetadataAdder] | None = None, **kwargs: Any, ) -> None: super().__init__(**kwargs) # self._embed_model = embed_model or Settings.embed_model self._metadata_adders = metadata_adders or [] @classmethod def class_name(cls) -> str: return "UnstructuredPDFPostProcessor" # def _apply_embed_model(self, nodes: List[BaseNode]) -> List[BaseNode]: # if (self._embed_model is not None): # nodes = self._embed_model(nodes) # return nodes def _apply_metadata_adders(self, nodes: list[GenericNode]) -> list[GenericNode]: for metadata_adder in self._metadata_adders: nodes = metadata_adder(nodes) return nodes def __call__(self, nodes: list[GenericNode], **kwargs: Any) -> Sequence[BaseNode]: return self._apply_metadata_adders(nodes) # nodes = self._apply_embed_model(nodes) # this goes second in case we want to embed the metadata. 
# def has_email(input_text: str) -> bool:
#     """
#     Given a chunk of text, determine whether it has an email address or not.
#     We're using the long, complex email regex from https://emailregex.com/index.html
#     """
#     return (EMAIL_REGEX.search(input_text) is not None)

# def has_phone(input_text: str) -> bool:
#     """
#     Given a chunk of text, determine whether it has a phone number or not.
#     """
#     has_phone = PHONE_REGEX.search(input_text)
#     return (has_phone is not None)

# def has_mail_addr(input_text: str) -> bool:
#     """
#     Given a chunk of text, determine whether it has a mailing address or not.
#     NOTE: This is difficult to do with regex.
#     ... We could use spacy's English-language NER model instead / as well:
#     assume that addresses will have a GSP (geospatial political) or GPE (geopolitical entity).
#     DOCS SEE: https://www.nltk.org/book/ch07.html | https://spacy.io/usage/linguistic-features
#     """
#     has_addr = MAIL_ADDR_REGEX.search(input_text)
#     return (has_addr is not None)

# def has_date(input_text: str) -> bool:
#     """
#     Given a chunk of text, determine whether it has a date or not.
#     NOTE: relative dates are stuff like "within 30 days".
#     """
#     has_date = DATE_REGEX.search(input_text)
#     return (has_date is not None)
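
# --- Sketch of the has_mail_addr idea above (illustrative; assumes spaCy and the
# `en_core_web_sm` model are installed; not wired into the pipeline) ---
# Instead of a regex, run spaCy NER and treat GPE/LOC/FAC entities as a weak
# signal that the text contains a mailing address.
def _looks_like_mailing_address(input_text: str) -> bool:
    """Heuristic: does the text mention a geopolitical or location entity?"""
    import spacy  # local import so the module itself does not require spaCy

    # NOTE: loading the model per call is slow; cache `nlp` at module level in real use.
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(input_text)
    return any(ent.label_ in {"GPE", "LOC", "FAC"} for ent in doc.ents)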