#####################################################
### DOCUMENT PROCESSOR [Metadata Adders]
#####################################################
### Jonathan Wang
# ABOUT:
# This is part of an app for chatting with PDFs.
# This module contains the Metadata Adders:
# classes that add metadata fields to documents,
# typically used for summaries or keywords.
#####################################################
### TODO Board:
# Seems like this overlaps well with the `metadata extractors` interface from llama_index.
# These are TransformComponents which take a Sequence of Nodes as input and return a list of Dicts as output (with the dicts storing metadata for each node).
# We should add a wrapper which merges this metadata into the nodes themselves; a rough sketch follows the links below.
# https://github.com/run-llama/llama_index/blob/be3bd619ec114d26cf328d12117c033762695b3f/llama-index-core/llama_index/core/extractors/interface.py#L21
# https://github.com/run-llama/llama_index/blob/be3bd619ec114d26cf328d12117c033762695b3f/llama-index-core/llama_index/core/extractors/metadata_extractors.py#L332
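# A rough, untested sketch of such a wrapper (assumes the BaseExtractor interface linked above,
# i.e. that `extractor.extract(nodes)` returns one metadata dict per node, in order):
#
# from llama_index.core.extractors import BaseExtractor
#
# class ExtractorMetadataAdder(TransformComponent):
#     """Merges the output dicts of a llama_index metadata extractor into node.metadata."""
#     _extractor: BaseExtractor = PrivateAttr()
#
#     def __init__(self, extractor: BaseExtractor, **kwargs: Any) -> None:
#         super().__init__(**kwargs)
#         self._extractor = extractor
#
#     def __call__(self, nodes: Sequence[BaseNode], **kwargs: Any) -> Sequence[BaseNode]:
#         for node, metadata_dict in zip(nodes, self._extractor.extract(nodes)):
#             node.metadata.update(metadata_dict)
#         return nodes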
#####################################################
### PROGRAM SETTINGS
#####################################################
### PROGRAM IMPORTS
from __future__ import annotations
import logging
import re
from abc import abstractmethod
from typing import Any, Sequence, TypeVar
from llama_index.core.bridge.pydantic import Field, PrivateAttr
from llama_index.core.schema import BaseNode, TransformComponent
# Own modules
#####################################################
### CONSTANTS
# ah how beautiful the regex
# handy visualizer and checker: https://www.debuggex.com/, https://www.regexpr.com/
logger = logging.getLogger(__name__)
GenericNode = TypeVar("GenericNode", bound=BaseNode)
DATE_REGEX = re.compile(r"(?:(?<!\:)(?<!\:\d)[0-3]?\d(?:st|nd|rd|th)?\s+(?:of\s+)?(?:jan\.?|january|feb\.?|february|mar\.?|march|apr\.?|april|may|jun\.?|june|jul\.?|july|aug\.?|august|sep\.?|september|oct\.?|october|nov\.?|november|dec\.?|december)|(?:jan\.?|january|feb\.?|february|mar\.?|march|apr\.?|april|may|jun\.?|june|jul\.?|july|aug\.?|august|sep\.?|september|oct\.?|october|nov\.?|november|dec\.?|december)\s+(?<!\:)(?<!\:\d)[0-3]?\d(?:st|nd|rd|th)?)(?:\,)?\s*(?:\d{4})?|[0-3]?\d[-\./][0-3]?\d[-\./]\d{2,4}", re.IGNORECASE)
TIME_REGEX = re.compile(r"\d{1,2}:\d{2} ?(?:[ap]\.?m\.?)?|\d[ap]\.?m\.?", re.IGNORECASE)
EMAIL_REGEX = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)")
PHONE_REGEX = re.compile(r"((?:(?<![\d-])(?:\+?\d{1,3}[-.\s*]?)?(?:\(?\d{3}\)?[-.\s*]?)?\d{3}[-.\s*]?\d{4}(?![\d-]))|(?:(?<![\d-])(?:(?:\(\+?\d{2}\))|(?:\+?\d{2}))\s*\d{2}\s*\d{3}\s*\d{4}(?![\d-])))")
MAIL_ADDR_REGEX = re.compile(r"\d{1,4}.{1,10}[\w\s]{1,20}[\s]+(?:street|st|avenue|ave|road|rd|highway|hwy|square|sq|trail|trl|drive|dr|court|ct|parkway|pkwy|circle|cir|boulevard|blvd)\W?(?=\s|$)", re.IGNORECASE)
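# Illustrative strings these patterns are intended to match (untested, for documentation only):
#   DATE_REGEX:      "March 3rd, 2021", "3rd of March", "03/03/2021"
#   TIME_REGEX:      "3:45 pm", "9am"
#   EMAIL_REGEX:     "user@example.com"
#   PHONE_REGEX:     "(555) 123-4567", "+1 555-123-4567"
#   MAIL_ADDR_REGEX: "123 Main Street"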
# DEFAULT_NUM_WORKERS = os.cpu_count() - 1 if os.cpu_count() else 1 # type: ignore
#####################################################
### SCRIPT
class MetadataAdder(TransformComponent):
"""Adds metadata to a node.
Args:
metadata_name: The name of the metadata to add to the node. Defaults to 'metadata'.
# num_workers: The number of workers to use for parallel processing. By default, use all available cores minus one. currently WIP.
"""
metadata_name: str = Field(
default="metadata",
description="The name of the metadata field to add to the document. Defaults to 'metadata'.",
)
# num_workers: int = Field(
# default=DEFAULT_NUM_WORKERS,
# description="The number of workers to use for parallel processing. By default, use all available cores minus one.",
# )
def __init__(
self, metadata_name: str = "metadata", **kwargs: Any
) -> None:
super().__init__(**kwargs)
self.metadata_name = metadata_name
# self.num_workers = num_workers
@classmethod
def class_name(cls) -> str:
return "MetadataAdder"
@abstractmethod
def get_node_metadata(self, node: BaseNode) -> str | None:
"""Given a node, get the metadata for the node."""
def add_node_metadata(self, node: GenericNode, metadata_value: Any | None) -> GenericNode:
"""Given a node and the metadata, add the metadata to the node's `metadata_name` field."""
if (metadata_value is None):
return node
else:
node.metadata[self.metadata_name] = metadata_value
return node
def process_nodes(self, nodes: list[GenericNode]) -> list[GenericNode]:
"""Process the list of nodes. This gets called by __call__.
Args:
nodes (List[GenericNode]): The nodes to process.
Returns:
List[GenericNode]: The processed nodes, with metadata field metadata_name added.
"""
output_nodes = []
for node in nodes:
node_metadata = self.get_node_metadata(node)
node_with_metadata = self.add_node_metadata(node, node_metadata)
output_nodes.append(node_with_metadata)
        return output_nodes
    def __call__(self, nodes: Sequence[BaseNode], **kwargs: Any) -> list[BaseNode]:
        """Add the metadata field to each node in the sequence."""
        return self.process_nodes(list(nodes))
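# Illustrative (untested) example of the intended subclassing pattern: a hypothetical adder
# that records the character length of each node's text.
#
# class TextLengthMetadataAdder(MetadataAdder):
#     def __init__(self, metadata_name: str = "text_length", **kwargs: Any) -> None:
#         super().__init__(metadata_name=metadata_name, **kwargs)
#
#     def get_node_metadata(self, node: BaseNode) -> str | None:
#         text = getattr(node, "text", None)
#         return None if text is None else str(len(text))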
class RegexMetadataAdder(MetadataAdder):
"""Adds regex metadata to a document.
Args:
regex_pattern: The regex pattern to search for.
metadata_name: The name of the metadata to add to the document. Defaults to 'regex_metadata'.
# num_workers: The number of workers to use for parallel processing. By default, use all available cores minus one.
"""
_regex_pattern: re.Pattern = PrivateAttr()
_boolean_mode: bool = PrivateAttr()
# num_workers: int = Field(
# default=DEFAULT_NUM_WORKERS,
# description="The number of workers to use for parallel processing. By default, use all available cores minus one.",
# )
def __init__(
self,
regex_pattern: re.Pattern | str = DATE_REGEX,
metadata_name: str = "regex_metadata",
boolean_mode: bool = False,
# num_workers: int = DEFAULT_NUM_WORKERS,
**kwargs: Any,
) -> None:
"""Init params."""
if (isinstance(regex_pattern, str)):
regex_pattern = re.compile(regex_pattern)
# self.num_workers = num_workers
super().__init__(metadata_name=metadata_name, **kwargs) # ah yes i love oop :)
        self._regex_pattern = regex_pattern
        self._boolean_mode = boolean_mode
@classmethod
def class_name(cls) -> str:
return "RegexMetadataAdder"
def get_node_metadata(self, node: BaseNode) -> str | None:
"""Given a node with text, return the regex match if it exists.
Args:
node (BaseNode): The base node to extract from.
Returns:
Optional[str]: The regex match if it exists. If not, return None.
"""
if (getattr(node, "text", None) is None):
return None
        if self._boolean_mode:
            # NOTE: search() rather than match(), so the pattern can occur anywhere in the text.
            return str(self._regex_pattern.search(node.text) is not None)
        else:
            return str(self._regex_pattern.findall(node.text))  # NOTE: we are saving these as a string'd list since this is easier
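# Example usage (illustrative, not run at import time):
# date_adder = RegexMetadataAdder(regex_pattern=DATE_REGEX, metadata_name="dates_in_text")
# nodes = date_adder(nodes)  # each node now carries node.metadata["dates_in_text"]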
class ModelMetadataAdder(MetadataAdder):
"""Adds metadata to nodes based on a language model."""
    prompt_template: str = Field(
        description="The prompt template used to generate the metadata.",
    )
def __init__(
self,
metadata_name: str,
prompt_template: str | None = None,
**kwargs: Any
) -> None:
"""Init params."""
super().__init__(metadata_name=metadata_name, prompt_template=prompt_template, **kwargs)
@classmethod
def class_name(cls) -> str:
return "ModelMetadataAdder"
@abstractmethod
def get_node_metadata(self, node: BaseNode) -> str | None:
"""Given a node, get the metadata for the node.
Args:
node (BaseNode): The node to add metadata to.
Returns:
Optional[str]: The metadata if it exists. If not, return None.
"""
class UnstructuredPDFPostProcessor(TransformComponent):
"""Handles postprocessing of PDF which was read in using UnstructuredIO."""
### NOTE: okay technically we could have done this in the IngestionPipeline abstraction. Maybe we integrate in the future?
# This component doesn't play nice with multi-processing due to having non-async LLMs.
# _embed_model: Optional[BaseEmbedding] = PrivateAttr()
_metadata_adders: list[MetadataAdder] = PrivateAttr()
def __init__(
self,
# embed_model: Optional[BaseEmbedding] = None,
metadata_adders: list[MetadataAdder] | None = None,
**kwargs: Any,
) -> None:
super().__init__(**kwargs)
# self._embed_model = embed_model or Settings.embed_model
self._metadata_adders = metadata_adders or []
@classmethod
def class_name(cls) -> str:
return "UnstructuredPDFPostProcessor"
# def _apply_embed_model(self, nodes: List[BaseNode]) -> List[BaseNode]:
# if (self._embed_model is not None):
# nodes = self._embed_model(nodes)
# return nodes
def _apply_metadata_adders(self, nodes: list[GenericNode]) -> list[GenericNode]:
for metadata_adder in self._metadata_adders:
nodes = metadata_adder(nodes)
return nodes
    def __call__(self, nodes: list[GenericNode], **kwargs: Any) -> Sequence[BaseNode]:
        nodes = self._apply_metadata_adders(nodes)
        # nodes = self._apply_embed_model(nodes)  # this goes second in case we want to embed the metadata.
        return nodes
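# Example usage (illustrative): chain several adders over nodes produced by an UnstructuredIO reader.
# postprocessor = UnstructuredPDFPostProcessor(metadata_adders=[
#     RegexMetadataAdder(EMAIL_REGEX, "emails"),
#     RegexMetadataAdder(PHONE_REGEX, "phone_numbers"),
# ])
# nodes = postprocessor(nodes)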
# def has_email(input_text: str) -> bool:
# """
# Given a chunk of text, determine whether it has an email address or not.
# We're using the long complex email regex from https://emailregex.com/index.html
# """
# return (EMAIL_REGEX.search(input_text) is not None)
# def has_phone(input_text: str) -> bool:
# """
# Given a chunk of text, determine whether it has a phone number or not.
# """
# has_phone = PHONE_REGEX.search(input_text)
# return (has_phone is not None)
# def has_mail_addr(input_text: str) -> bool:
# """
# Given a chunk of text, determine whether it has a mailing address or not.
# NOTE: This is difficult to do with regex.
# ... We could use spacy's English language NER model instead / as well:
# Assume that addresses will have a GSP (geospatial political) or GPE (geopolitical entity).
# DOCS SEE: https://www.nltk.org/book/ch07.html | https://spacy.io/usage/linguistic-features
# """
# has_addr = MAIL_ADDR_REGEX.search(input_text)
# return (has_addr is not None)
# def has_date(input_text: str) -> bool:
# """
# Given a chunk of text, determine whether it has a date or not.
# NOTE: relative dates are stuff like "within 30 days"
# """
# has_date = DATE_REGEX.search(input_text)
# return (has_date is not None)